src/copysym.c

   1 /*
   2  * Copyright © 2023-2024 Nick Bowler
   3  *
   4  * Helper function to output the copyright symbol in a specified encoding.
   5  *
   6  * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
   7  * This is free software: you are free to do what the fuck you want to.
   8  * There is NO WARRANTY, to the extent permitted by law.
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #       include <config.h>
  13 #endif
  14
  15 #if ENABLE_NLS
  16
  17 #include <stdlib.h>
  18 #include <string.h>
  19 #include <stddef.h>
  20
  21 #if HAVE_INTTYPES_H
  22 #include <inttypes.h>
  23 typedef uint_least32_t dx_u32;
  24 #else
  25 #include <limits.h>
  26 #if UINT_MAX >= 0xffffffff
  27 typedef unsigned dx_u32;
  28 #else
  29 typedef unsigned long dx_u32;
  30 #endif
  31 #endif
  32
  33 #define BSEARCH_ARRAY(key, arr, cmp) \
  34         bsearch(key, arr, sizeof (arr) / sizeof *(arr), sizeof *(arr), cmp)
  35
  36 enum { PREFIXLEN = 5 };
  37
  38 static int compar_prefix(const void *key, const void *elem_)
  39 {
  40         const char (*elem)[PREFIXLEN+1] = (void *)elem_;
  41
  42         return strncmp(key, *elem, PREFIXLEN);
  43 }
  44
  45 /*
  46  * Return, as a multibyte string, the copyright symbol for the
  47  * given character encoding, which is one of the strings returned
  48  * by Gnulib's locale_charset function.  In particular, we are
  49  * looking for one of the strings:
  50  *
  51  *     CP1129
  52  *     CP1250
  53  *     CP1251
  54  *     CP1252
  55  *     CP1253
  56  *     CP1254
  57  *     CP1256
  58  *     CP1257
  59  *     CP1258
  60  *     CP775
  61  *     CP850
  62  *     CP856
  63  *     CP857
  64  *     CP869
  65  *     CP922
  66  *     GEORGIAN-PS
  67  *     ISO-8859-1
  68  *     ISO-8859-13
  69  *     ISO-8859-14
  70  *     ISO-8859-15
  71  *     ISO-8859-7
  72  *     ISO-8859-8
  73  *     ISO-8859-9
  74  *     PT154
  75  *     EUC-JP
  76  *     GB18030
  77  *     KOI8-R
  78  *     KOI8-T
  79  *     KOI8-U
  80  *     UTF-8
  81  *
  82  * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
  83  * presently handled by returning (C), even if the character set does
  84  * include the copyright symbol.
  85  *
  86  * To simplify the implementation, we allow some slop in the matching,
  87  * as long as the result is valid for any actual encoding names.
  88  *
  89  * If NLS support is disabled, or if the character set does not
  90  * include the copyright symbol, then the string (C) is returned
  91  * in the C execution character set.
  92  */
  93 const char *copyright_symbol(const char *charset)
  94 {
  95         struct copysym_data {
  96                 char tab[15][PREFIXLEN+1];
  97                 char codes[24];
  98         };
  99
 100         /*
 101          * We need the list below to be in lexicographic order in
 102          * the C execution character encoding.
 103          */
 104 #if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
 105 #  error this character encoding is unsupported, please report a bug.
 106 #endif
 107
 108         static const struct copysym_data data = {
 109                 /*
 110                  * For character sets that include the copyright symbol,
 111                  * the first 5 characters suffices to distinguish amongst
 112                  * all the different possible encodings.
 113                  *
 114                  * The final byte of each entry indicates the corresponding
 115                  * offset into the codes array, except for CP112x and ISO-8859
 116                  * which use the values 0 and 1, respectively (handled below).
 117                  */
 118                 .tab =
 119                 {
 120                 "CP112\x00",
 121                 "CP125\x05",
 122                 "CP775\x09",
 123                 "CP850\x0b",
 124                 "CP856\x0b",
 125                 "CP857\x0b",
 126                 "CP869\x07",
 127                 "CP922\x05",
 128                 "EUC-J\x0f",
 129                 "GB180\x13",
 130                 "GEORG\x05",
 131                 "ISO-8\x01",
 132                 "KOI8-\x0d",
 133                 "PT154\x05",
 134                 "UTF-8\x04"
 135                 },
 136
 137                 /* All known encodings of the copyright symbol. */
 138                 .codes =
 139                 "(C)"          "\0"
 140                 "\xc2\xa9"     "\0"
 141                 "\x97"         "\0"
 142                 "\xa8"         "\0"
 143                 "\xb8"         "\0"
 144                 "\xbf"         "\0"
 145                 "\x8f\xa2\xed" "\0"
 146                 "\x81\x30\x84\x38"
 147         };
 148
 149         unsigned cindex = 0;
 150         const char *m;
 151
 152         if (!charset || !(m = BSEARCH_ARRAY(charset, data.tab, compar_prefix)))
 153                 goto no_conv;
 154
 155         cindex = m[PREFIXLEN];
 156         charset += PREFIXLEN;
 157
 158         /*
 159          * We now need to identify encodings that match one of the 5-character
 160          * prefixes above but don't actually have the copyright symbol in their
 161          * character set.  Specifically, these are:
 162          *
 163          *   CP1122 (does have it, but EBCDIC)
 164          *   CP1124
 165          *   CP1125
 166          *   ISO-8859-10
 167          *   ISO-8859-11
 168          *   ISO-8859-2
 169          *   ISO-8859-3
 170          *   ISO-8859-4
 171          *   ISO-8859-5
 172          *   ISO-8859-6
 173          */
 174         if (cindex == 0) {
 175                 /* CP112x, only CP1129 has copyright symbol. */
 176                 cindex = 5 * (*charset == '9');
 177         } else if (cindex == 1) {
 178                 /*
 179                  * ISO-8859 special case.  Simply find and look at the final
 180                  * two digits.  The set bits in the 'accept' value indicate
 181                  * which encodings have the copyright symbol.
 182                  */
 183                 dx_u32 accept  = 0x00380383;
 184                 dx_u32 collect = 0;
 185                 char c;
 186
 187                 while ((c = *charset++)) {
 188                         collect <<= 4;
 189
 190                         if (c != '-')
 191                                 collect |= c - '0';
 192                 }
 193
 194                 cindex = 5 * ((accept >> (collect & 0x1f)) & 1);
 195         }
 196 no_conv:
 197         return (char *)&data + offsetof(struct copysym_data, codes) + cindex;
 198 }
 199
 200 #endif