src/copysym.c

   1 /*
   2  * Copyright © 2023 Nick Bowler
   3  *
   4  * Helper function to output the copyright symbol in a specified encoding.
   5  *
   6  * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
   7  * This is free software: you are free to do what the fuck you want to.
   8  * There is NO WARRANTY, to the extent permitted by law.
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #       include <config.h>
  13 #endif
  14
  15 #if ENABLE_NLS
  16
  17 #include <stdlib.h>
  18 #include <string.h>
  19 #include <stdint.h>
  20
  21 #define ARRAYSIZE(x) (sizeof (x) / sizeof (x)[0])
  22
  23 static int compar_5arr(const void *key, const void *elem_)
  24 {
  25         const char (*elem)[5] = (void *)elem_;
  26
  27         return strncmp(key, *elem, sizeof *elem);
  28 }
  29
  30 /*
  31  * Return, as a multibyte string, the copyright symbol for the
  32  * given character encoding, which is one of the strings returned
  33  * by Gnulib's locale_charset function.  In particular, we are
  34  * looking for one of the strings:
  35  *
  36  *     CP1129
  37  *     CP1250
  38  *     CP1251
  39  *     CP1252
  40  *     CP1253
  41  *     CP1254
  42  *     CP1256
  43  *     CP1257
  44  *     CP1258
  45  *     CP775
  46  *     CP850
  47  *     CP856
  48  *     CP857
  49  *     CP869
  50  *     CP922
  51  *     GEORGIAN-PS
  52  *     ISO-8859-1
  53  *     ISO-8859-13
  54  *     ISO-8859-14
  55  *     ISO-8859-15
  56  *     ISO-8859-7
  57  *     ISO-8859-8
  58  *     ISO-8859-9
  59  *     PT154
  60  *     EUC-JP
  61  *     GB18030
  62  *     KOI8-R
  63  *     KOI8-T
  64  *     KOI8-U
  65  *     UTF-8
  66  *
  67  * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
  68  * presently handled by returning (C), even if the character set does
  69  * include the copyright symbol.
  70  *
  71  * To simplify the implementation, we allow some slop in the matching,
  72  * as long as the result is valid for any actual encoding names.
  73  *
  74  * If NLS support is disabled, or if the character set does not
  75  * include the copyright symbol, then the string (C) is returned
  76  * in the C execution character set.
  77  */
  78 const char *copyright_symbol(const char *charset)
  79 {
  80         /* All known encodings of the copyright symbol */
  81         static const char codes[] =
  82                 "\xc2\xa9"         "\0"
  83                 "\x97"             "\0"
  84                 "\xa8"             "\0"
  85                 "\xb8"             "\0"
  86                 "\xbf"             "\0"
  87                 "\x8f\xa2\xed"     "\0"
  88                 "\x81\x30\x84\x38" "\0"
  89                 "(C)";
  90
  91         /*
  92          * We need the list below to be in lexicographic order in
  93          * the C execution character encoding.
  94          */
  95 #if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
  96 #  error this character encoding is unsupported, please report a bug.
  97 #endif
  98
  99         /*
 100          * For character sets that include the copyright symbol,
 101          * the first 5 characters suffices to distinguish amongst
 102          * all the different possible encodings.
 103          */
 104         static const char t1[][5] = {
 105                 "CP112",
 106                 "CP125",
 107                 "CP775",
 108                 "CP850",
 109                 "CP856",
 110                 "CP857",
 111                 "CP869",
 112                 "CP922",
 113                 "EUC-J",
 114                 "GB180",
 115                 "GEORG",
 116                 "ISO-8",
 117                 "KOI8-",
 118                 "PT154",
 119                 "UTF-8"
 120         };
 121
 122         /*
 123          * Each nibble in the results value contains the offset in the
 124          * codes array for the corresponding index in t1, except that
 125          * ISO-8859 matches the special value '2' (handled below).
 126          */
 127         uint_least64_t results = 0x001921fb13777511;
 128         const char (*m1)[sizeof *t1];
 129         unsigned x, cindex;
 130
 131         if (!charset)
 132                 goto no_conv;
 133
 134         m1 = bsearch(charset, t1, ARRAYSIZE(t1), sizeof *t1, compar_5arr);
 135         if (!m1)
 136                 goto no_conv;
 137         charset += 5;
 138
 139         x = m1-t1;
 140         cindex = (results >> (x << 2)) & 0xf;
 141
 142         /*
 143          * We now need to identify encodings that match one of the 5-character
 144          * prefixes above but don't actually have the copyright symbol in their
 145          * character set.  Specifically, these are:
 146          *
 147          *   CP1122 (does have it, but EBCDIC)
 148          *   CP1124
 149          *   CP1125
 150          *   ISO-8859-10
 151          *   ISO-8859-11
 152          *   ISO-8859-2
 153          *   ISO-8859-3
 154          *   ISO-8859-4
 155          *   ISO-8859-5
 156          *   ISO-8859-6
 157          */
 158         if ((x == 0) != (*charset == '9')) {
 159                 /* CP112x, x != '9', no copyright symbol. */
 160                 goto no_conv;
 161         } else if (cindex == 2) {
 162                 /*
 163                  * ISO-8859 special case.  Simply find and look at the final
 164                  * two digits.  The set bits in the 'accept' value indicate
 165                  * which encodings have the copyright symbol.
 166                  */
 167                 uint_least32_t accept  = 0x00380383;
 168                 uint_least32_t collect = 0;
 169                 char c;
 170
 171                 while ((c = *charset++)) {
 172                         collect <<= 4;
 173
 174                         if (c != '-')
 175                                 collect |= c - '0';
 176                 }
 177
 178                 cindex = (accept >> (collect & 0x1f)) & 1;
 179                 if (!cindex)
 180                         goto no_conv;
 181         }
 182
 183         return &codes[cindex];
 184 no_conv:
 185         return &codes[20];
 186 }
 187
 188 #endif