src/copysym.c

   1 /*
   2  * Copyright © 2023 Nick Bowler
   3  *
   4  * Helper function to output the copyright symbol in a specified encoding.
   5  *
   6  * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
   7  * This is free software: you are free to do what the fuck you want to.
   8  * There is NO WARRANTY, to the extent permitted by law.
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #       include <config.h>
  13 #endif
  14
  15 #if ENABLE_NLS
  16
  17 #include <stdlib.h>
  18 #include <string.h>
  19
  20 #if HAVE_INTTYPES_H
  21 #include <inttypes.h>
  22 typedef uint_least32_t dx_u32;
  23 typedef uint_least64_t dx_u64;
  24 #else
  25 #include <limits.h>
  26 #if UINT_MAX >= 0xffffffff
  27 typedef unsigned dx_u32;
  28 #else
  29 typedef unsigned long dx_u32;
  30 #endif
  31 typedef unsigned long long dx_u64;
  32 #endif
  33
  34 #define ARRAYSIZE(x) (sizeof (x) / sizeof (x)[0])
  35
  36 static int compar_5arr(const void *key, const void *elem_)
  37 {
  38         const char (*elem)[5] = (void *)elem_;
  39
  40         return strncmp(key, *elem, sizeof *elem);
  41 }
  42
  43 /*
  44  * Return, as a multibyte string, the copyright symbol for the
  45  * given character encoding, which is one of the strings returned
  46  * by Gnulib's locale_charset function.  In particular, we are
  47  * looking for one of the strings:
  48  *
  49  *     CP1129
  50  *     CP1250
  51  *     CP1251
  52  *     CP1252
  53  *     CP1253
  54  *     CP1254
  55  *     CP1256
  56  *     CP1257
  57  *     CP1258
  58  *     CP775
  59  *     CP850
  60  *     CP856
  61  *     CP857
  62  *     CP869
  63  *     CP922
  64  *     GEORGIAN-PS
  65  *     ISO-8859-1
  66  *     ISO-8859-13
  67  *     ISO-8859-14
  68  *     ISO-8859-15
  69  *     ISO-8859-7
  70  *     ISO-8859-8
  71  *     ISO-8859-9
  72  *     PT154
  73  *     EUC-JP
  74  *     GB18030
  75  *     KOI8-R
  76  *     KOI8-T
  77  *     KOI8-U
  78  *     UTF-8
  79  *
  80  * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
  81  * presently handled by returning (C), even if the character set does
  82  * include the copyright symbol.
  83  *
  84  * To simplify the implementation, we allow some slop in the matching,
  85  * as long as the result is valid for any actual encoding names.
  86  *
  87  * If NLS support is disabled, or if the character set does not
  88  * include the copyright symbol, then the string (C) is returned
  89  * in the C execution character set.
  90  */
  91 const char *copyright_symbol(const char *charset)
  92 {
  93         /* All known encodings of the copyright symbol */
  94         static const char codes[] =
  95                 "\xc2\xa9"         "\0"
  96                 "\x97"             "\0"
  97                 "\xa8"             "\0"
  98                 "\xb8"             "\0"
  99                 "\xbf"             "\0"
 100                 "\x8f\xa2\xed"     "\0"
 101                 "\x81\x30\x84\x38" "\0"
 102                 "(C)";
 103
 104         /*
 105          * We need the list below to be in lexicographic order in
 106          * the C execution character encoding.
 107          */
 108 #if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
 109 #  error this character encoding is unsupported, please report a bug.
 110 #endif
 111
 112         /*
 113          * For character sets that include the copyright symbol,
 114          * the first 5 characters suffices to distinguish amongst
 115          * all the different possible encodings.
 116          */
 117         static const char t1[][5] = {
 118                 "CP112",
 119                 "CP125",
 120                 "CP775",
 121                 "CP850",
 122                 "CP856",
 123                 "CP857",
 124                 "CP869",
 125                 "CP922",
 126                 "EUC-J",
 127                 "GB180",
 128                 "GEORG",
 129                 "ISO-8",
 130                 "KOI8-",
 131                 "PT154",
 132                 "UTF-8"
 133         };
 134
 135         /*
 136          * Each nibble in the results value contains the offset in the
 137          * codes array for the corresponding index in t1, except that
 138          * ISO-8859 matches the special value '2' (handled below).
 139          */
 140         dx_u64 results = 0x001921fb13777511;
 141         const char (*m1)[sizeof *t1];
 142         unsigned x, cindex;
 143
 144         if (!charset)
 145                 goto no_conv;
 146
 147         m1 = bsearch(charset, t1, ARRAYSIZE(t1), sizeof *t1, compar_5arr);
 148         if (!m1)
 149                 goto no_conv;
 150         charset += 5;
 151
 152         x = m1-t1;
 153         cindex = (results >> (x << 2)) & 0xf;
 154
 155         /*
 156          * We now need to identify encodings that match one of the 5-character
 157          * prefixes above but don't actually have the copyright symbol in their
 158          * character set.  Specifically, these are:
 159          *
 160          *   CP1122 (does have it, but EBCDIC)
 161          *   CP1124
 162          *   CP1125
 163          *   ISO-8859-10
 164          *   ISO-8859-11
 165          *   ISO-8859-2
 166          *   ISO-8859-3
 167          *   ISO-8859-4
 168          *   ISO-8859-5
 169          *   ISO-8859-6
 170          */
 171         if ((x == 0) != (*charset == '9')) {
 172                 /* CP112x, x != '9', no copyright symbol. */
 173                 goto no_conv;
 174         } else if (cindex == 2) {
 175                 /*
 176                  * ISO-8859 special case.  Simply find and look at the final
 177                  * two digits.  The set bits in the 'accept' value indicate
 178                  * which encodings have the copyright symbol.
 179                  */
 180                 dx_u32 accept  = 0x00380383;
 181                 dx_u32 collect = 0;
 182                 char c;
 183
 184                 while ((c = *charset++)) {
 185                         collect <<= 4;
 186
 187                         if (c != '-')
 188                                 collect |= c - '0';
 189                 }
 190
 191                 cindex = (accept >> (collect & 0x1f)) & 1;
 192                 if (!cindex)
 193                         goto no_conv;
 194         }
 195
 196         return &codes[cindex];
 197 no_conv:
 198         return &codes[20];
 199 }
 200
 201 #endif