/*
 * Copyright © 2023 Nick Bowler
 *
 * Helper function to output the copyright symbol in a specified encoding.
 *
 * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
 * This is free software: you are free to do what the fuck you want to.
 * There is NO WARRANTY, to the extent permitted by law.
 */
#if HAVE_CONFIG_H
#	include <config.h>
#endif

#if ENABLE_NLS

#include <stdlib.h>
#include <string.h>

/*
 * dx_u32 and dx_u64 are unsigned types at least 32 and 64 bits wide,
 * respectively, used for the packed lookup values in copyright_symbol.
 */
#if HAVE_INTTYPES_H
#include <inttypes.h>
typedef uint_least32_t dx_u32;
typedef uint_least64_t dx_u64;
#else
#include <limits.h>
#if UINT_MAX >= 0xffffffff
typedef unsigned dx_u32;
#else
typedef unsigned long dx_u32;
#endif
typedef unsigned long long dx_u64;
#endif

/* Number of elements in an array object (not valid for pointers). */
#define ARRAYSIZE(x) (sizeof (x) / sizeof (x)[0])

/*
 * bsearch comparison function: compare the string 'key' against one
 * 5-byte table element.  At most 5 characters are compared, so the
 * table elements do not need to be null-terminated and only the first
 * 5 characters of the key are significant.
 */
static int compar_5arr(const void *key, const void *elem_)
{
	/*
	 * Going through (void *) avoids a qualifier diagnostic: a pointer
	 * to an array of const char is not itself a pointer to a
	 * const-qualified type.
	 */
	const char (*elem)[5] = (void *)elem_;

	return strncmp(key, *elem, sizeof *elem);
}

/*
 * Return, as a multibyte string, the copyright symbol for the
 * given character encoding, which is one of the strings returned
 * by Gnulib's locale_charset function.  In particular, we are
 * looking for one of the strings:
 *
 *   CP1129
 *   CP1250
 *   CP1251
 *   CP1252
 *   CP1253
 *   CP1254
 *   CP1256
 *   CP1257
 *   CP1258
 *   CP775
 *   CP850
 *   CP856
 *   CP857
 *   CP869
 *   CP922
 *   GEORGIAN-PS
 *   ISO-8859-1
 *   ISO-8859-13
 *   ISO-8859-14
 *   ISO-8859-15
 *   ISO-8859-7
 *   ISO-8859-8
 *   ISO-8859-9
 *   PT154
 *   EUC-JP
 *   GB18030
 *   KOI8-R
 *   KOI8-T
 *   KOI8-U
 *   UTF-8
 *
 * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
 * presently handled by returning (C), even if the character set does
 * include the copyright symbol.
 *
 * To simplify the implementation, we allow some slop in the matching,
 * as long as the result is valid for any actual encoding names.
 *
 * If NLS support is disabled, or if the character set does not
 * include the copyright symbol, then the string (C) is returned
 * in the C execution character set.
 *
 * The charset argument may be a null pointer, in which case (C) is
 * returned.  The returned pointer refers to static storage and must
 * not be modified or freed.
 */
const char *copyright_symbol(const char *charset)
{
	/*
	 * All known encodings of the copyright symbol, packed into one
	 * array of null-terminated strings.  The strings start at these
	 * offsets:
	 *
	 *    0: "\xc2\xa9"          1: "\xa9" (tail of the string at 0)
	 *    3: "\x97"              5: "\xa8"
	 *    7: "\xb8"              9: "\xbf"
	 *   11: "\x8f\xa2\xed"     15: "\x81\x30\x84\x38"
	 *   20: "(C)"
	 */
	static const char codes[] =
		"\xc2\xa9"         "\0"
		"\x97"             "\0"
		"\xa8"             "\0"
		"\xb8"             "\0"
		"\xbf"             "\0"
		"\x8f\xa2\xed"     "\0"
		"\x81\x30\x84\x38" "\0"
		"(C)";

	/*
	 * We need the list below to be in lexicographic order in
	 * the C execution character encoding.
	 */
#if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
#	error this character encoding is unsupported, please report a bug.
#endif
	/*
	 * For character sets that include the copyright symbol,
	 * the first 5 characters suffice to distinguish amongst
	 * all the different possible encodings.
	 *
	 * The elements are exactly 5 bytes, with no null terminator;
	 * they are only ever compared via compar_5arr.
	 */
	static const char t1[][5] = {
		"CP112", "CP125", "CP775", "CP850", "CP856",
		"CP857", "CP869", "CP922", "EUC-J", "GB180",
		"GEORG", "ISO-8", "KOI8-", "PT154", "UTF-8"
	};

	/*
	 * Each nibble in the results value contains the offset in the
	 * codes array for the corresponding index in t1, except that
	 * ISO-8859 matches the special value '2' (handled below).
	 *
	 * The nibble for t1[0] is the least significant one.
	 */
	dx_u64 results = 0x001921fb13777511;

	const char (*m1)[sizeof *t1];
	unsigned x, cindex;

	if (!charset)
		goto no_conv;

	m1 = bsearch(charset, t1, ARRAYSIZE(t1), sizeof *t1, compar_5arr);
	if (!m1)
		goto no_conv;

	/*
	 * A successful match means the first 5 characters are all
	 * non-null, so charset now points at the sixth character or
	 * at the terminating null.
	 */
	charset += 5;
	x = m1-t1;
	cindex = (results >> (x << 2)) & 0xf;

	/*
	 * We now need to identify encodings that match one of the 5-character
	 * prefixes above but don't actually have the copyright symbol in their
	 * character set.  Specifically, these are:
	 *
	 *   CP1122 (does have it, but EBCDIC)
	 *   CP1124
	 *   CP1125
	 *   ISO-8859-10
	 *   ISO-8859-11
	 *   ISO-8859-2
	 *   ISO-8859-3
	 *   ISO-8859-4
	 *   ISO-8859-5
	 *   ISO-8859-6
	 */
	if ((x == 0) != (*charset == '9')) {
		/*
		 * CP112x, x != '9', no copyright symbol.
		 *
		 * This test also rejects any other prefix followed by '9',
		 * which none of the supported names listed above has as
		 * its sixth character.
		 */
		goto no_conv;
	} else if (cindex == 2) {
		/*
		 * ISO-8859 special case.  Simply find and look at the final
		 * two digits.  The set bits in the 'accept' value indicate
		 * which encodings have the copyright symbol.
		 *
		 * Every remaining character shifts in one nibble (zero for
		 * a hyphen), so the low 5 bits of the collected value are
		 * 0x01 for "-1", 0x13 for "-13", and so on.  Bits 1, 7, 8,
		 * 9, 0x13, 0x14 and 0x15 of 'accept' are set for parts 1,
		 * 7, 8, 9, 13, 14 and 15.  Bit 0 is also set, but none of
		 * the ISO-8859 names listed above maps to it.
		 */
		dx_u32 accept = 0x00380383;
		dx_u32 collect = 0;
		char c;

		while ((c = *charset++)) {
			collect <<= 4;
			if (c != '-')
				collect |= c - '0';
		}

		cindex = (accept >> (collect & 0x1f)) & 1;
		if (!cindex)
			goto no_conv;
		/* cindex is now 1, the offset of the "\xa9" string. */
	}

	return &codes[cindex];
no_conv:
	/* Offset 20 is the ASCII fallback string "(C)". */
	return &codes[20];
}

#endif