/*
 * Copyright © 2023-2024 Nick Bowler
 *
 * Helper function to output the copyright symbol in a specified encoding.
 *
 * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
 * This is free software: you are free to do what the fuck you want to.
 * There is NO WARRANTY, to the extent permitted by law.
 */

#if HAVE_CONFIG_H
#	include <config.h>
#endif

#if ENABLE_NLS

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/* dx_u32: an unsigned integer type with at least 32 value bits. */
#if HAVE_INTTYPES_H
#include <inttypes.h>
typedef uint_least32_t dx_u32;
#else
#include <limits.h>
#if UINT_MAX >= 0xffffffff
typedef unsigned dx_u32;
#else
typedef unsigned long dx_u32;
#endif
#endif

/* Binary search over a true array (not a pointer) using comparator cmp. */
#define BSEARCH_ARRAY(key, arr, cmp) \
	bsearch(key, arr, sizeof (arr) / sizeof *(arr), sizeof *(arr), cmp)

/* Number of leading characters of a charset name used as the lookup key. */
enum { PREFIXLEN = 5 };

/*
 * bsearch comparator: key is a NUL-terminated charset name, elem_ points
 * at a table entry of PREFIXLEN+1 bytes.  Only the first PREFIXLEN bytes
 * take part in the comparison; the trailing byte of the entry is payload.
 */
static int compar_prefix(const void *key, const void *elem_)
{
	const char *prefix = elem_;

	return strncmp(key, prefix, PREFIXLEN);
}

/*
 * Return, as a multibyte string, the copyright symbol for the
 * given character encoding, which is one of the strings returned
 * by Gnulib's locale_charset function.  In particular, we are
 * looking for one of the strings:
 *
 *   CP1129
 *   CP1250
 *   CP1251
 *   CP1252
 *   CP1253
 *   CP1254
 *   CP1256
 *   CP1257
 *   CP1258
 *   CP775
 *   CP850
 *   CP856
 *   CP857
 *   CP869
 *   CP922
 *   GEORGIAN-PS
 *   ISO-8859-1
 *   ISO-8859-13
 *   ISO-8859-14
 *   ISO-8859-15
 *   ISO-8859-7
 *   ISO-8859-8
 *   ISO-8859-9
 *   PT154
 *   EUC-JP
 *   GB18030
 *   KOI8-R
 *   KOI8-T
 *   KOI8-U
 *   UTF-8
 *
 * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
 * presently handled by returning (C), even if the character set does
 * include the copyright symbol.
 *
 * To simplify the implementation, we allow some slop in the matching,
 * as long as the result is valid for any actual encoding names.
 *
 * If the character set does not include the copyright symbol, or if
 * charset is a null pointer, then the string (C) is returned in the
 * C execution character set.  (When NLS support is disabled this
 * function is not compiled at all.)
 *
 * The returned pointer refers to static read-only storage.
 */
const char *copyright_symbol(const char *charset)
{
	struct copysym_data {
		char tab[15][PREFIXLEN+1];
		char codes[24];
	};

	/*
	 * We need the list below to be in lexicographic order in
	 * the C execution character encoding, since it is searched
	 * with bsearch.  Every adjacent pair of distinct leading
	 * letters (C, E, G, I, K, P, U) is checked, plus B/E for
	 * GB180 versus GEORG.
	 */
#if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'I'>'K' || 'K'>'P' || 'P'>'U'
#  error this character encoding is unsupported, please report a bug.
#endif
	static const struct copysym_data data = {
		/*
		 * For character sets that include the copyright symbol,
		 * the first 5 characters suffice to distinguish amongst
		 * all the different possible encodings.
		 *
		 * The final byte of each entry indicates the corresponding
		 * offset into the codes array, except for CP112x and ISO-8859
		 * which use the values 0 and 1, respectively (handled below).
		 */
		.tab = {
			"CP112\x00",
			"CP125\x05",
			"CP775\x09",
			"CP850\x0b",
			"CP856\x0b",
			"CP857\x0b",
			"CP869\x07",
			"CP922\x05",
			"EUC-J\x0f",
			"GB180\x13",
			"GEORG\x05",
			"ISO-8\x01",
			"KOI8-\x0d",
			"PT154\x05",
			"UTF-8\x04"
		},

		/*
		 * All known encodings of the copyright symbol, packed as
		 * consecutive NUL-terminated strings.  Offset 5 points at
		 * the second byte of the UTF-8 sequence, which doubles as
		 * the single-byte 0xa9 encoding.
		 */
		.codes =
			"(C)"              "\0"   /* offset  0 */
			"\xc2\xa9"         "\0"   /* offset  4 (and 5) */
			"\x97"             "\0"   /* offset  7 */
			"\xa8"             "\0"   /* offset  9 */
			"\xb8"             "\0"   /* offset 11 */
			"\xbf"             "\0"   /* offset 13 */
			"\x8f\xa2\xed"     "\0"   /* offset 15 */
			"\x81\x30\x84\x38"        /* offset 19 */
	};

	unsigned cindex;
	const char *m;

	/* No charset given: fall back to (C) at offset 0. */
	if (!charset)
		return data.codes;

	/* Unknown prefix: the charset has no copyright symbol we know of. */
	m = BSEARCH_ARRAY(charset, data.tab, compar_prefix);
	if (!m)
		return data.codes;

	cindex = (unsigned char)m[PREFIXLEN];

	/*
	 * A match means charset has at least PREFIXLEN non-NUL characters,
	 * so skipping them stays within the string.
	 */
	charset += PREFIXLEN;

	/*
	 * We now need to identify encodings that match one of the 5-character
	 * prefixes above but don't actually have the copyright symbol in their
	 * character set.  Specifically, these are:
	 *
	 *   CP1122 (does have it, but EBCDIC)
	 *   CP1124
	 *   CP1125
	 *   ISO-8859-10
	 *   ISO-8859-11
	 *   ISO-8859-2
	 *   ISO-8859-3
	 *   ISO-8859-4
	 *   ISO-8859-5
	 *   ISO-8859-6
	 */
	if (cindex == 0) {
		/* CP112x, only CP1129 has copyright symbol. */
		cindex = 5 * (*charset == '9');
	} else if (cindex == 1) {
		/*
		 * ISO-8859 special case.  The rest of the name is packed
		 * four bits per character (digits by value, '-' as zero)
		 * and the low five bits of the result, which depend only
		 * on the final two characters, select a bit in 'accept'.
		 * Thus "-1" selects bit 0x01 and "13" selects bit 0x13.
		 *
		 * The set bits in the 'accept' value indicate which
		 * encodings have the copyright symbol: bits 1, 7, 8, 9,
		 * 0x13, 0x14 and 0x15.  Bit 0 is also set in the mask; no
		 * name in the lists above maps to it.
		 */
		dx_u32 accept = 0x00380383;
		dx_u32 collect = 0;
		char c;

		while ((c = *charset++)) {
			collect <<= 4;
			if (c != '-')
				collect |= c - '0';
		}

		/* The shift count is at most 31, below the width of dx_u32. */
		cindex = 5 * ((accept >> (collect & 0x1f)) & 1);
	}

	return data.codes + cindex;
}

#endif