/*
 * Copyright © 2023 Nick Bowler
 *
 * Helper function to output the copyright symbol in a specified encoding.
 *
 * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
 * This is free software: you are free to do what the fuck you want to.
 * There is NO WARRANTY, to the extent permitted by law.
 */

/*
 * Recovered from the gitweb diff that introduced src/copysym.c in
 * https://git.draconx.ca/gitweb/dxcommon.git (54f9ce81..207d0dbf).
 *
 * NOTE(review): the header names on the system #include lines were missing
 * from that capture.  They have been filled in from the identifiers used
 * below (uint_least64_t, bsearch, strncmp); confirm against the repository.
 */

#if HAVE_CONFIG_H
#	include <config.h>
#endif

#if ENABLE_NLS

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "xtra.h"

/*
 * bsearch comparison callback: compare the key string against one entry of
 * a table of 5-byte prefixes.
 *
 * The table entries are exactly 5 characters long and carry no terminating
 * null byte, so the comparison is bounded by the size of an entry.
 */
static int compar_5arr(const void *key, const void *elem_)
{
	const char (*elem)[5] = elem_;

	return strncmp(key, *elem, sizeof *elem);
}

/*
 * Return, as a multibyte string, the copyright symbol for the
 * given character encoding, which is one of the strings returned
 * by Gnulib's locale_charset function.  In particular, we are
 * looking for one of the strings:
 *
 *   CP1129
 *   CP1250
 *   CP1251
 *   CP1252
 *   CP1253
 *   CP1254
 *   CP1256
 *   CP1257
 *   CP1258
 *   CP775
 *   CP850
 *   CP856
 *   CP857
 *   CP869
 *   CP922
 *   GEORGIAN-PS
 *   ISO-8859-1
 *   ISO-8859-13
 *   ISO-8859-14
 *   ISO-8859-15
 *   ISO-8859-7
 *   ISO-8859-8
 *   ISO-8859-9
 *   PT154
 *   EUC-JP
 *   GB18030
 *   KOI8-R
 *   KOI8-T
 *   KOI8-U
 *   UTF-8
 *
 * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
 * presently handled by returning (C), even if the character set does
 * include the copyright symbol.
 *
 * To simplify the implementation, we allow some slop in the matching,
 * as long as the result is valid for any actual encoding names.
 *
 * If NLS support is disabled, or if the character set does not
 * include the copyright symbol, then the string (C) is returned
 * in the C execution character set.
 *
 * A null charset pointer is accepted and also yields (C).  The returned
 * pointer refers to static storage and must not be modified or freed.
 */
const char *copyright_symbol(const char *charset)
{
	/*
	 * All known encodings of the copyright symbol, packed into one
	 * array of null-terminated strings.  The offsets used below are:
	 *
	 *    0  "\xc2\xa9"           (UTF-8)
	 *    1  "\xa9"               (the tail of the UTF-8 sequence, shared)
	 *    3  "\x97"
	 *    5  "\xa8"
	 *    7  "\xb8"
	 *    9  "\xbf"
	 *   11  "\x8f\xa2\xed"       (EUC-JP)
	 *   15  "\x81\x30\x84\x38"   (GB18030)
	 *   20  "(C)"                (fallback)
	 */
	static const char codes[] =
		"\xc2\xa9" "\0"
		"\x97" "\0"
		"\xa8" "\0"
		"\xb8" "\0"
		"\xbf" "\0"
		"\x8f\xa2\xed" "\0"
		"\x81\x30\x84\x38" "\0"
		"(C)";

	/* Offset of the trailing "(C)" string within codes (that is, 20). */
	enum { FALLBACK_OFFSET = sizeof codes - sizeof "(C)" };

	/*
	 * We need the list below to be in lexicographic order in
	 * the C execution character encoding.
	 */
#if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
#	error this character encoding is unsupported, please report a bug.
#endif

	/*
	 * For character sets that include the copyright symbol,
	 * the first 5 characters suffice to distinguish amongst
	 * all the different possible encodings.
	 */
	static const char t1[][5] = {
		"CP112",
		"CP125",
		"CP775",
		"CP850",
		"CP856",
		"CP857",
		"CP869",
		"CP922",
		"EUC-J",
		"GB180",
		"GEORG",
		"ISO-8",
		"KOI8-",
		"PT154",
		"UTF-8"
	};

	/*
	 * Each nibble in the results value contains the offset in the
	 * codes array for the corresponding index in t1, except that
	 * ISO-8859 matches the special value '2' (handled below).
	 *
	 * The least significant nibble belongs to t1[0].
	 */
	uint_least64_t results = 0x001921fb13777511;
	const char (*m1)[sizeof *t1];
	unsigned x, cindex;

	if (!charset)
		goto no_conv;

	m1 = bsearch(charset, t1, XTRA_ARRAYSIZE(t1), sizeof *t1, compar_5arr);
	if (!m1)
		goto no_conv;

	/*
	 * A successful match implies charset has at least 5 characters, so
	 * after this step it points at the sixth character or at the
	 * terminating null byte.
	 */
	charset += 5;

	x = m1-t1;
	cindex = (results >> (x << 2)) & 0xf;

	/*
	 * We now need to identify encodings that match one of the 5-character
	 * prefixes above but don't actually have the copyright symbol in their
	 * character set.  Specifically, these are:
	 *
	 *   CP1122 (does have it, but EBCDIC)
	 *   CP1124
	 *   CP1125
	 *   ISO-8859-10
	 *   ISO-8859-11
	 *   ISO-8859-2
	 *   ISO-8859-3
	 *   ISO-8859-4
	 *   ISO-8859-5
	 *   ISO-8859-6
	 */
	if ((x == 0) != (*charset == '9')) {
		/*
		 * Either CP112x with x != '9' (no copyright symbol), or some
		 * other prefix followed by '9', which none of the accepted
		 * names listed above has in that position.
		 */
		goto no_conv;
	} else if (cindex == 2) {
		/*
		 * ISO-8859 special case.  Simply find and look at the final
		 * two digits.  The set bits in the 'accept' value indicate
		 * which encodings have the copyright symbol.
		 *
		 * Every remaining character is folded into 'collect' as one
		 * nibble, with '-' contributing a zero nibble, so the low
		 * byte ends up as 0x01 for "-1" and 0x15 for "-15".  Only the
		 * low five bits are then used as the bit number: bits 1, 7,
		 * 8 and 9 stand for parts 1, 7, 8 and 9, and bits 19, 20 and
		 * 21 (0x13, 0x14, 0x15) for parts 13, 14 and 15.  Bit 0 is
		 * also set, but no name listed above maps to it.
		 */
		uint_least32_t accept = 0x00380383;
		uint_least32_t collect = 0;
		char c;

		while ((c = *charset++)) {
			collect <<= 4;

			if (c != '-')
				collect |= c - '0';
		}

		/*
		 * An accepted part gives cindex == 1, which is also the
		 * offset of "\xa9" within codes.
		 */
		cindex = (accept >> (collect & 0x1f)) & 1;
		if (!cindex)
			goto no_conv;
	}

	return &codes[cindex];
no_conv:
	return &codes[FALLBACK_OFFSET];
}

#endif