2 * Copyright © 2023 Nick Bowler
4 * Helper function to output the copyright symbol in a specified encoding.
6 * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
7 * This is free software: you are free to do what the fuck you want to.
8 * There is NO WARRANTY, to the extent permitted by law.
/*
 * dx_u32 and dx_u64 are unsigned integer types at least 32 and 64 bits
 * wide, respectively.  The first pair of definitions uses the <stdint.h>
 * least-width types; the remaining ones fall back to basic types, with
 * plain unsigned chosen for dx_u32 only when UINT_MAX shows that it can
 * hold 32 bits, and unsigned long otherwise.
 * NOTE(review): the conditionals that select between these alternative
 * definitions are not visible in this excerpt -- confirm against the
 * complete file.
 */
22 typedef uint_least32_t dx_u32;
23 typedef uint_least64_t dx_u64;
26 #if UINT_MAX >= 0xffffffff
27 typedef unsigned dx_u32;
29 typedef unsigned long dx_u32;
31 typedef unsigned long long dx_u64;
/* Number of elements in the array x; x must be a real array, not a pointer. */
34 #define ARRAYSIZE(x) (sizeof (x) / sizeof (x)[0])
/*
 * Comparison callback with the signature expected by bsearch.
 *
 * key is the string being searched for and elem_ points at one table
 * element, which is an array of 5 characters.  At most sizeof *elem (5)
 * characters are compared, so this is a bounded prefix comparison.
 *
 * Returns the strncmp result: negative, zero or positive according to
 * whether key orders before, equal to or after the element.
 */
36 static int compar_5arr(const void *key, const void *elem_)
/* Reinterpret the untyped element pointer as a pointer to a 5-char array. */
38 const char (*elem)[5] = (void *)elem_;
/* The length limit is the element size, i.e. 5 characters. */
40 return strncmp(key, *elem, sizeof *elem);
44 * Return, as a multibyte string, the copyright symbol for the
45 * given character encoding, which is one of the strings returned
46 * by Gnulib's locale_charset function. In particular, we are
47 * looking for one of the strings:
80 * All of these are ASCII supersets. EBCDIC code pages like CP1122 are
81 * presently handled by returning (C), even if the character set does
82 * include the copyright symbol.
84 * To simplify the implementation, we allow some slop in the matching,
85 * as long as the result is valid for any actual encoding names.
87 * If NLS support is disabled, or if the character set does not
88 * include the copyright symbol, then the string (C) is returned
89 * in the C execution character set.
/*
 * Implementation outline, as far as the visible code shows: the charset
 * name is looked up in the prefix table t1 with bsearch, the table index
 * selects a 4-bit offset from the packed results constant, and names
 * which share a prefix with a supported encoding but lack the symbol are
 * filtered out afterwards.  For ISO-8859 the accept bitmask is tested at
 * bit position (collect & 0x1f).  The returned pointer refers to the
 * static const codes array, so it stays valid after the call and is
 * read-only for the caller.
 */
91 const char *copyright_symbol(const char *charset)
93 /* All known encodings of the copyright symbol */
94 static const char codes[] =
/*
 * Each encoding is followed by an explicit null byte, so an offset into
 * codes selects one null-terminated string.
 * NOTE(review): the byte sequence below looks like the GB18030 form of
 * the symbol -- confirm.
 */
101 "\x81\x30\x84\x38" "\0"
105 * We need the list below to be in lexicographic order in
106 * the C execution character encoding.
108 #if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
109 # error this character encoding is unsupported, please report a bug.
113 * For character sets that include the copyright symbol,
114 * the first 5 characters suffices to distinguish amongst
115 * all the different possible encodings.
117 static const char t1[][5] = {
136 * Each nibble in the results value contains the offset in the
137 * codes array for the corresponding index in t1, except that
138 * ISO-8859 matches the special value '2' (handled below).
140 dx_u64 results = 0x001921fb13777511;
/* Receives the bsearch result: a pointer to the matching row of t1. */
141 const char (*m1)[sizeof *t1];
/*
 * Binary search of t1 for the row matching charset; compar_5arr limits
 * the comparison to the 5 characters stored in each row.
 */
147 m1 = bsearch(charset, t1, ARRAYSIZE(t1), sizeof *t1, compar_5arr);
/*
 * Extract 4-bit field number x from results (x << 2 converts the field
 * number into a bit position).
 * NOTE(review): x is presumably the index of m1 within t1; the code that
 * computes it is not visible here.
 */
153 cindex = (results >> (x << 2)) & 0xf;
156 * We now need to identify encodings that match one of the 5-character
157 * prefixes above but don't actually have the copyright symbol in their
158 * character set. Specifically, these are:
160 * CP1122 (does have it, but EBCDIC)
171 if ((x == 0) != (*charset == '9')) {
172 /* CP112x, x != '9', no copyright symbol. */
174 } else if (cindex == 2) {
176 * ISO-8859 special case. Simply find and look at the final
177 * two digits. The set bits in the 'accept' value indicate
178 * which encodings have the copyright symbol.
180 dx_u32 accept = 0x00380383;
184 while ((c = *charset++)) {
191 cindex = (accept >> (collect & 0x1f)) & 1;
196 return &codes[cindex];