From 207d0dbfbbfa7ad70b59ada3a741895842958885 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Fri, 26 May 2023 20:56:45 -0400 Subject: [PATCH] Add a dedicated function to emit the copyright symbol. This is intended to replace gnulib's str_iconv function in programs when the only use of that function is to generate a copyright symbol for output. This implementation should be much more compact. --- Makefile.am | 13 +++- src/copysym.c | 187 +++++++++++++++++++++++++++++++++++++++++++++ src/copysym.h | 20 +++++ t/.gitignore | 1 + t/copysym.c | 166 ++++++++++++++++++++++++++++++++++++++++ tests/functions.at | 4 + 6 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 src/copysym.c create mode 100644 src/copysym.h create mode 100644 t/copysym.c diff --git a/Makefile.am b/Makefile.am index 8b2aa90..e064ddf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -26,10 +26,21 @@ libglohelp_a_SOURCES = src/help.c libglohelp_a_CFLAGS = -DHELP_GETOPT_LONG_ONLY libglohelp_a_SHORTNAME = glo +t_helpdesc_SOURCES = t/helpdesc.c src/help.c src/tap.c t_helpopt_SOURCES = t/helpopt.c src/help.c src/tap.c t_helpopt2_SOURCES = t/helpopt.c src/tap.c t_helpopt2_LDADD = $(libglohelp_a_OBJECTS) -t_helpdesc_SOURCES = t/helpdesc.c src/help.c src/tap.c +EXTRA_t_helpopt2_DEPENDENCIES = $(t_helpopt2_LDADD) + +check_PROGRAMS += t/copysym +t_copysym_SOURCES = t/copysym.c src/tap.c +t_copysym_LDADD = $(libnlscopysym_a_OBJECTS) +EXTRA_t_copysym_DEPENDENCIES = $(t_copysym_LDADD) + +EXTRA_LIBRARIES += libnlscopysym.a +libnlscopysym_a_SOURCES = src/copysym.c +libnlscopysym_a_CFLAGS = -DENABLE_NLS +libnlscopysym_a_SHORTNAME = nls DISTCLEANFILES = EXTRA_DIST = diff --git a/src/copysym.c b/src/copysym.c new file mode 100644 index 0000000..8ecace7 --- /dev/null +++ b/src/copysym.c @@ -0,0 +1,187 @@ +/* + * Copyright © 2023 Nick Bowler + * + * Helper function to output the copyright symbol in a specified encoding. + * + * License WTFPL2: Do What The Fuck You Want To Public License, version 2. 
+ * This is free software: you are free to do what the fuck you want to. + * There is NO WARRANTY, to the extent permitted by law. + */ + +#if HAVE_CONFIG_H +# include +#endif + +#if ENABLE_NLS + +#include +#include +#include +#include "xtra.h" +/* bsearch comparator: compare the first 5 bytes of the string key against one 5-byte table entry (which has no terminating NUL) using strncmp. */ +static int compar_5arr(const void *key, const void *elem_) +{ + const char (*elem)[5] = elem_; + + return strncmp(key, *elem, sizeof *elem); +} + +/* + * Return, as a multibyte string, the copyright symbol for the + * given character encoding, which is one of the strings returned + * by Gnulib's locale_charset function. In particular, we are + * looking for one of the strings: + * + * CP1129 + * CP1250 + * CP1251 + * CP1252 + * CP1253 + * CP1254 + * CP1256 + * CP1257 + * CP1258 + * CP775 + * CP850 + * CP856 + * CP857 + * CP869 + * CP922 + * GEORGIAN-PS + * ISO-8859-1 + * ISO-8859-13 + * ISO-8859-14 + * ISO-8859-15 + * ISO-8859-7 + * ISO-8859-8 + * ISO-8859-9 + * PT154 + * EUC-JP + * GB18030 + * KOI8-R + * KOI8-T + * KOI8-U + * UTF-8 + * + * All of these are ASCII supersets. EBCDIC code pages like CP1122 are + * presently handled by returning (C), even if the character set does + * include the copyright symbol. + * + * To simplify the implementation, we allow some slop in the matching, + * as long as the result is valid for any actual encoding names. For example, CP1255 (not listed above) matches the CP125 prefix and yields the byte A9. + * + * If NLS support is disabled, or if the character set does not + * include the copyright symbol, then the string (C) is returned + * in the C execution character set. A null charset also yields (C). The returned pointer refers into a static const table, so it must not be modified or freed. + */ +const char *copyright_symbol(const char *charset) +{ + /* All known encodings of the copyright symbol, NUL-separated. Offsets: 0 = C2 A9 (UTF-8), 1 = A9 alone (the tail of the UTF-8 entry), 3 = 97, 5 = A8, 7 = B8, 9 = BF, 11 = 8F A2 ED (EUC-JP), 15 = 81 30 84 38 (GB18030), 20 = the (C) fallback. */ + static const char codes[] = + "\xc2\xa9" "\0" + "\x97" "\0" + "\xa8" "\0" + "\xb8" "\0" + "\xbf" "\0" + "\x8f\xa2\xed" "\0" + "\x81\x30\x84\x38" "\0" + "(C)"; + + /* + * We need the list below to be in lexicographic order in + * the C execution character encoding. 
+ */ +#if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U' +# error this character encoding is unsupported, please report a bug. +#endif + + /* + * For character sets that include the copyright symbol, + * the first 5 characters suffice to distinguish amongst + * all the different possible encodings. Each entry is exactly 5 bytes with no terminating NUL; the table is searched with bsearch, hence the ordering requirement checked above. + */ + static const char t1[][5] = { + "CP112", + "CP125", + "CP775", + "CP850", + "CP856", + "CP857", + "CP869", + "CP922", + "EUC-J", + "GB180", + "GEORG", + "ISO-8", + "KOI8-", + "PT154", + "UTF-8" + }; + + /* + * Each nibble in the results value contains the offset in the + * codes array for the corresponding index in t1, except that + * ISO-8859 matches the special value '2' (handled below). The least significant nibble belongs to t1[0]; offset 2 is otherwise only the NUL that terminates the first codes entry. + */ + uint_least64_t results = 0x001921fb13777511; + const char (*m1)[sizeof *t1]; + unsigned x, cindex; + /* A null charset is treated like an unrecognized encoding. */ + if (!charset) + goto no_conv; + + m1 = bsearch(charset, t1, XTRA_ARRAYSIZE(t1), sizeof *t1, compar_5arr); + if (!m1) + goto no_conv; + charset += 5; /* skip the matched prefix; a match implies at least 5 non-NUL bytes */ + /* x is the index of the matched prefix in t1; cindex is its nibble from results. */ + x = m1-t1; + cindex = (results >> (x << 2)) & 0xf; + + /* + * We now need to identify encodings that match one of the 5-character + * prefixes above but don't actually have the copyright symbol in their + * character set. Specifically, these are: + * + * CP1122 (does have it, but EBCDIC) + * CP1124 + * CP1125 + * ISO-8859-10 + * ISO-8859-11 + * ISO-8859-2 + * ISO-8859-3 + * ISO-8859-4 + * ISO-8859-5 + * ISO-8859-6 + */ + if ((x == 0) != (*charset == '9')) { + /* Either a CP112 name whose next character is not '9' (only CP1129 is accepted), or some other matched prefix followed by '9': no copyright symbol. */ + goto no_conv; + } else if (cindex == 2) { + /* + * ISO-8859 special case. Simply find and look at the final + * two digits. The set bits in the 'accept' value indicate + * which encodings have the copyright symbol. Each remaining character is shifted in as one nibble ('-' contributes 0), so the low 5 bits of 'collect' are e.g. 0x07 for a name ending in -7 and 0x15 for one ending in -15. 
+ */ + uint_least32_t accept = 0x00380383; /* set bits: 0x00, 0x01, 0x07-0x09 and 0x13-0x15 */ + uint_least32_t collect = 0; + char c; + + while ((c = *charset++)) { + collect <<= 4; + + if (c != '-') + collect |= c - '0'; + } + /* A set bit doubles as the result: offset 1 in codes is the single byte A9. */ + cindex = (accept >> (collect & 0x1f)) & 1; + if (!cindex) + goto no_conv; + } + + return &codes[cindex]; +no_conv: + return &codes[20]; /* offset of the (C) fallback at the end of codes */ +} + +#endif diff --git a/src/copysym.h b/src/copysym.h new file mode 100644 index 0000000..52e0ba0 --- /dev/null +++ b/src/copysym.h @@ -0,0 +1,20 @@ +/* + * Copyright © 2023 Nick Bowler + * + * Helper function to output the copyright symbol in a specified encoding. + * + * License WTFPL2: Do What The Fuck You Want To Public License, version 2. + * This is free software: you are free to do what the fuck you want to. + * There is NO WARRANTY, to the extent permitted by law. + */ + +#ifndef DX_COPYSYM_H_ +#define DX_COPYSYM_H_ +/* With ENABLE_NLS the function is declared; otherwise copyright_symbol is a macro that ignores its argument and expands to the (C) string literal. */ +#if ENABLE_NLS +const char *copyright_symbol(const char *charset); +#else +#define copyright_symbol(x) "(C)" +#endif + +#endif diff --git a/t/.gitignore b/t/.gitignore index acb4e4e..7f15441 100644 --- a/t/.gitignore +++ b/t/.gitignore @@ -1,3 +1,4 @@ +/copysym /helpdesc /helpopt /helpopt2 diff --git a/t/copysym.c b/t/copysym.c new file mode 100644 index 0000000..ab5eb17 --- /dev/null +++ b/t/copysym.c @@ -0,0 +1,166 @@ +/* + * Copyright © 2023 Nick Bowler + * + * Tests for the copyright_symbol function. + * + * License WTFPL2: Do What The Fuck You Want To Public License, version 2. + * This is free software: you are free to do what the fuck you want to. + * There is NO WARRANTY, to the extent permitted by law. 
+ */ + +#include +#include "tap.h" + +#define ENABLE_NLS 1 +#include "copysym.h" +/* Render s for diagnostics: the characters '(' , 'C' and ')' are copied verbatim and every other byte becomes a \xNN escape with lowercase hex digits. The result lives in a static 100-byte buffer that the next call overwrites; conversion stops once pos reaches sizeof buf - 4, so the output and its terminator always fit. */ +static const char *format_str(const char *s) +{ + static char buf[100]; + size_t pos = 0; + + for (; *s && pos < sizeof buf - 4; s++) { + if (*s == '(' || *s == 'C' || *s == ')') { + buf[pos++] = *s; + } else { + static const char xdigits[16] = "0123456789abcdef"; + unsigned char c = *s; + + buf[pos++] = '\\'; + buf[pos++] = 'x'; + buf[pos++] = xdigits[(c >> 4) & 0xf]; + buf[pos++] = xdigits[c & 0xf]; + } + } + + buf[pos] = 0; + return buf; +} +/* Run one test case: the string returned by copyright_symbol(charset) must compare equal to expected. charset may be NULL, in which case the test name shows NULL unquoted. When tap_result returns zero, both strings are printed in escaped form via tap_diag. */ +static void do_test(const char *charset, char *expected) +{ + const char *sym = copyright_symbol(charset); + char *quote = charset ? "\"" : ""; + + if (!tap_result(!strcmp(sym, expected), "copyright_symbol(%s%s%s)", + quote, charset ? charset : "NULL", quote)) + { + tap_diag("Failed, unexpected result"); + tap_diag(" Received: %s", format_str(sym)); + tap_diag(" Expected: %s", format_str(expected)); + } +} +/* Table-driven test: one do_test call per charset name with its expected byte sequence, followed by a group of code pages marked EBCDIC that all expect the (C) fallback. */ +int main(void) +{ + do_test(NULL, "(C)"); + do_test("ANSI_X3.4-1968", "(C)"); + do_test("ARMSCII-8", "(C)"); + do_test("ASCII", "(C)"); + do_test("BIG5", "(C)"); + do_test("BIG5-HKSCS", "(C)"); + do_test("CP1046", "(C)"); + do_test("CP1124", "(C)"); + do_test("CP1125", "(C)"); + do_test("CP1129", "\xa9"); + do_test("CP1131", "(C)"); + do_test("CP1250", "\xa9"); + do_test("CP1251", "\xa9"); + do_test("CP1252", "\xa9"); + do_test("CP1253", "\xa9"); + do_test("CP1254", "\xa9"); + do_test("CP1255", "\xa9"); + do_test("CP1256", "\xa9"); + do_test("CP1257", "\xa9"); + do_test("CP437", "(C)"); + do_test("CP775", "\xa8"); + do_test("CP850", "\xb8"); + do_test("CP852", "(C)"); + do_test("CP855", "(C)"); + do_test("CP856", "\xb8"); + do_test("CP857", "\xb8"); + do_test("CP861", "(C)"); + do_test("CP862", "(C)"); + do_test("CP864", "(C)"); + do_test("CP865", "(C)"); + do_test("CP866", "(C)"); + do_test("CP869", "\x97"); + do_test("CP874", "(C)"); + do_test("CP922", "\xa9"); + do_test("CP932", "(C)"); + do_test("CP943", "(C)"); + do_test("CP949", 
"(C)"); + do_test("CP950", "(C)"); + do_test("DEC-HANYU", "(C)"); + do_test("DEC-KANJI", "(C)"); + do_test("EUC-JP", "\x8f\xa2\xed"); + do_test("EUC-KR", "(C)"); + do_test("EUC-TW", "(C)"); + do_test("GB18030", "\x81\x30\x84\x38"); + do_test("GB2312", "(C)"); + do_test("GBK", "(C)"); + do_test("GEORGIAN-PS", "\xa9"); + do_test("HP-ARABIC8", "(C)"); + do_test("HP-GREEK8", "(C)"); + do_test("HP-HEBREW8", "(C)"); + do_test("HP-KANA8", "(C)"); + do_test("HP-ROMAN8", "(C)"); + do_test("HP-TURKISH8", "(C)"); + do_test("ISO-8859-1", "\xa9"); + do_test("ISO-8859-10", "(C)"); + do_test("ISO-8859-11", "(C)"); + do_test("ISO-8859-13", "\xa9"); + do_test("ISO-8859-14", "\xa9"); + do_test("ISO-8859-15", "\xa9"); + do_test("ISO-8859-2", "(C)"); + do_test("ISO-8859-3", "(C)"); + do_test("ISO-8859-4", "(C)"); + do_test("ISO-8859-5", "(C)"); + do_test("ISO-8859-6", "(C)"); + do_test("ISO-8859-7", "\xa9"); + do_test("ISO-8859-8", "\xa9"); + do_test("ISO-8859-9", "\xa9"); + do_test("JOHAB", "(C)"); + do_test("KOI8-R", "\xbf"); + do_test("KOI8-T", "\xbf"); + do_test("KOI8-U", "\xbf"); + do_test("PT154", "\xa9"); + do_test("SHIFT_JIS", "(C)"); + do_test("TCVN5712-1", "(C)"); + do_test("TIS-620", "(C)"); + do_test("UTF-8", "\xc2\xa9"); + do_test("VISCII", "(C)"); + + do_test("CP1026", "(C)"); // EBCDIC B4 + do_test("CP1047", "(C)"); // EBCDIC B4 + do_test("CP1112", "(C)"); // EBCDIC B4 + do_test("CP1122", "(C)"); // EBCDIC B4 + do_test("CP1130", "(C)"); // EBCDIC B4 + do_test("CP1140", "(C)"); // EBCDIC B4 + do_test("CP1141", "(C)"); // EBCDIC B4 + do_test("CP1142", "(C)"); // EBCDIC B4 + do_test("CP1143", "(C)"); // EBCDIC B4 + do_test("CP1144", "(C)"); // EBCDIC B4 + do_test("CP1145", "(C)"); // EBCDIC B4 + do_test("CP1146", "(C)"); // EBCDIC B4 + do_test("CP1147", "(C)"); // EBCDIC B4 + do_test("CP1148", "(C)"); // EBCDIC B4 + do_test("CP1149", "(C)"); // EBCDIC B4 + do_test("CP1155", "(C)"); // EBCDIC B4 + do_test("CP1156", "(C)"); // EBCDIC B4 + do_test("CP1157", "(C)"); // EBCDIC 
B4 + do_test("CP1164", "(C)"); // EBCDIC B4 + do_test("CP273", "(C)"); // EBCDIC B4 + do_test("CP277", "(C)"); // EBCDIC B4 + do_test("CP278", "(C)"); // EBCDIC B4 + do_test("CP280", "(C)"); // EBCDIC B4 + do_test("CP284", "(C)"); // EBCDIC B4 + do_test("CP285", "(C)"); // EBCDIC B4 + do_test("CP297", "(C)"); // EBCDIC B4 + do_test("CP424", "(C)"); // EBCDIC B4 + do_test("CP500", "(C)"); // EBCDIC B4 + do_test("CP871", "(C)"); // EBCDIC B4 + do_test("CP875", "(C)"); // EBCDIC FB + + tap_done(); +} diff --git a/tests/functions.at b/tests/functions.at index d2006df..ec7b6f3 100644 --- a/tests/functions.at +++ b/tests/functions.at @@ -127,3 +127,7 @@ AT_CHECK([m4_join([ ], ]]) AT_CLEANUP + +AT_BANNER([Miscellaneous functions]) + +TEST_TAP_SIMPLE([copyright_symbol], [copysym], [], []) -- 2.43.2