git.draconx.ca Git - dxcommon.git/commitdiff
Add a dedicated function to emit the copyright symbol.
author Nick Bowler <nbowler@draconx.ca>
Sat, 27 May 2023 00:56:45 +0000 (20:56 -0400)
committer Nick Bowler <nbowler@draconx.ca>
Sat, 27 May 2023 01:17:52 +0000 (21:17 -0400)
This is intended to replace gnulib's str_iconv function in programs
when the only use of that function is to generate a copyright symbol
for output.  This implementation should be much more compact.

Makefile.am
src/copysym.c [new file with mode: 0644]
src/copysym.h [new file with mode: 0644]
t/.gitignore
t/copysym.c [new file with mode: 0644]
tests/functions.at

index 8b2aa908691a08b5235e8b947966cc7d946482fa..e064ddf905943e4153e1d20a1b50bc8c9ccbfd1f 100644 (file)
@@ -26,10 +26,21 @@ libglohelp_a_SOURCES = src/help.c
 libglohelp_a_CFLAGS = -DHELP_GETOPT_LONG_ONLY
 libglohelp_a_SHORTNAME = glo
 
+t_helpdesc_SOURCES = t/helpdesc.c src/help.c src/tap.c
 t_helpopt_SOURCES = t/helpopt.c src/help.c src/tap.c
 t_helpopt2_SOURCES = t/helpopt.c src/tap.c
 t_helpopt2_LDADD = $(libglohelp_a_OBJECTS)
-t_helpdesc_SOURCES = t/helpdesc.c src/help.c src/tap.c
+EXTRA_t_helpopt2_DEPENDENCIES = $(t_helpopt2_LDADD)
+
+check_PROGRAMS += t/copysym
+t_copysym_SOURCES = t/copysym.c src/tap.c
+t_copysym_LDADD = $(libnlscopysym_a_OBJECTS)
+EXTRA_t_copysym_DEPENDENCIES = $(t_copysym_LDADD)
+
+EXTRA_LIBRARIES += libnlscopysym.a
+libnlscopysym_a_SOURCES = src/copysym.c
+libnlscopysym_a_CFLAGS = -DENABLE_NLS
+libnlscopysym_a_SHORTNAME = nls
 
 DISTCLEANFILES =
 EXTRA_DIST =
diff --git a/src/copysym.c b/src/copysym.c
new file mode 100644 (file)
index 0000000..8ecace7
--- /dev/null
@@ -0,0 +1,187 @@
+/*
+ * Copyright © 2023 Nick Bowler
+ *
+ * Helper function to output the copyright symbol in a specified encoding.
+ *
+ * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
+ * This is free software: you are free to do what the fuck you want to.
+ * There is NO WARRANTY, to the extent permitted by law.
+ */
+
+#if HAVE_CONFIG_H
+#      include <config.h>
+#endif
+
+#if ENABLE_NLS
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "xtra.h"
+
+static int compar_5arr(const void *key, const void *elem_) /* bsearch comparator over 5-byte prefixes */
+{
+       const char (*elem)[5] = elem_; /* one table row: exactly 5 bytes, no NUL terminator */
+
+       return strncmp(key, *elem, sizeof *elem); /* looks at no more than the first 5 bytes of key */
+}
+
+/*
+ * Return, as a multibyte string, the copyright symbol for the
+ * given character encoding, which is one of the strings returned
+ * by Gnulib's locale_charset function.  In particular, we are
+ * looking for one of the strings:
+ *
+ *     CP1129
+ *     CP1250
+ *     CP1251
+ *     CP1252
+ *     CP1253
+ *     CP1254
+ *     CP1255
+ *     CP1256
+ *     CP1257
+ *     CP1258
+ *     CP775
+ *     CP850
+ *     CP856
+ *     CP857
+ *     CP869
+ *     CP922
+ *     GEORGIAN-PS
+ *     ISO-8859-1
+ *     ISO-8859-13
+ *     ISO-8859-14
+ *     ISO-8859-15
+ *     ISO-8859-7
+ *     ISO-8859-8
+ *     ISO-8859-9
+ *     PT154
+ *     EUC-JP
+ *     GB18030
+ *     KOI8-R
+ *     KOI8-T
+ *     KOI8-U
+ *     UTF-8
+ *
+ * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
+ * presently handled by returning (C), even if the character set does
+ * include the copyright symbol.
+ *
+ * To simplify the implementation, we allow some slop in the matching,
+ * as long as the result is valid for any actual encoding names.
+ *
+ * If NLS support is disabled, or if the character set does not
+ * include the copyright symbol, then the string (C) is returned
+ * in the C execution character set.  A null charset pointer is
+ * treated the same way.
+ *
+ * The returned pointer refers to a static constant array inside this
+ * function, so the caller must neither modify nor free it.
+ */
+const char *copyright_symbol(const char *charset)
+{
+       /*
+        * All known encodings of the copyright symbol, packed into a
+        * single array and addressed by byte offset.  Offset 1 is the
+        * second byte of the first entry, which therefore doubles as
+        * the one-byte string "\xa9".
+        */
+       static const char codes[] =
+               "\xc2\xa9"         "\0" /* offset  0 (offset 1 is "\xa9") */
+               "\x97"             "\0" /* offset  3 */
+               "\xa8"             "\0" /* offset  5 */
+               "\xb8"             "\0" /* offset  7 */
+               "\xbf"             "\0" /* offset  9 */
+               "\x8f\xa2\xed"     "\0" /* offset 11 */
+               "\x81\x30\x84\x38" "\0" /* offset 15 */
+               "(C)";                  /* offset 20, the fallback */
+
+       /*
+        * We need the list below to be in lexicographic order in
+        * the C execution character encoding.
+        */
+#if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
+#  error this character encoding is unsupported, please report a bug.
+#endif
+
+       /*
+        * For character sets that include the copyright symbol,
+        * the first 5 characters suffices to distinguish amongst
+        * all the different possible encodings.
+        *
+        * The rows are exactly 5 bytes wide, so they carry no NUL
+        * terminator, and they are searched with bsearch below.
+        */
+       static const char t1[][5] = {
+               "CP112",
+               "CP125",
+               "CP775",
+               "CP850",
+               "CP856",
+               "CP857",
+               "CP869",
+               "CP922",
+               "EUC-J",
+               "GB180",
+               "GEORG",
+               "ISO-8",
+               "KOI8-",
+               "PT154",
+               "UTF-8"
+       };
+
+       /*
+        * Each nibble in the results value contains the offset in the
+        * codes array for the corresponding index in t1, except that
+        * ISO-8859 matches the special value '2' (handled below).
+        *
+        * Nibble i, counting from the least significant end, belongs
+        * to t1[i]; for example the lowest nibble (1) is for "CP112"
+        * and nibble 14 (0) is for "UTF-8".
+        */
+       uint_least64_t results = 0x001921fb13777511;
+       const char (*m1)[sizeof *t1];
+       unsigned x, cindex;
+
+       if (!charset)
+               goto no_conv;
+
+       m1 = bsearch(charset, t1, XTRA_ARRAYSIZE(t1), sizeof *t1, compar_5arr);
+       if (!m1)
+               goto no_conv;
+       charset += 5; /* step past the matched 5-character prefix */
+
+       x = m1-t1; /* index of the matched row within t1 */
+       cindex = (results >> (x << 2)) & 0xf; /* extract nibble x of results */
+
+       /*
+        * We now need to identify encodings that match one of the 5-character
+        * prefixes above but don't actually have the copyright symbol in their
+        * character set.  Specifically, these are:
+        *
+        *   CP1122 (does have it, but EBCDIC)
+        *   CP1124
+        *   CP1125
+        *   ISO-8859-10
+        *   ISO-8859-11
+        *   ISO-8859-2
+        *   ISO-8859-3
+        *   ISO-8859-4
+        *   ISO-8859-5
+        *   ISO-8859-6
+        */
+       if ((x == 0) != (*charset == '9')) {
+               /*
+                * CP112x, x != '9', no copyright symbol.
+                *
+                * (The 'x' in "CP112x" is the sixth character of the name,
+                * not the variable x.)  The same test also rejects any other
+                * prefix whose sixth character is '9'; none of the accepted
+                * names listed in the function comment has a '9' there.
+                */
+               goto no_conv;
+       } else if (cindex == 2) {
+               /*
+                * ISO-8859 special case.  Simply find and look at the final
+                * two digits.  The set bits in the 'accept' value indicate
+                * which encodings have the copyright symbol.
+                *
+                * Every remaining character is shifted in as one nibble,
+                * with '-' contributing zero, so a name ending in "-1"
+                * collects a value ending in 0x01 and one ending in "-13"
+                * collects a value ending in 0x13.  The low five bits of
+                * that value pick a bit of 'accept': bits 1, 7, 8 and 9
+                * stand for parts 1, 7, 8 and 9, and bits 19, 20 and 21
+                * (0x13, 0x14 and 0x15) stand for parts 13, 14 and 15.
+                * Bit 0 is set as well; NOTE(review): none of the listed
+                * names selects it, presumably harmless slop -- confirm.
+                *
+                * An accepted name leaves cindex equal to 1, which is the
+                * offset of "\xa9" in the codes array.
+                */
+               uint_least32_t accept  = 0x00380383;
+               uint_least32_t collect = 0;
+               char c;
+
+               while ((c = *charset++)) {
+                       collect <<= 4;
+
+                       if (c != '-')
+                               collect |= c - '0';
+               }
+
+               cindex = (accept >> (collect & 0x1f)) & 1;
+               if (!cindex)
+                       goto no_conv;
+       }
+
+       return &codes[cindex];
+no_conv:
+       return &codes[20]; /* the "(C)" fallback at the end of codes */
+}
+
+#endif
diff --git a/src/copysym.h b/src/copysym.h
new file mode 100644 (file)
index 0000000..52e0ba0
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * Copyright © 2023 Nick Bowler
+ *
+ * Helper function to output the copyright symbol in a specified encoding.
+ *
+ * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
+ * This is free software: you are free to do what the fuck you want to.
+ * There is NO WARRANTY, to the extent permitted by law.
+ */
+
+#ifndef DX_COPYSYM_H_
+#define DX_COPYSYM_H_
+
+#if ENABLE_NLS /* with NLS, copyright_symbol is a real function taking a charset name */
+const char *copyright_symbol(const char *charset);
+#else /* !ENABLE_NLS */
+#define copyright_symbol(x) "(C)" /* the argument is dropped without being evaluated */
+#endif
+
+#endif
index acb4e4e5598a482baabbdaea19b50e0c2ba38f25..7f1544127ab5feb4ab9519135f76b059f0b9f89b 100644 (file)
@@ -1,3 +1,4 @@
+/copysym
 /helpdesc
 /helpopt
 /helpopt2
diff --git a/t/copysym.c b/t/copysym.c
new file mode 100644 (file)
index 0000000..ab5eb17
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2023 Nick Bowler
+ *
+ * Tests for the copyright_symbol function.
+ *
+ * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
+ * This is free software: you are free to do what the fuck you want to.
+ * There is NO WARRANTY, to the extent permitted by law.
+ */
+
+#include <string.h>
+#include "tap.h"
+
+#define ENABLE_NLS 1
+#include "copysym.h"
+
+static const char *format_str(const char *s) /* render s in a printable form for diagnostics */
+{
+       static char buf[100]; /* the result lives here, so each call overwrites the previous one */
+       size_t pos = 0;
+
+       for (; *s && pos < sizeof buf - 4; s++) { /* stop once a 4-character escape plus the NUL might not fit; longer input is cut short */
+               if (*s == '(' || *s == 'C' || *s == ')') { /* the characters of "(C)" are copied as they are */
+                       buf[pos++] = *s;
+               } else { /* every other byte is written as a backslash, 'x' and two lowercase hex digits */
+                       static const char xdigits[16] = "0123456789abcdef"; /* exactly 16 bytes, no NUL terminator */
+                       unsigned char c = *s; /* take the byte as unsigned before splitting it into nibbles */
+
+                       buf[pos++] = '\\';
+                       buf[pos++] = 'x';
+                       buf[pos++] = xdigits[(c >> 4) & 0xf];
+                       buf[pos++] = xdigits[c & 0xf];
+               }
+       }
+
+       buf[pos] = 0;
+       return buf;
+}
+
+static void do_test(const char *charset, char *expected) /* one test case: charset may be NULL; NOTE(review): expected is never written through and could be const char * */
+{
+       const char *sym = copyright_symbol(charset);
+       char *quote = charset ? "\"" : ""; /* the test name shows a real name in quotes and a null pointer as bare NULL */
+
+       if (!tap_result(!strcmp(sym, expected), "copyright_symbol(%s%s%s)",
+                       quote, charset ? charset : "NULL", quote)) /* NOTE(review): presumably tap_result returns false when the check failed -- confirm in tap.h */
+       {
+               tap_diag("Failed, unexpected result");
+               tap_diag("   Received:  %s", format_str(sym)); /* format_str reuses one static buffer, hence a separate call per string */
+               tap_diag("   Expected:  %s", format_str(expected));
+       }
+}
+
+int main(void) /* runs one do_test case per charset name, then finishes the TAP run */
+{
+       do_test(NULL, "(C)"); /* a null charset must give the fallback string */
+       do_test("ANSI_X3.4-1968", "(C)");
+       do_test("ARMSCII-8", "(C)");
+       do_test("ASCII", "(C)");
+       do_test("BIG5", "(C)");
+       do_test("BIG5-HKSCS", "(C)");
+       do_test("CP1046", "(C)");
+       do_test("CP1124", "(C)");
+       do_test("CP1125", "(C)");
+       do_test("CP1129", "\xa9");
+       do_test("CP1131", "(C)");
+       do_test("CP1250", "\xa9");
+       do_test("CP1251", "\xa9");
+       do_test("CP1252", "\xa9");
+       do_test("CP1253", "\xa9");
+       do_test("CP1254", "\xa9");
+       do_test("CP1255", "\xa9");
+       do_test("CP1256", "\xa9");
+       do_test("CP1257", "\xa9");
+       do_test("CP437", "(C)");
+       do_test("CP775", "\xa8");
+       do_test("CP850", "\xb8");
+       do_test("CP852", "(C)");
+       do_test("CP855", "(C)");
+       do_test("CP856", "\xb8");
+       do_test("CP857", "\xb8");
+       do_test("CP861", "(C)");
+       do_test("CP862", "(C)");
+       do_test("CP864", "(C)");
+       do_test("CP865", "(C)");
+       do_test("CP866", "(C)");
+       do_test("CP869", "\x97");
+       do_test("CP874", "(C)");
+       do_test("CP922", "\xa9");
+       do_test("CP932", "(C)");
+       do_test("CP943", "(C)");
+       do_test("CP949", "(C)");
+       do_test("CP950", "(C)");
+       do_test("DEC-HANYU", "(C)");
+       do_test("DEC-KANJI", "(C)");
+       do_test("EUC-JP", "\x8f\xa2\xed");
+       do_test("EUC-KR", "(C)");
+       do_test("EUC-TW", "(C)");
+       do_test("GB18030", "\x81\x30\x84\x38");
+       do_test("GB2312", "(C)");
+       do_test("GBK", "(C)");
+       do_test("GEORGIAN-PS", "\xa9");
+       do_test("HP-ARABIC8", "(C)");
+       do_test("HP-GREEK8", "(C)");
+       do_test("HP-HEBREW8", "(C)");
+       do_test("HP-KANA8", "(C)");
+       do_test("HP-ROMAN8", "(C)");
+       do_test("HP-TURKISH8", "(C)");
+       do_test("ISO-8859-1", "\xa9");
+       do_test("ISO-8859-10", "(C)");
+       do_test("ISO-8859-11", "(C)");
+       do_test("ISO-8859-13", "\xa9");
+       do_test("ISO-8859-14", "\xa9");
+       do_test("ISO-8859-15", "\xa9");
+       do_test("ISO-8859-2", "(C)");
+       do_test("ISO-8859-3", "(C)");
+       do_test("ISO-8859-4", "(C)");
+       do_test("ISO-8859-5", "(C)");
+       do_test("ISO-8859-6", "(C)");
+       do_test("ISO-8859-7", "\xa9");
+       do_test("ISO-8859-8", "\xa9");
+       do_test("ISO-8859-9", "\xa9");
+       do_test("JOHAB", "(C)");
+       do_test("KOI8-R", "\xbf");
+       do_test("KOI8-T", "\xbf");
+       do_test("KOI8-U", "\xbf");
+       do_test("PT154", "\xa9");
+       do_test("SHIFT_JIS", "(C)");
+       do_test("TCVN5712-1", "(C)");
+       do_test("TIS-620", "(C)");
+       do_test("UTF-8", "\xc2\xa9");
+       do_test("VISCII", "(C)");
+
+       do_test("CP1026", "(C)"); // EBCDIC B4 (the hex value is presumably the byte where this code page keeps the symbol -- TODO confirm)
+       do_test("CP1047", "(C)"); // EBCDIC B4
+       do_test("CP1112", "(C)"); // EBCDIC B4
+       do_test("CP1122", "(C)"); // EBCDIC B4
+       do_test("CP1130", "(C)"); // EBCDIC B4
+       do_test("CP1140", "(C)"); // EBCDIC B4
+       do_test("CP1141", "(C)"); // EBCDIC B4
+       do_test("CP1142", "(C)"); // EBCDIC B4
+       do_test("CP1143", "(C)"); // EBCDIC B4
+       do_test("CP1144", "(C)"); // EBCDIC B4
+       do_test("CP1145", "(C)"); // EBCDIC B4
+       do_test("CP1146", "(C)"); // EBCDIC B4
+       do_test("CP1147", "(C)"); // EBCDIC B4
+       do_test("CP1148", "(C)"); // EBCDIC B4
+       do_test("CP1149", "(C)"); // EBCDIC B4
+       do_test("CP1155", "(C)"); // EBCDIC B4
+       do_test("CP1156", "(C)"); // EBCDIC B4
+       do_test("CP1157", "(C)"); // EBCDIC B4
+       do_test("CP1164", "(C)"); // EBCDIC B4
+       do_test("CP273",  "(C)"); // EBCDIC B4
+       do_test("CP277",  "(C)"); // EBCDIC B4
+       do_test("CP278",  "(C)"); // EBCDIC B4
+       do_test("CP280",  "(C)"); // EBCDIC B4
+       do_test("CP284",  "(C)"); // EBCDIC B4
+       do_test("CP285",  "(C)"); // EBCDIC B4
+       do_test("CP297",  "(C)"); // EBCDIC B4
+       do_test("CP424",  "(C)"); // EBCDIC B4
+       do_test("CP500",  "(C)"); // EBCDIC B4
+       do_test("CP871",  "(C)"); // EBCDIC B4
+       do_test("CP875",  "(C)"); // EBCDIC FB
+
+       tap_done(); /* NOTE(review): no return statement follows; presumably tap_done reports the totals and ends the run -- confirm in tap.h */
+}
index d2006df045889efac3ee51c6d5112fc7240c35ef..ec7b6f3184078846c5cc57a2a9b858aa7ecfd7c1 100644 (file)
@@ -127,3 +127,7 @@ AT_CHECK([m4_join([ ],
 ]])
 
 AT_CLEANUP
+
+AT_BANNER([Miscellaneous functions])
+
+TEST_TAP_SIMPLE([copyright_symbol], [copysym], [], [])