X-Git-Url: https://git.draconx.ca/gitweb/dxcommon.git/blobdiff_plain/54f9ce81ef444d6e869f7b32066928ed82c17dff..207d0dbfbbfa7ad70b59ada3a741895842958885:/src/copysym.c?ds=inline

diff --git a/src/copysym.c b/src/copysym.c
new file mode 100644
index 0000000..8ecace7
--- /dev/null
+++ b/src/copysym.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright © 2023 Nick Bowler
+ *
+ * Helper function to output the copyright symbol in a specified encoding.
+ *
+ * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
+ * This is free software: you are free to do what the fuck you want to.
+ * There is NO WARRANTY, to the extent permitted by law.
+ */
+
+#if HAVE_CONFIG_H
+#	include <config.h>
+#endif
+
+#if ENABLE_NLS
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "xtra.h"
+
+static int compar_5arr(const void *key, const void *elem_)
+{
+	const char (*elem)[5] = elem_; /* elem_ points at one 5-char table row */
+	/* bsearch comparator: at most 5 chars are compared, so rows need no NUL. */
+	return strncmp(key, *elem, sizeof *elem);
+}
+
+/*
+ * Return, as a multibyte string, the copyright symbol for the
+ * given character encoding, which is one of the strings returned
+ * by Gnulib's locale_charset function.  In particular, we are
+ * looking for one of the strings:
+ *
+ *     CP1129
+ *     CP1250
+ *     CP1251
+ *     CP1252
+ *     CP1253
+ *     CP1254
+ *     CP1256
+ *     CP1257
+ *     CP1258
+ *     CP775
+ *     CP850
+ *     CP856
+ *     CP857
+ *     CP869
+ *     CP922
+ *     GEORGIAN-PS
+ *     ISO-8859-1
+ *     ISO-8859-13
+ *     ISO-8859-14
+ *     ISO-8859-15
+ *     ISO-8859-7
+ *     ISO-8859-8
+ *     ISO-8859-9
+ *     PT154
+ *     EUC-JP
+ *     GB18030
+ *     KOI8-R
+ *     KOI8-T
+ *     KOI8-U
+ *     UTF-8
+ *
+ * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
+ * presently handled by returning (C), even if the character set does
+ * include the copyright symbol.
+ *
+ * To simplify the implementation, we allow some slop in the matching,
+ * as long as the result is valid for any actual encoding names.
+ *
+ * If NLS support is disabled, or if the character set does not
+ * include the copyright symbol, then the string (C) is returned
+ * in the C execution character set.
+ */
+const char *copyright_symbol(const char *charset)
+{
+	/* All known encodings of the copyright symbol, NUL-separated, then (C) */
+	static const char codes[] =
+		"\xc2\xa9"         "\0" /* 0: UTF-8; 1: the lone byte 0xa9 */
+		"\x97"             "\0" /* 3: CP869 */
+		"\xa8"             "\0" /* 5: CP775 */
+		"\xb8"             "\0" /* 7: CP850, CP856, CP857 */
+		"\xbf"             "\0" /* 9: KOI8- prefix */
+		"\x8f\xa2\xed"     "\0" /* 11: EUC-JP */
+		"\x81\x30\x84\x38" "\0" /* 15: GB18030 */
+		"(C)"; /* 20: fallback */
+
+	/*
+	 * bsearch needs t1 below sorted in the C execution character encoding.
+	 * NOTE(review): the adjacent pair 'I' < 'K' (ISO-8, KOI8-) is not tested.
+	 */
+#if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'K'>'P' || 'P'>'U'
+#  error this character encoding is unsupported, please report a bug.
+#endif
+
+	/*
+	 * For character sets that include the copyright symbol,
+	 * the first 5 characters suffice to distinguish amongst
+	 * all the different possible encodings.
+	 */
+	static const char t1[][5] = {
+		"CP112",
+		"CP125",
+		"CP775",
+		"CP850",
+		"CP856",
+		"CP857",
+		"CP869",
+		"CP922",
+		"EUC-J",
+		"GB180",
+		"GEORG",
+		"ISO-8",
+		"KOI8-",
+		"PT154",
+		"UTF-8"
+	};
+
+	/*
+	 * Nibble i of results (least significant first) is the offset into
+	 * codes for t1[i], except that ISO-8 holds the special value 2, which
+	 * is not the start of any entry in codes and is handled below.
+	 */
+	uint_least64_t results = 0x001921fb13777511;
+	const char (*m1)[sizeof *t1];
+	unsigned x, cindex; /* x: index into t1; cindex: offset into codes */
+
+	if (!charset)
+		goto no_conv;
+
+	m1 = bsearch(charset, t1, XTRA_ARRAYSIZE(t1), sizeof *t1, compar_5arr);
+	if (!m1)
+		goto no_conv;
+	charset += 5; /* skip the matched prefix; a match implies >= 5 chars */
+
+	x = m1-t1;
+	cindex = (results >> (x << 2)) & 0xf; /* extract nibble number x */
+
+	/*
+	 * We now need to identify encodings that match one of the 5-character
+	 * prefixes above but don't actually have the copyright symbol in their
+	 * character set.  Specifically, these are:
+	 *
+	 *   CP1122 (does have it, but EBCDIC)
+	 *   CP1124
+	 *   CP1125
+	 *   ISO-8859-10
+	 *   ISO-8859-11
+	 *   ISO-8859-2
+	 *   ISO-8859-3
+	 *   ISO-8859-4
+	 *   ISO-8859-5
+	 *   ISO-8859-6
+	 */
+	if ((x == 0) != (*charset == '9')) {
+		/* CP112x with x != '9', or any other prefix followed by '9'. */
+		goto no_conv;
+	} else if (cindex == 2) {
+		/*
+		 * ISO-8859 special case.  Pack the remaining characters as hex
+		 * nibbles ('-' adds 0) so the low 5 bits are N for "-N" and 16+N
+		 * for "-1N"; 'accept' has a bit set for each part with the symbol.
+		 */
+		uint_least32_t accept  = 0x00380383;
+		uint_least32_t collect = 0;
+		char c;
+
+		while ((c = *charset++)) {
+			collect <<= 4;
+
+			if (c != '-')
+				collect |= c - '0';
+		}
+
+		cindex = (accept >> (collect & 0x1f)) & 1; /* 1 is the 0xa9 offset */
+		if (!cindex)
+			goto no_conv;
+	}
+
+	return &codes[cindex];
+no_conv:
+	return &codes[20]; /* the (C) fallback at the end of codes */
+}
+
+#endif