/*
 * Copyright © 2023-2024 Nick Bowler
 *
 * Helper function to output the copyright symbol in a specified encoding.
 *
 * License WTFPL2: Do What The Fuck You Want To Public License, version 2.
 * This is free software: you are free to do what the fuck you want to.
 * There is NO WARRANTY, to the extent permitted by law.
 */

#if HAVE_CONFIG_H
#	include <config.h>
#endif

#if ENABLE_NLS

#include <stdlib.h>
#include <string.h>
#include <stddef.h>

/* dx_u32: an unsigned integer type that is at least 32 bits wide. */
#if HAVE_INTTYPES_H
#	include <inttypes.h>
typedef uint_least32_t dx_u32;
#else
#	include <limits.h>
#	if UINT_MAX < 0xffffffff
/* Plain unsigned is too narrow here; fall back to unsigned long. */
typedef unsigned long dx_u32;
#	else
typedef unsigned dx_u32;
#	endif
#endif

/*
 * Binary search every element of arr for key, using the bsearch-style
 * comparison function cmp.  The element count and size are derived with
 * sizeof, so arr must be an actual array object, not a pointer.
 */
#define BSEARCH_ARRAY(key, arr, cmp) \
	bsearch((key), (arr), sizeof (arr) / sizeof ((arr)[0]), \
	        sizeof ((arr)[0]), (cmp))

/* Number of leading characters of a name that take part in the lookup. */
enum { PREFIXLEN = 5 };

/*
 * Comparison function for bsearch.  key is a string and elem_ points to
 * a table entry of PREFIXLEN+1 bytes; only the first PREFIXLEN characters
 * of the two are compared, so the entry's trailing byte is ignored.
 */
static int compar_prefix(const void *key, const void *elem_)
{
	const char *entry = elem_;

	return strncmp(key, entry, PREFIXLEN);
}

/*
 * Return, as a multibyte string, the copyright symbol for the
 * given character encoding, which is one of the strings returned
 * by Gnulib's locale_charset function.  In particular, we are
 * looking for one of the strings:
 *
 *     CP1129
 *     CP1250
 *     CP1251
 *     CP1252
 *     CP1253
 *     CP1254
 *     CP1256
 *     CP1257
 *     CP1258
 *     CP775
 *     CP850
 *     CP856
 *     CP857
 *     CP869
 *     CP922
 *     GEORGIAN-PS
 *     ISO-8859-1
 *     ISO-8859-13
 *     ISO-8859-14
 *     ISO-8859-15
 *     ISO-8859-7
 *     ISO-8859-8
 *     ISO-8859-9
 *     PT154
 *     EUC-JP
 *     GB18030
 *     KOI8-R
 *     KOI8-T
 *     KOI8-U
 *     UTF-8
 *
 * All of these are ASCII supersets.  EBCDIC code pages like CP1122 are
 * presently handled by returning (C), even if the character set does
 * include the copyright symbol.
 *
 * To simplify the implementation, we allow some slop in the matching,
 * as long as the result is valid for any actual encoding names.
 *
 * If NLS support is disabled, or if the character set does not
 * include the copyright symbol, then the string (C) is returned
 * in the C execution character set.
 */
const char *copyright_symbol(const char *charset)
{
	/* Prefix lookup table and symbol strings, held in one static object. */
	struct copysym_data {
		char tab[15][PREFIXLEN+1];
		char codes[24];
	};

	/*
	 * The table below is searched with bsearch, so it must be in
	 * lexicographic order in the C execution character encoding.
	 * Check the relative order of each pair of letters the table
	 * ordering relies upon.
	 */
#if 'B'>'E' || 'C'>'E' || 'E'>'G' || 'G'>'I' || 'I'>'K' || 'K'>'P' || 'P'>'U'
#  error this character encoding is unsupported, please report a bug.
#endif

	static const struct copysym_data data = {
		/*
		 * For character sets that include the copyright symbol,
		 * the first 5 characters suffice to distinguish amongst
		 * all the different possible encodings.
		 *
		 * The final byte of each entry indicates the corresponding
		 * offset into the codes array, except for CP112x and ISO-8859
		 * which use the values 0 and 1, respectively (handled below).
		 *
		 * Each literal fills its PREFIXLEN+1 byte entry exactly, so
		 * the literal's own terminating null byte is not stored.
		 */
		.tab =
		{
		"CP112\x00",
		"CP125\x05",
		"CP775\x09",
		"CP850\x0b",
		"CP856\x0b",
		"CP857\x0b",
		"CP869\x07",
		"CP922\x05",
		"EUC-J\x0f",
		"GB180\x13",
		"GEORG\x05",
		"ISO-8\x01",
		"KOI8-\x0d",
		"PT154\x05",
		"UTF-8\x04"
		},

		/*
		 * All known encodings of the copyright symbol, each one
		 * null terminated.  Offset 0 is the (C) fallback.  Offset 5
		 * is the second byte of the UTF-8 sequence at offset 4, so
		 * it doubles as the single-byte 0xa9 encoding.
		 */
		.codes =
		"(C)"          "\0"
		"\xc2\xa9"     "\0"
		"\x97"         "\0"
		"\xa8"         "\0"
		"\xb8"         "\0"
		"\xbf"         "\0"
		"\x8f\xa2\xed" "\0"
		"\x81\x30\x84\x38"
	};

	/* Offset into data.codes of the result; 0 selects the (C) fallback. */
	unsigned cindex = 0;
	const char *m;

	if (!charset || !(m = BSEARCH_ARRAY(charset, data.tab, compar_prefix)))
		goto no_conv;

	/*
	 * A match means charset has at least PREFIXLEN characters, so it
	 * is safe to step past the prefix and examine what follows.
	 */
	cindex = (unsigned char)m[PREFIXLEN];
	charset += PREFIXLEN;

	/*
	 * We now need to identify encodings that match one of the 5-character
	 * prefixes above but don't actually have the copyright symbol in their
	 * character set.  Specifically, these are:
	 *
	 *   CP1122 (does have it, but EBCDIC)
	 *   CP1124
	 *   CP1125
	 *   ISO-8859-10
	 *   ISO-8859-11
	 *   ISO-8859-2
	 *   ISO-8859-3
	 *   ISO-8859-4
	 *   ISO-8859-5
	 *   ISO-8859-6
	 */
	if (cindex == 0) {
		/* CP112x, only CP1129 has copyright symbol. */
		cindex = 5 * (*charset == '9');
	} else if (cindex == 1) {
		/*
		 * ISO-8859 special case.  Every remaining character is
		 * shifted into 'collect' as one 4-bit group, with '-'
		 * contributing zero.  The low 5 bits of the result are
		 * therefore the final digit plus the low bit of whatever
		 * precedes it: "-7" gives 0x07 and "-13" gives 0x13.
		 *
		 * The set bits in the 'accept' value indicate which of
		 * those 5-bit values have the copyright symbol, namely
		 * parts 1, 7, 8, 9, 13, 14 and 15 (bits 0x01, 0x07, 0x08,
		 * 0x09, 0x13, 0x14 and 0x15).  Bit 0 is clear, so a name
		 * with no usable part number falls back to (C) instead
		 * of producing a non-ASCII byte.
		 */
		dx_u32 accept  = 0x00380382;
		dx_u32 collect = 0;
		char c;

		while ((c = *charset++)) {
			collect <<= 4;

			if (c != '-')
				collect |= c - '0';
		}

		cindex = 5 * ((accept >> (collect & 0x1f)) & 1);
	}
no_conv:
	return (const char *)&data + offsetof(struct copysym_data, codes) + cindex;
}

#endif