From d370ab57b97f352ace48992a01dc57ad84d57dd0 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Tue, 20 Feb 2024 20:52:04 -0500 Subject: [PATCH] copysym: Integrate code offset into lookup table. Instead of shifting out a nibble from a 64-bit constant based on the lookup result, we can just add a byte to each entry in the table with the same value. Since we no longer need to worry about fitting the offsets into 4 bits we can also rearrange the code array a bit, for an overall code size reduction. --- src/copysym.c | 104 +++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 56 deletions(-) diff --git a/src/copysym.c b/src/copysym.c index 3128c3e..ca23c44 100644 --- a/src/copysym.c +++ b/src/copysym.c @@ -16,11 +16,11 @@ #include #include +#include #if HAVE_INTTYPES_H #include typedef uint_least32_t dx_u32; -typedef uint_least64_t dx_u64; #else #include #if UINT_MAX >= 0xffffffff @@ -28,16 +28,18 @@ typedef unsigned dx_u32; #else typedef unsigned long dx_u32; #endif -typedef unsigned long long dx_u64; #endif -#define ARRAYSIZE(x) (sizeof (x) / sizeof (x)[0]) +#define BSEARCH_ARRAY(key, arr, cmp) \ + bsearch(key, arr, sizeof (arr) / sizeof *(arr), sizeof *(arr), cmp) -static int compar_5arr(const void *key, const void *elem_) +enum { PREFIXLEN = 5 }; + +static int compar_prefix(const void *key, const void *elem_) { - const char (*elem)[5] = (void *)elem_; + const char (*elem)[PREFIXLEN+1] = (void *)elem_; - return strncmp(key, *elem, sizeof *elem); + return strncmp(key, *elem, PREFIXLEN); } /* @@ -92,14 +94,14 @@ const char *copyright_symbol(const char *charset) { /* All known encodings of the copyright symbol */ static const char codes[] = - "\xc2\xa9" "\0" - "\x97" "\0" - "\xa8" "\0" - "\xb8" "\0" - "\xbf" "\0" - "\x8f\xa2\xed" "\0" - "\x81\x30\x84\x38" "\0" - "(C)"; + "(C)" "\0" + "\xc2\xa9" "\0" + "\x97" "\0" + "\xa8" "\0" + "\xb8" "\0" + "\xbf" "\0" + "\x8f\xa2\xed" "\0" + "\x81\x30\x84\x38"; /* * We need the list below to be in lexicographic order in @@ -113,44 +115,38 @@ const char *copyright_symbol(const char *charset) * For character sets that include the copyright symbol, * the first 5 characters suffices to distinguish amongst * all the different possible encodings. + * + * The final byte of each entry indicates the corresponding + * offset into the codes array, except for CP112x and ISO-8859-x + * which use the special values 0 and 1, respectively (handled + * below). */ - static const char t1[][5] = { - "CP112", - "CP125", - "CP775", - "CP850", - "CP856", - "CP857", - "CP869", - "CP922", - "EUC-J", - "GB180", - "GEORG", - "ISO-8", - "KOI8-", - "PT154", - "UTF-8" + static const char t1[][PREFIXLEN+1] = { + "CP112\x00", + "CP125\x05", + "CP775\x09", + "CP850\x0b", + "CP856\x0b", + "CP857\x0b", + "CP869\x07", + "CP922\x05", + "EUC-J\x0f", + "GB180\x13", + "GEORG\x05", + "ISO-8\x01", + "KOI8-\x0d", + "PT154\x05", + "UTF-8\x04" }; - /* - * Each nibble in the results value contains the offset in the - * codes array for the corresponding index in t1, except that - * ISO-8859 matches the special value '2' (handled below). - */ - dx_u64 results = 0x001921fb13777511ull; - const char (*m1)[sizeof *t1]; - unsigned x, cindex; - - if (!charset) - goto no_conv; + unsigned cindex = 0; + const char *m; - m1 = bsearch(charset, t1, ARRAYSIZE(t1), sizeof *t1, compar_5arr); - if (!m1) + if (!charset || !(m = BSEARCH_ARRAY(charset, t1, compar_prefix))) goto no_conv; - charset += 5; - x = m1-t1; - cindex = (results >> (x << 2)) & 0xf; + cindex = m[PREFIXLEN]; + charset += PREFIXLEN; /* * We now need to identify encodings that match one of the 5-character @@ -168,10 +164,10 @@ const char *copyright_symbol(const char *charset) * ISO-8859-5 * ISO-8859-6 */ - if ((x == 0) != (*charset == '9')) { - /* CP112x, x != '9', no copyright symbol. */ - goto no_conv; - } else if (cindex == 2) { + if (cindex == 0) { + /* CP112x, only CP1129 has copyright symbol. */ + cindex = 5 * (*charset == '9'); + } else if (cindex == 1) { /* * ISO-8859 special case. Simply find and look at the final * two digits. The set bits in the 'accept' value indicate @@ -188,14 +184,10 @@ const char *copyright_symbol(const char *charset) collect |= c - '0'; } - cindex = (accept >> (collect & 0x1f)) & 1; - if (!cindex) - goto no_conv; + cindex = 5 * ((accept >> (collect & 0x1f)) & 1); } - - return &codes[cindex]; no_conv: - return &codes[20]; + return codes+cindex; } #endif -- 2.43.2