git.draconx.ca Git - dxcommon.git/commitdiff
copysym: Integrate code offset into lookup table.
author Nick Bowler <nbowler@draconx.ca>
Wed, 21 Feb 2024 01:52:04 +0000 (20:52 -0500)
committer Nick Bowler <nbowler@draconx.ca>
Fri, 23 Feb 2024 01:31:59 +0000 (20:31 -0500)
Instead of shifting out a nibble from a 64-bit constant based on the
lookup result, we can just add a byte to each entry in the table with
the same value.  Since we no longer need to worry about fitting the
offsets into 4 bits we can also rearrange the code array a bit, for
an overall code size reduction.

src/copysym.c

index 3128c3e92d9da790ce77a995a502901f3fdf0562..ca23c44cf1734b8dbd12b409f82343ccd668810d 100644 (file)
 
 #include <stdlib.h>
 #include <string.h>
+#include <stddef.h>
 
 #if HAVE_INTTYPES_H
 #include <inttypes.h>
 typedef uint_least32_t dx_u32;
-typedef uint_least64_t dx_u64;
 #else
 #include <limits.h>
 #if UINT_MAX >= 0xffffffff
@@ -28,16 +28,18 @@ typedef unsigned dx_u32;
 #else
 typedef unsigned long dx_u32;
 #endif
-typedef unsigned long long dx_u64;
 #endif
 
-#define ARRAYSIZE(x) (sizeof (x) / sizeof (x)[0])
+#define BSEARCH_ARRAY(key, arr, cmp) \
+       bsearch(key, arr, sizeof (arr) / sizeof *(arr), sizeof *(arr), cmp)
 
-static int compar_5arr(const void *key, const void *elem_)
+enum { PREFIXLEN = 5 };
+
+static int compar_prefix(const void *key, const void *elem_)
 {
-       const char (*elem)[5] = (void *)elem_;
+       const char (*elem)[PREFIXLEN+1] = (void *)elem_;
 
-       return strncmp(key, *elem, sizeof *elem);
+       return strncmp(key, *elem, PREFIXLEN);
 }
 
 /*
@@ -92,14 +94,14 @@ const char *copyright_symbol(const char *charset)
 {
        /* All known encodings of the copyright symbol */
        static const char codes[] =
-               "\xc2\xa9"         "\0"
-               "\x97"             "\0"
-               "\xa8"             "\0"
-               "\xb8"             "\0"
-               "\xbf"             "\0"
-               "\x8f\xa2\xed"     "\0"
-               "\x81\x30\x84\x38" "\0"
-               "(C)";
+               "(C)"          "\0"
+               "\xc2\xa9"     "\0"
+               "\x97"         "\0"
+               "\xa8"         "\0"
+               "\xb8"         "\0"
+               "\xbf"         "\0"
+               "\x8f\xa2\xed" "\0"
+               "\x81\x30\x84\x38";
 
        /*
         * We need the list below to be in lexicographic order in
@@ -113,44 +115,38 @@ const char *copyright_symbol(const char *charset)
         * For character sets that include the copyright symbol,
         * the first 5 characters suffices to distinguish amongst
         * all the different possible encodings.
+        *
+        * The final byte of each entry indicates the corresponding
+        * offset into the codes array, except for CP112x and ISO-8859-x
+        * which use the special values 0 and 1, respectively (handled
+        * below).
         */
-       static const char t1[][5] = {
-               "CP112",
-               "CP125",
-               "CP775",
-               "CP850",
-               "CP856",
-               "CP857",
-               "CP869",
-               "CP922",
-               "EUC-J",
-               "GB180",
-               "GEORG",
-               "ISO-8",
-               "KOI8-",
-               "PT154",
-               "UTF-8"
+       static const char t1[][PREFIXLEN+1] = {
+               "CP112\x00",
+               "CP125\x05",
+               "CP775\x09",
+               "CP850\x0b",
+               "CP856\x0b",
+               "CP857\x0b",
+               "CP869\x07",
+               "CP922\x05",
+               "EUC-J\x0f",
+               "GB180\x13",
+               "GEORG\x05",
+               "ISO-8\x01",
+               "KOI8-\x0d",
+               "PT154\x05",
+               "UTF-8\x04"
        };
 
-       /*
-        * Each nibble in the results value contains the offset in the
-        * codes array for the corresponding index in t1, except that
-        * ISO-8859 matches the special value '2' (handled below).
-        */
-       dx_u64 results = 0x001921fb13777511ull;
-       const char (*m1)[sizeof *t1];
-       unsigned x, cindex;
-
-       if (!charset)
-               goto no_conv;
+       unsigned cindex = 0;
+       const char *m;
 
-       m1 = bsearch(charset, t1, ARRAYSIZE(t1), sizeof *t1, compar_5arr);
-       if (!m1)
+       if (!charset || !(m = BSEARCH_ARRAY(charset, t1, compar_prefix)))
                goto no_conv;
-       charset += 5;
 
-       x = m1-t1;
-       cindex = (results >> (x << 2)) & 0xf;
+       cindex = m[PREFIXLEN];
+       charset += PREFIXLEN;
 
        /*
         * We now need to identify encodings that match one of the 5-character
@@ -168,10 +164,10 @@ const char *copyright_symbol(const char *charset)
         *   ISO-8859-5
         *   ISO-8859-6
         */
-       if ((x == 0) != (*charset == '9')) {
-               /* CP112x, x != '9', no copyright symbol. */
-               goto no_conv;
-       } else if (cindex == 2) {
+       if (cindex == 0) {
+               /* CP112x, only CP1129 has copyright symbol. */
+               cindex = 5 * (*charset == '9');
+       } else if (cindex == 1) {
                /*
                 * ISO-8859 special case.  Simply find and look at the final
                 * two digits.  The set bits in the 'accept' value indicate
@@ -188,14 +184,10 @@ const char *copyright_symbol(const char *charset)
                                collect |= c - '0';
                }
 
-               cindex = (accept >> (collect & 0x1f)) & 1;
-               if (!cindex)
-                       goto no_conv;
+               cindex = 5 * ((accept >> (collect & 0x1f)) & 1);
        }
-
-       return &codes[cindex];
 no_conv:
-       return &codes[20];
+       return codes+cindex;
 }
 
 #endif