copysym: Integrate code offset into lookup table.

author Nick Bowler <nbowler@draconx.ca>
Wed, 21 Feb 2024 01:52:04 +0000 (20:52 -0500)
committer Nick Bowler <nbowler@draconx.ca>
Fri, 23 Feb 2024 01:31:59 +0000 (20:31 -0500)
diff --git a/src/copysym.c b/src/copysym.c
index 3128c3e92d9da790ce77a995a502901f3fdf0562..ca23c44cf1734b8dbd12b409f82343ccd668810d 100644
--- a/src/copysym.c
+++ b/src/copysym.c
@@ -16,11 +16,11 @@
  
  #include <stdlib.h>
  #include <string.h>
+#include <stddef.h>
  
  #if HAVE_INTTYPES_H
  #include <inttypes.h>
  typedef uint_least32_t dx_u32;
-typedef uint_least64_t dx_u64;
  #else
  #include <limits.h>
  #if UINT_MAX >= 0xffffffff
@@ -28,16 +28,18 @@ typedef unsigned dx_u32;
  #else
  typedef unsigned long dx_u32;
  #endif
-typedef unsigned long long dx_u64;
  #endif
  
-#define ARRAYSIZE(x) (sizeof (x) / sizeof (x)[0])
+#define BSEARCH_ARRAY(key, arr, cmp) \
+       bsearch(key, arr, sizeof (arr) / sizeof *(arr), sizeof *(arr), cmp)
  
-static int compar_5arr(const void *key, const void *elem_)
+enum { PREFIXLEN = 5 };
+
+static int compar_prefix(const void *key, const void *elem_)
  {
-       const char (*elem)[5] = (void *)elem_;
+       const char (*elem)[PREFIXLEN+1] = (void *)elem_;
  
-       return strncmp(key, *elem, sizeof *elem);
+       return strncmp(key, *elem, PREFIXLEN);
  }
  
  /*
@@ -92,14 +94,14 @@ const char *copyright_symbol(const char *charset)
  {
         /* All known encodings of the copyright symbol */
         static const char codes[] =
-               "\xc2\xa9"         "\0"
-               "\x97"             "\0"
-               "\xa8"             "\0"
-               "\xb8"             "\0"
-               "\xbf"             "\0"
-               "\x8f\xa2\xed"     "\0"
-               "\x81\x30\x84\x38" "\0"
-               "(C)";
+               "(C)"          "\0"
+               "\xc2\xa9"     "\0"
+               "\x97"         "\0"
+               "\xa8"         "\0"
+               "\xb8"         "\0"
+               "\xbf"         "\0"
+               "\x8f\xa2\xed" "\0"
+               "\x81\x30\x84\x38";
  
         /*
          * We need the list below to be in lexicographic order in
@@ -113,44 +115,38 @@ const char *copyright_symbol(const char *charset)
          * For character sets that include the copyright symbol,
          * the first 5 characters suffices to distinguish amongst
          * all the different possible encodings.
+        *
+        * The final byte of each entry indicates the corresponding
+        * offset into the codes array, except for CP112x and ISO-8859-x
+        * which use the special values 0 and 1, respectively (handled
+        * below).
          */
-       static const char t1[][5] = {
-               "CP112",
-               "CP125",
-               "CP775",
-               "CP850",
-               "CP856",
-               "CP857",
-               "CP869",
-               "CP922",
-               "EUC-J",
-               "GB180",
-               "GEORG",
-               "ISO-8",
-               "KOI8-",
-               "PT154",
-               "UTF-8"
+       static const char t1[][PREFIXLEN+1] = {
+               "CP112\x00",
+               "CP125\x05",
+               "CP775\x09",
+               "CP850\x0b",
+               "CP856\x0b",
+               "CP857\x0b",
+               "CP869\x07",
+               "CP922\x05",
+               "EUC-J\x0f",
+               "GB180\x13",
+               "GEORG\x05",
+               "ISO-8\x01",
+               "KOI8-\x0d",
+               "PT154\x05",
+               "UTF-8\x04"
         };
  
-       /*
-        * Each nibble in the results value contains the offset in the
-        * codes array for the corresponding index in t1, except that
-        * ISO-8859 matches the special value '2' (handled below).
-        */
-       dx_u64 results = 0x001921fb13777511ull;
-       const char (*m1)[sizeof *t1];
-       unsigned x, cindex;
-
-       if (!charset)
-               goto no_conv;
+       unsigned cindex = 0;
+       const char *m;
  
-       m1 = bsearch(charset, t1, ARRAYSIZE(t1), sizeof *t1, compar_5arr);
-       if (!m1)
+       if (!charset || !(m = BSEARCH_ARRAY(charset, t1, compar_prefix)))
                 goto no_conv;
-       charset += 5;
  
-       x = m1-t1;
-       cindex = (results >> (x << 2)) & 0xf;
+       cindex = m[PREFIXLEN];
+       charset += PREFIXLEN;
  
         /*
          * We now need to identify encodings that match one of the 5-character
@@ -168,10 +164,10 @@ const char *copyright_symbol(const char *charset)
          *   ISO-8859-5
          *   ISO-8859-6
          */
-       if ((x == 0) != (*charset == '9')) {
-               /* CP112x, x != '9', no copyright symbol. */
-               goto no_conv;
-       } else if (cindex == 2) {
+       if (cindex == 0) {
+               /* CP112x, only CP1129 has copyright symbol. */
+               cindex = 5 * (*charset == '9');
+       } else if (cindex == 1) {
                 /*
                  * ISO-8859 special case.  Simply find and look at the final
                  * two digits.  The set bits in the 'accept' value indicate
@@ -188,14 +184,10 @@ const char *copyright_symbol(const char *charset)
                                 collect |= c - '0';
                 }
  
-               cindex = (accept >> (collect & 0x1f)) & 1;
-               if (!cindex)
-                       goto no_conv;
+               cindex = 5 * ((accept >> (collect & 0x1f)) & 1);
         }
-
-       return &codes[cindex];
  no_conv:
-       return &codes[20];
+       return codes+cindex;
  }
  
  #endif
author	Nick Bowler <nbowler@draconx.ca>
	Wed, 21 Feb 2024 01:52:04 +0000 (20:52 -0500)
committer	Nick Bowler <nbowler@draconx.ca>
	Fri, 23 Feb 2024 01:31:59 +0000 (20:31 -0500)