From 566e87bb15a98ed499e79b45f6c834ad0ffdc3b7 Mon Sep 17 00:00:00 2001
From: Nick Bowler <nbowler@draconx.ca>
Date: Thu, 6 Jul 2023 23:56:29 -0400
Subject: [PATCH] libcdecl: Re-use strings from parser in spec_string.

With the fixups applied by fix-yytname, we now have two distinct string
tables containing the same strings.  To avoid this duplication, add a
new internal function to allow spec_string to access the parser's token
name table.
---
 Makefile.am         |  6 +--
 src/gen-specstr.awk | 91 +++++++++------------------------------------
 src/output.c        |  1 +
 src/parse.y         | 16 ++++++++
 4 files changed, 38 insertions(+), 76 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index b8136eb..6d2ea65 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -100,11 +100,11 @@ t_rng_test_LDADD = $(TEST_LIBS)
 $(t_rng_test_OBJECTS): $(gnulib_headers)
 EXTRA_DIST += t/xos256p.c
 
+src/error.lo: src/errmsg.h
+src/output.lo: src/parse.h src/specstr.h
+src/parse-decl.lo: src/scan.h src/parse.h src/typemap.h src/errmsg.h
 src/parse.lo: src/scan.h src/errmsg.h
 src/scan.lo: src/parse.h src/errmsg.h
-src/parse-decl.lo: src/scan.h src/parse.h src/typemap.h src/errmsg.h
-src/output.lo: src/specstr.h
-src/error.lo: src/errmsg.h
 t/declgen.$(OBJEXT): t/typegen.h
 t/cdeclerr.$(OBJEXT): src/errmsg.h
 
diff --git a/src/gen-specstr.awk b/src/gen-specstr.awk
index 68401c1..c07856e 100755
--- a/src/gen-specstr.awk
+++ b/src/gen-specstr.awk
@@ -22,8 +22,7 @@ END {
 
 BEGIN {
   kinds["TYPE"] = kinds["STOR"] = kinds["QUAL"] = kinds["FUNC"] = 1;
-  underscore["BOOL"] = underscore["COMPLEX"] = underscore["IMAGINARY"] = 1;
-  count = 0;
+  count = maxwidth = 0;
 }
 
 # Locate all the relevant identifiers in cdecl.h.  We assume everything
@@ -48,40 +47,17 @@ $1 ~ /^CDECL_/ {
 
   if (parts[2] in kinds) {
     kind_counts[parts[2]]++;
+    specs[count++] = parts[3];
 
-    if (parts[3] == "IDENT") {
-      s = "";
-    } else if (parts[3] in underscore) {
-      s = "_" substr(parts[3], 1, 1) tolower(substr(parts[3], 2));
-    } else {
-      s = tolower(parts[3]);
-    }
-    rspecs[s] = count;
-    specs[count++] = s;
+    if (length(parts[3]) > maxwidth)
+      maxwidth = length(parts[3]);
   }
 }
 
 END {
-  string_table = "";
-
-  # The basic approach is to first generate a suffix-compressed string
-  # table containing all the specifier strings (not a lot of overlap in
-  # C specifiers, but there is (un)signed.
-  count = bucketsort(sorted_specs, specs);
-  for (i = 0; i < count; i++) {
-    s = sorted_specs[i];
-
-    if ((n = index(string_table, s "\1")) > 0) {
-      offsets[rspecs[s]] = n - 1;
-    } else {
-      offsets[rspecs[s]] = length(string_table);
-      string_table = string_table s "\1";
-    }
-  }
-
-  # Next, we create the index table.  The first 5 entries key off of bits 9
-  # through 11, which is sufficient to distinguish the different specifier
-  # kinds and is used to partition the rest of the index table.
+  # Create the token table.  The first 5 entries key off of bits 9 through 11,
+  # which is sufficient to distinguish the different specifier kinds and is
+  # used to partition the rest of the token table.
   skip_count = 0;
   for (i in skiptab) {
     if (skip_count < i)
@@ -95,59 +71,28 @@ END {
   }
   sub(/ $/, "\n\t\t", offset_table);
 
-  # Then, each remaining entry in the index table is an offset into the
-  # string table.
   for (i = 0; i < count; i++) {
-    suffix = "\t/* " (specs[i] ? specs[i] : "\"\"") " */";
+    suffix = "";
     if (i+1 < count)
-      suffix = "," suffix "\n\t\t";
-    offset_table = offset_table offsets[i] suffix;
-  }
+      suffix = ",\n\t\t";
 
-  sub(/\1$/, "", string_table);
-  gsub(/\1/, "\"\n\t\t\"\\0\" \"", string_table);
+    if (specs[i] == "IDENT")
+      s = "0";
+    else
+      s = "T_" substr(specs[i] "                ", 1, maxwidth) " - 256";
+    offset_table = offset_table s suffix;
+  }
 
   print "static const char *spec_string(unsigned type)"
   print "{"
-  print "\tstatic const char tab[] =";
-  print "\t\t     \"" string_table "\";\n";
   print "\tstatic const uint_least8_t idx[] = {";
   print "\t\t" offset_table;
   print "\t};\n";
 
   print "\tunsigned x = (type & 0xff) + idx[type >> 9];";
   print "\tassert(x < sizeof idx);";
-  print "\treturn tab + idx[x];";
+  print "\tif (!(x = idx[x]))";
+  print "\t\treturn \"\";";
+  print "\treturn cdecl__token_name(x + 256);";
   print "}";
 }
-
-# bucketsort(dst, src)
-#
-#
-# Sort the elements of src by descending string length,
-# placing them into dst[0] ... dst[n].
-#
-# Returns the number of elements.
-function bucketsort(dst, src, buckets, max, count, i, t)
-{
-  for (t in src) {
-    i = length(src[t]);
-    if (i > max) { max = i; }
-    buckets[i]++;
-  }
-
-  for (i = max; i >= 0; i--) {
-    if (i in buckets) {
-      t = buckets[i];
-      buckets[i] = count;
-      count += t;
-    }
-  }
-
-  for (t in src) {
-    i = length(t = src[t]);
-    dst[buckets[i]++] = t;
-  }
-
-  return count;
-}
diff --git a/src/output.c b/src/output.c
index 4590a5c..62c9c8f 100644
--- a/src/output.c
+++ b/src/output.c
@@ -22,6 +22,7 @@
 #include "cdecl.h"
 #include "cdecl-internal.h"
 
+#include "parse.h"
 #include "specstr.h"
 
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
diff --git a/src/parse.y b/src/parse.y
index 6faf2d2..263129c 100644
--- a/src/parse.y
+++ b/src/parse.y
@@ -62,6 +62,7 @@
 %code provides {
 void cdecl__free(struct cdecl *);
 int cdecl__yyparse(void *scanner, struct cdecl **out);
+const char *cdecl__token_name(unsigned token);
 }
 
 %union {
@@ -589,3 +590,18 @@ english_vla: T_IDENT | {
 	ALLOC($$, sizeof "");
 	strcpy($$, "");
 }
+
+%%
+
+/*
+ * Look up the parser's name string for a token number (a T_xxx value), so
+ * the rest of the library can emit strings that match parser keywords.
+ *
+ * In order for this to work properly, the Bison output must be postprocessed
+ * by fix-yytname.awk to remove pointless quotation marks from the keyword
+ * strings.
+ */
+const char *cdecl__token_name(unsigned token)
+{
+	return yytname[YYTRANSLATE(token)];
+}
-- 
2.43.2