From af1f874c4c58d0bd8becd52c892d0b358d08736e Mon Sep 17 00:00:00 2001
From: Nick Bowler <nbowler@draconx.ca>
Date: Thu, 6 Jul 2023 22:46:20 -0400
Subject: [PATCH] libcdecl: Replace yytname array in the Bison parser.

Bison generates a list of symbol names as a static array of char pointers
initialized with string literals, which is simply horrible.  These pointers
are two to four times larger than necessary, but with position-independent
code this also forces them into an unshareable, writeable segment since
they are not compile-time constants and must be initialized by the dynamic
loader at runtime.

Furthermore, the names of nonterminal symbols are always included but they
are not always needed; they should only be output by tracing code which
is disabled by default at compile time.

Fix this by adding a new build script which postprocesses the output,
replacing the yytname array with a function implementing the same lookup
with truly constant tables.  Inclusion of nonterminals is now conditional
on YYDEBUG.

This is a big win, reducing the overall library size about 5 to 10 percent
(64-bit hosts see the most improvement).
---
 Makefile.am         |  17 +--
 src/fix-yytname.awk | 281 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 291 insertions(+), 7 deletions(-)
 create mode 100755 src/fix-yytname.awk

diff --git a/Makefile.am b/Makefile.am
index 031c0a8..b8136eb 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -261,19 +261,22 @@ DEV_TOOL_ERROR = { \
   echo "       *** configure, consired setting $$toolvar and re-running configure."; \
   echo "       *** See config.log for more details."; } >&2; false
 
+DO_BISON = $(BISON) $(BISON_COMPAT) $(BISONFLAGS)
+
 .y.c: ;
 .y.stamp:
 if !HAVE_BISON
 	$(BISON_V)tool=bison toolvar=BISON toolsrc=$<; $(DEV_TOOL_ERROR)
 endif
-	$(AM_V_at) touch $@.tmp
-	$(BISON_V) $(BISON) $(BISON_COMPAT) -o $*.c --defines=$*.h.tmp $(BISONFLAGS) $<
-	$(AM_V_at) if cmp $*.h.tmp $*.h >/dev/null 2>&1; then \
-		rm -f $*.h.tmp; \
-	else \
-		mv -f $*.h.tmp $*.h; \
-	fi
+	$(BISON_V) : >$@.tmp
+	$(AM_V_at) $(DO_BISON) -o $*.c.tmp --defines=$*.h.tmp $<
+	$(AM_V_at) $(AWK) -f $(srcdir)/src/fix-yytname.awk $*.c.tmp >$*.c.t2
+	$(AM_V_at) mv -f $*.c.t2 $*.c
+	$(AM_V_at) cmp $*.h.tmp $*.h >/dev/null 2>&1 || mv -f $*.h.tmp $*.h
+	$(AM_V_at) rm -f $*.c.tmp $*.h.tmp
 	$(AM_V_at) mv -f $@.tmp $@
+src/parse.stamp: $(srcdir)/src/fix-yytname.awk
+EXTRA_DIST += src/fix-yytname.awk
 
 .l.c: ;
 .l.stamp:
diff --git a/src/fix-yytname.awk b/src/fix-yytname.awk
new file mode 100755
index 0000000..8a417b3
--- /dev/null
+++ b/src/fix-yytname.awk
@@ -0,0 +1,281 @@
+#!/bin/awk -f
+#
+# Copyright Â© 2023 Nick Bowler
+#
+# Hackjob to improve the horrible yytname array generated by Bison.
+#
+# Bison generates a list of symbol names as a static array of char pointers
+# initialized with string literals, which is simply horrible.  These pointers
+# are two to four times larger than necessary, but with position-independent
+# code this also forces them into an unshareable, writeable segment since
+# they are not compile-time constants and must be initialized by the dynamic
+# loader at runtime.
+#
+# Furthermore, the names of nonterminal symbols are always included but they
+# are not always needed; they should only be output by tracing code which
+# is disabled by default at compile time.
+#
+# This script replaces the definition of the yytname array with a function
+# that implements the same lookup using truly constant tables, and makes the
+# inclusion of nonterminal symbols in these tables conditional on YYDEBUG.
+#
+# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
+# This is free software: you are free to do what the fuck you want to.
+# There is NO WARRANTY, to the extent permitted by law.
+
+BEGIN {
+  in_table   = 0;
+  num_tokens = 0;
+  num_nterms = 0;
+}
+
+# Locate YYNTOKENS definition, needed to distinguish terminal symbols
+# from nonterminals.
+$1 == "#define" && $2 == "YYNTOKENS" { yyntokens = $3; }
+
+# Locate the yytname array definition to replace it.
+$0 ~ /static.*yytname *\[/ { in_table = 1; }
+
+# If we are not in the yytname definition, look for direct references to the
+# array and replace them with a function call.
+!in_table {
+  left = ""
+  right = $0;
+
+  while ((m = match(right, /yytname *\[/))) {
+    left  = substr(right, 1, m-1) "tname(";
+    right = substr(right, m+RLENGTH);
+
+    depth = 1;
+    n = length(right);
+    for (i = 1; i <= n; i++) {
+      c = substr(right, i, 1);
+      if (c == "]" && --depth == 0)
+        break;
+      if (c == "[")
+        depth++;
+    }
+
+    left  = left substr(right, 1, i-1) ")";
+    right = substr(right, i+1);
+  }
+
+  print left right;
+}
+
+# If we are in the yytname definition, collect all the string literals in the
+# array initializer.  The first yyntokens strings are terminal symbols, the
+# rest are nonterminals.
+in_table {
+  gsub(/\\\\/, "\2\2");
+  gsub(/\\"/,  "\1");
+
+  while ($1 ~ /^"/) {
+    sub(/^[^"]*"/, "");
+
+    x  = index($0, "\"");
+    s  = substr($0, 1, x - 1);
+    $0 = substr($0, x + 2);
+
+    gsub("\1", "\\\"", s);
+
+    # Bison will remove quotes from the token strings at runtime unless the
+    # token contains an unescaped backslash, an apostrophe, or a comma.  It is
+    # silly store all these quote characters if they aren't going to be used.
+    if (s ~ /^\\"[^\'\\,]*\\"$/)
+      s = substr(s, 3, length(s) - 4);
+
+    if (num_tokens < yyntokens)
+      tokens[num_tokens++] = s;
+    else
+      nterms[num_nterms++] = s;
+  }
+}
+
+# At the end of the yytname definition, output our replacement function.
+in_table && $0 ~ /^};/ {
+  print "#if !defined(UINT_LEAST8_MAX) || !defined(UINT_LEAST16_MAX)";
+  print "#  include <stdint.h>";
+  print "#endif";
+  print "#ifndef assert";
+  print "#  include <assert.h>";
+  print "#endif\n";
+
+  print "static const char *tname(unsigned x)";
+  print "{";
+
+  count = bucketsort(sorted_tokens, tokens);
+  table = build_strtab(sorted_tokens, count, token_offsets);
+  token_offset_max = TMAX;
+  nterm_offset = TLEN;
+
+  sub("\1$", "", table);
+  gsub("\1", "\"\n\t\t\"\\0\" \"", table);
+  gsub("\2", "\\", table);
+
+  print "\tstatic const char tname_strings[] =";
+  print "\t\t     \"" table "\"";
+
+  if ((count = bucketsort(sorted_nterms, nterms))) {
+    table = build_strtab(sorted_nterms, count, nterm_offsets);
+    offset_max = nterm_offset + TMAX;
+    offset_chk = "(YYDEBUG ? " offset_max " : " token_offset_max ")";
+
+    sub("\1$", "", table);
+    gsub("\1", "\"\n\t\t\"\\0\" \"", table);
+    gsub("\2", "\\", table);
+
+    print "#if YYDEBUG";
+    if (nterm_offset)
+      print "\t\t\"\\0\" \"" table "\"";
+    else
+      print "\t\t     \"" table "\"";
+    print "#endif";
+  } else {
+    offset_chk = offset_max = token_offset_max;
+  }
+  print "\t\t;\n";
+
+  print "\tstatic const";
+  print "#if UINT_LEAST8_MAX >= " offset_chk;
+  print "\tuint_least8_t";
+  print "#elif UINT_LEAST16_MAX >= " offset_chk;
+  print "\tuint_least16_t";
+  print "#else";
+  print "\tuint_least32_t";
+  print "#endif";
+
+  print "\ttname_offsets[] = {";
+  print_offsets(token_offsets, tokens, num_tokens, log10(offset_max));
+  if (num_nterms) {
+    print "#if YYDEBUG";
+    print_offsets(nterm_offsets, nterms, num_nterms, log10(offset_max), nterm_offset);
+    print "#endif";
+  }
+  print "\t};\n";
+
+  print "\tassert(x < sizeof tname_offsets / sizeof tname_offsets[0]);";
+  print "\treturn tname_strings + tname_offsets[x];";
+  print "}";
+
+  in_table = 0;
+}
+
+function log10(x)
+{
+  if (x < 10)
+    return 1;
+  if (x < 100)
+    return 2;
+  if (x < 1000)
+    return 3;
+  if (x < 10000)
+    return 4;
+
+  return 5;
+}
+
+# build_strtab(strings, count, offsets)
+#
+# Generate a string table.  Each token in the strings array (indexed from 0
+# through count-1) is appended to a string, with a \1 byte terminating each
+# item.
+#
+# The table is suffix-compressed: if the token exists as a suffix of one that
+# is already in the table, no new entry is needed.  The input strings array
+# must be sorted from longest to shortest in order to find all possible suffix
+# matches.
+#
+# The offsets array is populated with the actual offsets (for C code),
+# indexed by each token, and the resulting table is returned.
+#
+# The caller must transform the \1 characters to NUL terminators in the C code.
+function build_strtab(in_strings, in_count, out_offsets, i, s, ret)
+{
+  ret = "";
+
+  for (i = TLEN = 0; i < in_count; i++) {
+    s = in_strings[i];
+    if ((n = index(ret, s "\1")) > 0) {
+      out_offsets[s] = real_length(substr(ret, 1, n-1));
+    } else {
+      ret = ret s "\1";
+      out_offsets[s] = TMAX = TLEN;
+      TLEN += real_length(s) + 1;
+    }
+  }
+
+  return ret;
+}
+
+# print_offsets(offsets, strings, count, w, adj)
+#
+# Outputs the initializer portion of the tname offset array.
+#
+# Each token in the strings array (indexed from 0 through count-1) is used to
+# index the offsets array, and the resulting value is printed, with commas
+# between them.
+#
+# The w value specifies the minimum field width for the printed offsets, used
+# to achieve a visually pleasing alignment of output.
+#
+# If adj is nonzero, it is added to all the printed offset values, and a comma
+# is printed before the first line.
+function print_offsets(in_offsets, in_strings, in_count, in_w, in_adj, i, t, s)
+{
+  t = in_adj ? "\t,\t" : "\t\t";
+  s = "";
+
+  for (i = 0; i < in_count; i++) {
+    s = s sprintf("%" (in_w+1) "d,", (in_offsets[in_strings[i]] + in_adj));
+    if (i+1 == in_count)
+      sub(/,$/, "", s);
+
+    if ((i+1) % 8 == 0) {
+      print t s;
+      t = "\t\t";
+      s = "";
+    }
+  }
+
+  print t s;
+}
+
+# bucketsort(dst, src)
+#
+# Sort the elements of src by descending string length,
+# placing them into dst[0] ... dst[n].
+#
+# Returns the number of elements.
+function bucketsort(dst, src, buckets, max, count, i, t)
+{
+  for (t in src) {
+    i = length(src[t])
+    if (i > max) { max = i }
+    buckets[i]++
+  }
+
+  for (i = max; i > 0; i--) {
+    if (i in buckets) {
+      t = buckets[i]
+      buckets[i] = count
+      count += t
+    }
+  }
+
+  for (t in src) {
+    i = length(t = src[t])
+    dst[buckets[i]++] = t
+  }
+
+  return count
+}
+
+# real_length(s)
+#
+# Calculate the length of s with backslash sequences counted as one character.
+function real_length(s, t)
+{
+  t = length(s)
+  return t - gsub(/\\./, "&", s) - gsub("\2\2", "&", s);
+}
-- 
2.43.2