src/fix-yytname.awk

   1 #!/bin/awk -f
   2 #
   3 # Copyright © 2023 Nick Bowler
   4 #
   5 # Hackjob to improve the horrible yytname array generated by Bison.
   6 #
   7 # Bison generates a list of symbol names as a static array of char pointers
   8 # initialized with string literals, which is simply horrible.  These pointers
   9 # are two to four times larger than necessary, but with position-independent
  10 # code this also forces them into an unshareable, writeable segment since
  11 # they are not compile-time constants and must be initialized by the dynamic
  12 # loader at runtime.
  13 #
  14 # Furthermore, the names of nonterminal symbols are always included but they
  15 # are not always needed; they should only be output by tracing code which
  16 # is disabled by default at compile time.
  17 #
  18 # This script replaces the definition of the yytname array with a function
  19 # that implements the same lookup using truly constant tables, and makes the
  20 # inclusion of nonterminal symbols in these tables conditional on YYDEBUG.
  21 #
  22 # License WTFPL2: Do What The Fuck You Want To Public License, version 2.
  23 # This is free software: you are free to do what the fuck you want to.
  24 # There is NO WARRANTY, to the extent permitted by law.
  25
  26 BEGIN {
  27   in_table   = 0;
  28   num_tokens = 0;
  29   num_nterms = 0;
  30 }
  31
  32 # Locate YYNTOKENS definition, needed to distinguish terminal symbols
  33 # from nonterminals.
  34 $1 == "#define" && $2 == "YYNTOKENS" { yyntokens = $3; }
  35
  36 # Locate the yytname array definition to replace it.
  37 $0 ~ /static.*yytname *\[/ { in_table = 1; }
  38
  39 # If we are not in the yytname definition, look for direct references to the
  40 # array and replace them with a function call.
  41 !in_table {
  42   left = ""
  43   right = $0;
  44
  45   while ((m = match(right, /yytname *\[/))) {
  46     left  = substr(right, 1, m-1) "tname(";
  47     right = substr(right, m+RLENGTH);
  48
  49     depth = 1;
  50     n = length(right);
  51     for (i = 1; i <= n; i++) {
  52       c = substr(right, i, 1);
  53       if (c == "]" && --depth == 0)
  54         break;
  55       if (c == "[")
  56         depth++;
  57     }
  58
  59     left  = left substr(right, 1, i-1) ")";
  60     right = substr(right, i+1);
  61   }
  62
  63   print left right;
  64 }
  65
  66 # If we are in the yytname definition, collect all the string literals in the
  67 # array initializer.  The first yyntokens strings are terminal symbols, the
  68 # rest are nonterminals.
  69 in_table {
  70   gsub(/\\\\/, "\2\2");
  71   gsub(/\\"/,  "\1");
  72
  73   while ($1 ~ /^"/) {
  74     sub(/^[^"]*"/, "");
  75
  76     x  = index($0, "\"");
  77     s  = substr($0, 1, x - 1);
  78     $0 = substr($0, x + 2);
  79
  80     gsub("\1", "\\\"", s);
  81
  82     # Bison will remove quotes from the token strings at runtime unless the
  83     # token contains an unescaped backslash, an apostrophe, or a comma.  It is
  84     # silly store all these quote characters if they aren't going to be used.
  85     if (s ~ /^\\"[^\'\\,]*\\"$/)
  86       s = substr(s, 3, length(s) - 4);
  87
  88     if (num_tokens < yyntokens)
  89       tokens[num_tokens++] = s;
  90     else
  91       nterms[num_nterms++] = s;
  92   }
  93 }
  94
  95 # At the end of the yytname definition, output our replacement function.
  96 in_table && $0 ~ /^};/ {
  97   print "#if !defined(UINT_LEAST8_MAX) || !defined(UINT_LEAST16_MAX)";
  98   print "#  include <stdint.h>";
  99   print "#endif";
 100   print "#ifndef assert";
 101   print "#  include <assert.h>";
 102   print "#endif\n";
 103
 104   print "static const char *tname(unsigned x)";
 105   print "{";
 106
 107   count = bucketsort(sorted_tokens, tokens);
 108   table = build_strtab(sorted_tokens, count, token_offsets);
 109   token_offset_max = TMAX;
 110   nterm_offset = TLEN;
 111
 112   sub("\1$", "", table);
 113   gsub("\1", "\"\n\t\t\"\\0\" \"", table);
 114   gsub("\2", "\\", table);
 115
 116   print "\tstatic const char tname_strings[] =";
 117   print "\t\t     \"" table "\"";
 118
 119   if ((count = bucketsort(sorted_nterms, nterms))) {
 120     table = build_strtab(sorted_nterms, count, nterm_offsets);
 121     offset_max = nterm_offset + TMAX;
 122     offset_chk = "(YYDEBUG ? " offset_max " : " token_offset_max ")";
 123
 124     sub("\1$", "", table);
 125     gsub("\1", "\"\n\t\t\"\\0\" \"", table);
 126     gsub("\2", "\\", table);
 127
 128     print "#if YYDEBUG";
 129     if (nterm_offset)
 130       print "\t\t\"\\0\" \"" table "\"";
 131     else
 132       print "\t\t     \"" table "\"";
 133     print "#endif";
 134   } else {
 135     offset_chk = offset_max = token_offset_max;
 136   }
 137   print "\t\t;\n";
 138
 139   print "\tstatic const";
 140   print "#if UINT_LEAST8_MAX >= " offset_chk;
 141   print "\tuint_least8_t";
 142   print "#elif UINT_LEAST16_MAX >= " offset_chk;
 143   print "\tuint_least16_t";
 144   print "#else";
 145   print "\tuint_least32_t";
 146   print "#endif";
 147
 148   print "\ttname_offsets[] = {";
 149   print_offsets(token_offsets, tokens, num_tokens, log10(offset_max));
 150   if (num_nterms) {
 151     print "#if YYDEBUG";
 152     print_offsets(nterm_offsets, nterms, num_nterms, log10(offset_max), nterm_offset);
 153     print "#endif";
 154   }
 155   print "\t};\n";
 156
 157   print "\tassert(x < sizeof tname_offsets / sizeof tname_offsets[0]);";
 158   print "\treturn tname_strings + tname_offsets[x];";
 159   print "}";
 160
 161   in_table = 0;
 162 }
 163
 164 function log10(x)
 165 {
 166   if (x < 10)
 167     return 1;
 168   if (x < 100)
 169     return 2;
 170   if (x < 1000)
 171     return 3;
 172   if (x < 10000)
 173     return 4;
 174
 175   return 5;
 176 }
 177
 178 # build_strtab(strings, count, offsets)
 179 #
 180 # Generate a string table.  Each token in the strings array (indexed from 0
 181 # through count-1) is appended to a string, with a \1 byte terminating each
 182 # item.
 183 #
 184 # The table is suffix-compressed: if the token exists as a suffix of one that
 185 # is already in the table, no new entry is needed.  The input strings array
 186 # must be sorted from longest to shortest in order to find all possible suffix
 187 # matches.
 188 #
 189 # The offsets array is populated with the actual offsets (for C code),
 190 # indexed by each token, and the resulting table is returned.
 191 #
 192 # The caller must transform the \1 characters to NUL terminators in the C code.
 193 function build_strtab(in_strings, in_count, out_offsets, i, s, ret)
 194 {
 195   ret = "";
 196
 197   for (i = TLEN = 0; i < in_count; i++) {
 198     s = in_strings[i];
 199     if ((n = index(ret, s "\1")) > 0) {
 200       out_offsets[s] = real_length(substr(ret, 1, n-1));
 201     } else {
 202       ret = ret s "\1";
 203       out_offsets[s] = TMAX = TLEN;
 204       TLEN += real_length(s) + 1;
 205     }
 206   }
 207
 208   return ret;
 209 }
 210
 211 # print_offsets(offsets, strings, count, w, adj)
 212 #
 213 # Outputs the initializer portion of the tname offset array.
 214 #
 215 # Each token in the strings array (indexed from 0 through count-1) is used to
 216 # index the offsets array, and the resulting value is printed, with commas
 217 # between them.
 218 #
 219 # The w value specifies the minimum field width for the printed offsets, used
 220 # to achieve a visually pleasing alignment of output.
 221 #
 222 # If adj is nonzero, it is added to all the printed offset values, and a comma
 223 # is printed before the first line.
 224 function print_offsets(in_offsets, in_strings, in_count, in_w, in_adj, i, t, s)
 225 {
 226   t = in_adj ? "\t,\t" : "\t\t";
 227   s = "";
 228
 229   for (i = 0; i < in_count; i++) {
 230     s = s sprintf("%" (in_w+1) "d,", (in_offsets[in_strings[i]] + in_adj));
 231     if (i+1 == in_count)
 232       sub(/,$/, "", s);
 233
 234     if ((i+1) % 8 == 0) {
 235       print t s;
 236       t = "\t\t";
 237       s = "";
 238     }
 239   }
 240
 241   print t s;
 242 }
 243
 244 # bucketsort(dst, src)
 245 #
 246 # Sort the elements of src by descending string length,
 247 # placing them into dst[0] ... dst[n].
 248 #
 249 # Returns the number of elements.
 250 function bucketsort(dst, src, buckets, max, count, i, t)
 251 {
 252   for (t in src) {
 253     i = length(src[t])
 254     if (i > max) { max = i }
 255     buckets[i]++
 256   }
 257
 258   for (i = max; i > 0; i--) {
 259     if (i in buckets) {
 260       t = buckets[i]
 261       buckets[i] = count
 262       count += t
 263     }
 264   }
 265
 266   for (t in src) {
 267     i = length(t = src[t])
 268     dst[buckets[i]++] = t
 269   }
 270
 271   return count
 272 }
 273
 274 # real_length(s)
 275 #
 276 # Calculate the length of s with backslash sequences counted as one character.
 277 function real_length(s, t)
 278 {
 279   t = length(s)
 280   return t - gsub(/\\./, "&", s) - gsub("\2\2", "&", s);
 281 }