src/fix-yytname.awk

   1 #!/bin/awk -f
   2 #
   3 # Copyright © 2023-2024 Nick Bowler
   4 #
   5 # Hackjob to improve the horrible yytname array generated by Bison.
   6 #
   7 # Bison generates a list of symbol names as a static array of char pointers
   8 # initialized with string literals, which is simply horrible.  These pointers
   9 # are two to four times larger than necessary, but with position-independent
  10 # code this also forces them into an unshareable, writeable segment since
  11 # they are not compile-time constants and must be initialized by the dynamic
  12 # loader at runtime.
  13 #
  14 # Furthermore, the names of nonterminal symbols are always included but they
  15 # are not always needed; they should only be output by tracing code which
  16 # is disabled by default at compile time.
  17 #
  18 # This script replaces the definition of the yytname array with a function
  19 # that implements the same lookup using truly constant tables, and makes the
  20 # inclusion of nonterminal symbols in these tables conditional on YYDEBUG.
  21 #
  22 # License WTFPL2: Do What The Fuck You Want To Public License, version 2.
  23 # This is free software: you are free to do what the fuck you want to.
  24 # There is NO WARRANTY, to the extent permitted by law.
  25
  26 BEGIN {
  27   in_table   = 0;
  28   num_tokens = 0;
  29   num_nterms = 0;
  30 }
  31
  32 # Locate YYNTOKENS definition, needed to distinguish terminal symbols
  33 # from nonterminals.
  34 $1 == "#define" && $2 == "YYNTOKENS" { yyntokens = $3; }
  35
  36 # Locate the yytname array definition to replace it.
  37 $0 ~ /static.*yytname *\[/ { in_table = 1; }
  38
  39 # If we are not in the yytname definition, look for direct references to the
  40 # array and replace them with a function call.
  41 !in_table {
  42   left = ""
  43   right = $0;
  44
  45   while ((m = match(right, /yytname *\[/))) {
  46     left  = substr(right, 1, m-1) "tname(";
  47     right = substr(right, m+RLENGTH);
  48
  49     depth = 1;
  50     n = length(right);
  51     for (i = 1; i <= n; i++) {
  52       c = substr(right, i, 1);
  53       if (c == "]" && --depth == 0)
  54         break;
  55       if (c == "[")
  56         depth++;
  57     }
  58
  59     left  = left substr(right, 1, i-1) ")";
  60     right = substr(right, i+1);
  61   }
  62
  63   print left right;
  64 }
  65
  66 # If we are in the yytname definition, collect all the string literals in the
  67 # array initializer.  The first yyntokens strings are terminal symbols, the
  68 # rest are nonterminals.
  69 in_table {
  70   gsub(/\\\\/, "\2\2");
  71   gsub(/\\"/,  "\1");
  72
  73   while ($1 ~ /^"/) {
  74     sub(/^[^"]*"/, "");
  75
  76     x  = index($0, "\"");
  77     s  = substr($0, 1, x - 1);
  78     $0 = substr($0, x + 2);
  79
  80     gsub("\1", "\\\"", s);
  81
  82     # Bison will remove quotes from the token strings at runtime unless the
  83     # token contains an unescaped backslash, an apostrophe, or a comma.  It is
  84     # silly store all these quote characters if they aren't going to be used.
  85     if (s ~ /^\\"[^\'\\,]*\\"$/)
  86       s = substr(s, 3, length(s) - 4);
  87
  88     if (num_tokens < yyntokens)
  89       tokens[num_tokens++] = s;
  90     else
  91       nterms[num_nterms++] = s;
  92   }
  93 }
  94
  95 # At the end of the yytname definition, output our replacement function.
  96 in_table && $0 ~ /^};/ {
  97   print "#if !defined(UINT_LEAST8_MAX) || !defined(UINT_LEAST16_MAX)";
  98   print "#  include <inttypes.h>";
  99   print "#endif";
 100   print "#ifndef assert";
 101   print "#  include <assert.h>";
 102   print "#endif";
 103   print "#ifndef offsetof";
 104   print "#  include <stddef.h>";
 105   print "#endif\n"
 106
 107   print "static const char *tname(unsigned x)";
 108   print "{";
 109
 110   count = bucketsort(sorted_tokens, tokens);
 111   token_table = build_strtab(sorted_tokens, count, token_offsets);
 112   token_size = TLEN;
 113   offset_max = TMAX;
 114
 115   if ((count = bucketsort(sorted_nterms, nterms))) {
 116     nterm_table = build_strtab(sorted_nterms, count, nterm_offsets);
 117     offset_max = token_size + TMAX;
 118     nterm_size = TLEN;
 119   } else {
 120     nterm_table = "";
 121     nterm_size = 0;
 122   }
 123
 124   print "\tstruct tname_data {";
 125   print "#if YYDEBUG";
 126   print "\t\tuint_least16_t map[" num_tokens + num_nterms "];";
 127   print "\t\tchar tab[" token_size + nterm_size "];";
 128   print "#else"
 129   print "\t\tuint_least16_t map[" num_tokens "];";
 130   print "\t\tchar tab[" token_size "];";
 131   print "#endif";
 132   print "\t};";
 133
 134   print "\n\tenum { O = offsetof(struct tname_data, tab) };";
 135   print "\tstatic const struct tname_data data = {";
 136   print "\t\t{";
 137   print_offsets(token_offsets, tokens, num_tokens, log10(offset_max));
 138   if (num_nterms) {
 139     print "#if YYDEBUG";
 140     print_offsets(nterm_offsets, nterms, num_nterms, log10(offset_max), token_size);
 141     print "#endif";
 142   }
 143
 144   sub("\1$", "\"", token_table);
 145   gsub("\1", "\"\n\t\t\"\\0\" \"", token_table);
 146   gsub("\2", "\\", token_table);
 147   print "\t\t},   \"" token_table;
 148
 149   if (num_nterms) {
 150     print "#if YYDEBUG";
 151     sub("\1$", "\"", nterm_table);
 152     gsub("\1", "\"\n\t\t\"\\0\" \"", nterm_table);
 153     gsub("\2", "\\", nterm_table);
 154     print "\t\t\"\\0\" \"" nterm_table;
 155     print "#endif";
 156   }
 157
 158   print "\t};";
 159
 160   print "\n\tassert(x < sizeof data.map / sizeof data.map[0]);"
 161   print "\treturn (char *)&data + data.map[x];";
 162   print "}";
 163
 164   in_table = 0;
 165 }
 166
 167 function log10(x)
 168 {
 169   if (x < 10)
 170     return 1;
 171   if (x < 100)
 172     return 2;
 173   if (x < 1000)
 174     return 3;
 175   if (x < 10000)
 176     return 4;
 177
 178   return 5;
 179 }
 180
 181 # build_strtab(strings, count, offsets)
 182 #
 183 # Generate a string table.  Each token in the strings array (indexed from 0
 184 # through count-1) is appended to a string, with a \1 byte terminating each
 185 # item.
 186 #
 187 # The table is suffix-compressed: if the token exists as a suffix of one that
 188 # is already in the table, no new entry is needed.  The input strings array
 189 # must be sorted from longest to shortest in order to find all possible suffix
 190 # matches.
 191 #
 192 # The offsets array is populated with the actual offsets (for C code),
 193 # indexed by each token, and the resulting table is returned.
 194 #
 195 # The caller must transform the \1 characters to NUL terminators in the C code.
 196 function build_strtab(in_strings, in_count, out_offsets, i, s, ret)
 197 {
 198   ret = "";
 199
 200   for (i = TLEN = 0; i < in_count; i++) {
 201     s = in_strings[i];
 202     if ((n = index(ret, s "\1")) > 0) {
 203       out_offsets[s] = real_length(substr(ret, 1, n-1));
 204     } else {
 205       ret = ret s "\1";
 206       out_offsets[s] = TMAX = TLEN;
 207       TLEN += real_length(s) + 1;
 208     }
 209   }
 210
 211   return ret;
 212 }
 213
 214 # print_offsets(offsets, strings, count, w, adj)
 215 #
 216 # Outputs the initializer portion of the tname offset array.
 217 #
 218 # Each token in the strings array (indexed from 0 through count-1) is used to
 219 # index the offsets array, and the resulting value is printed, with commas
 220 # between them.
 221 #
 222 # The w value specifies the minimum field width for the printed offsets, used
 223 # to achieve a visually pleasing alignment of output.
 224 #
 225 # If adj is nonzero, it is added to all the printed offset values, and a comma
 226 # is printed before the first line.
 227 function print_offsets(in_offsets, in_strings, in_count, in_w, in_adj, i, t, s)
 228 {
 229   t = in_adj ? "\t,\t" : "\t\t";
 230   s = "";
 231
 232   for (i = 0; i < in_count; i++) {
 233     s = s sprintf("%" (in_w+1) "d+O,", (in_offsets[in_strings[i]] + in_adj));
 234     if (i+1 == in_count)
 235       sub(/,$/, "", s);
 236
 237     if ((i+1) % 8 == 0) {
 238       print t s;
 239       t = "\t\t";
 240       s = "";
 241     }
 242   }
 243
 244   if (i%8 != 0)
 245     print t s;
 246 }
 247
 248 # bucketsort(dst, src)
 249 #
 250 # Sort the elements of src by descending string length,
 251 # placing them into dst[0] ... dst[n].
 252 #
 253 # Returns the number of elements.
 254 function bucketsort(dst, src, buckets, max, count, i, t)
 255 {
 256   for (t in src) {
 257     i = length(src[t])
 258     if (i > max) { max = i }
 259     buckets[i]++
 260   }
 261
 262   for (i = max; i > 0; i--) {
 263     if (i in buckets) {
 264       t = buckets[i]
 265       buckets[i] = count
 266       count += t
 267     }
 268   }
 269
 270   for (t in src) {
 271     i = length(t = src[t])
 272     dst[buckets[i]++] = t
 273   }
 274
 275   return count
 276 }
 277
 278 # real_length(s)
 279 #
 280 # Calculate the length of s with backslash sequences counted as one character.
 281 function real_length(s, t)
 282 {
 283   t = length(s)
 284   return t - gsub(/\\./, "&", s) - gsub("\2\2", "&", s);
 285 }