3 # Copyright © 2023 Nick Bowler
5 # Hackjob to improve the horrible yytname array generated by Bison.
7 # Bison generates a list of symbol names as a static array of char pointers
8 # initialized with string literals, which is simply horrible. These pointers
9 # are two to four times larger than necessary, but with position-independent
10 # code this also forces them into an unshareable, writeable segment since
11 # they are not compile-time constants and must be initialized by the dynamic
14 # Furthermore, the names of nonterminal symbols are always included but they
15 # are not always needed; they should only be output by tracing code which
16 # is disabled by default at compile time.
18 # This script replaces the definition of the yytname array with a function
19 # that implements the same lookup using truly constant tables, and makes the
20 # inclusion of nonterminal symbols in these tables conditional on YYDEBUG.
22 # License WTFPL2: Do What The Fuck You Want To Public License, version 2.
23 # This is free software: you are free to do what the fuck you want to.
24 # There is NO WARRANTY, to the extent permitted by law.
32 # Locate YYNTOKENS definition, needed to distinguish terminal symbols
34 $1 == "#define" && $2 == "YYNTOKENS" { yyntokens = $3; }
36 # Locate the yytname array definition to replace it.
37 $0 ~ /static.*yytname *\[/ { in_table = 1; }
39 # If we are not in the yytname definition, look for direct references to the
40 # array and replace them with a function call.
45 while ((m = match(right, /yytname *\[/))) {
46 left = substr(right, 1, m-1) "tname(";
47 right = substr(right, m+RLENGTH);
51 for (i = 1; i <= n; i++) {
52 c = substr(right, i, 1);
53 if (c == "]" && --depth == 0)
59 left = left substr(right, 1, i-1) ")";
60 right = substr(right, i+1);
66 # If we are in the yytname definition, collect all the string literals in the
67 # array initializer. The first yyntokens strings are terminal symbols, the
68 # rest are nonterminals.
77 s = substr($0, 1, x - 1);
78 $0 = substr($0, x + 2);
80 gsub("\1", "\\\"", s);
82 # Bison will remove quotes from the token strings at runtime unless the
83 # token contains an unescaped backslash, an apostrophe, or a comma. It is
84 # silly store all these quote characters if they aren't going to be used.
85 if (s ~ /^\\"[^\'\\,]*\\"$/)
86 s = substr(s, 3, length(s) - 4);
88 if (num_tokens < yyntokens)
89 tokens[num_tokens++] = s;
91 nterms[num_nterms++] = s;
95 # At the end of the yytname definition, output our replacement function.
96 in_table && $0 ~ /^};/ {
97 print "#if !defined(UINT_LEAST8_MAX) || !defined(UINT_LEAST16_MAX)";
98 print "# include <stdint.h>";
100 print "#ifndef assert";
101 print "# include <assert.h>";
104 print "static const char *tname(unsigned x)";
107 count = bucketsort(sorted_tokens, tokens);
108 table = build_strtab(sorted_tokens, count, token_offsets);
109 token_offset_max = TMAX;
112 sub("\1$", "", table);
113 gsub("\1", "\"\n\t\t\"\\0\" \"", table);
114 gsub("\2", "\\", table);
116 print "\tstatic const char tname_strings[] =";
117 print "\t\t \"" table "\"";
119 if ((count = bucketsort(sorted_nterms, nterms))) {
120 table = build_strtab(sorted_nterms, count, nterm_offsets);
121 offset_max = nterm_offset + TMAX;
122 offset_chk = "(YYDEBUG ? " offset_max " : " token_offset_max ")";
124 sub("\1$", "", table);
125 gsub("\1", "\"\n\t\t\"\\0\" \"", table);
126 gsub("\2", "\\", table);
130 print "\t\t\"\\0\" \"" table "\"";
132 print "\t\t \"" table "\"";
135 offset_chk = offset_max = token_offset_max;
139 print "\tstatic const";
140 print "#if UINT_LEAST8_MAX >= " offset_chk;
141 print "\tuint_least8_t";
142 print "#elif UINT_LEAST16_MAX >= " offset_chk;
143 print "\tuint_least16_t";
145 print "\tuint_least32_t";
148 print "\ttname_offsets[] = {";
149 print_offsets(token_offsets, tokens, num_tokens, log10(offset_max));
152 print_offsets(nterm_offsets, nterms, num_nterms, log10(offset_max), nterm_offset);
157 print "\tassert(x < sizeof tname_offsets / sizeof tname_offsets[0]);";
158 print "\treturn tname_strings + tname_offsets[x];";
178 # build_strtab(strings, count, offsets)
180 # Generate a string table. Each token in the strings array (indexed from 0
181 # through count-1) is appended to a string, with a \1 byte terminating each
184 # The table is suffix-compressed: if the token exists as a suffix of one that
185 # is already in the table, no new entry is needed. The input strings array
186 # must be sorted from longest to shortest in order to find all possible suffix
189 # The offsets array is populated with the actual offsets (for C code),
190 # indexed by each token, and the resulting table is returned.
192 # The caller must transform the \1 characters to NUL terminators in the C code.
193 function build_strtab(in_strings, in_count, out_offsets, i, s, ret)
197 for (i = TLEN = 0; i < in_count; i++) {
199 if ((n = index(ret, s "\1")) > 0) {
200 out_offsets[s] = real_length(substr(ret, 1, n-1));
203 out_offsets[s] = TMAX = TLEN;
204 TLEN += real_length(s) + 1;
211 # print_offsets(offsets, strings, count, w, adj)
213 # Outputs the initializer portion of the tname offset array.
215 # Each token in the strings array (indexed from 0 through count-1) is used to
216 # index the offsets array, and the resulting value is printed, with commas
219 # The w value specifies the minimum field width for the printed offsets, used
220 # to achieve a visually pleasing alignment of output.
222 # If adj is nonzero, it is added to all the printed offset values, and a comma
223 # is printed before the first line.
224 function print_offsets(in_offsets, in_strings, in_count, in_w, in_adj, i, t, s)
226 t = in_adj ? "\t,\t" : "\t\t";
229 for (i = 0; i < in_count; i++) {
230 s = s sprintf("%" (in_w+1) "d,", (in_offsets[in_strings[i]] + in_adj));
234 if ((i+1) % 8 == 0) {
244 # bucketsort(dst, src)
246 # Sort the elements of src by descending string length,
247 # placing them into dst[0] ... dst[n].
249 # Returns the number of elements.
250 function bucketsort(dst, src, buckets, max, count, i, t)
254 if (i > max) { max = i }
258 for (i = max; i > 0; i--) {
267 i = length(t = src[t])
268 dst[buckets[i]++] = t
276 # Calculate the length of s with backslash sequences counted as one character.
277 function real_length(s, t)
280 return t - gsub(/\\./, "&", s) - gsub("\2\2", "&", s);