#!/bin/awk -f # # Copyright © 2023-2024 Nick Bowler # # Hackjob to improve the horrible yytname array generated by Bison. # # Bison generates a list of symbol names as a static array of char pointers # initialized with string literals, which is simply horrible. These pointers # are two to four times larger than necessary, but with position-independent # code this also forces them into an unshareable, writeable segment since # they are not compile-time constants and must be initialized by the dynamic # loader at runtime. # # Furthermore, the names of nonterminal symbols are always included but they # are not always needed; they should only be output by tracing code which # is disabled by default at compile time. # # This script replaces the definition of the yytname array with a function # that implements the same lookup using truly constant tables, and makes the # inclusion of nonterminal symbols in these tables conditional on YYDEBUG. # # License WTFPL2: Do What The Fuck You Want To Public License, version 2. # This is free software: you are free to do what the fuck you want to. # There is NO WARRANTY, to the extent permitted by law. BEGIN { in_table = 0; num_tokens = 0; num_nterms = 0; } # Locate YYNTOKENS definition, needed to distinguish terminal symbols # from nonterminals. $1 == "#define" && $2 == "YYNTOKENS" { yyntokens = $3; } # Locate the yytname array definition to replace it. $0 ~ /static.*yytname *\[/ { in_table = 1; } # If we are not in the yytname definition, look for direct references to the # array and replace them with a function call. !in_table { left = "" right = $0; while ((m = match(right, /yytname *\[/))) { left = substr(right, 1, m-1) "tname("; right = substr(right, m+RLENGTH); depth = 1; n = length(right); for (i = 1; i <= n; i++) { c = substr(right, i, 1); if (c == "]" && --depth == 0) break; if (c == "[") depth++; } left = left substr(right, 1, i-1) ")"; right = substr(right, i+1); } print left right; } # If we are in the yytname definition, collect all the string literals in the # array initializer. The first yyntokens strings are terminal symbols, the # rest are nonterminals. in_table { gsub(/\\\\/, "\2\2"); gsub(/\\"/, "\1"); while ($1 ~ /^"/) { sub(/^[^"]*"/, ""); x = index($0, "\""); s = substr($0, 1, x - 1); $0 = substr($0, x + 2); gsub("\1", "\\\"", s); # Bison will remove quotes from the token strings at runtime unless the # token contains an unescaped backslash, an apostrophe, or a comma. It is # silly store all these quote characters if they aren't going to be used. if (s ~ /^\\"[^\'\\,]*\\"$/) s = substr(s, 3, length(s) - 4); if (num_tokens < yyntokens) tokens[num_tokens++] = s; else nterms[num_nterms++] = s; } } # At the end of the yytname definition, output our replacement function. in_table && $0 ~ /^};/ { print "#if !defined(UINT_LEAST8_MAX) || !defined(UINT_LEAST16_MAX)"; print "# include "; print "#endif"; print "#ifndef assert"; print "# include "; print "#endif"; print "#ifndef offsetof"; print "# include "; print "#endif\n" print "static const char *tname(unsigned x)"; print "{"; count = bucketsort(sorted_tokens, tokens); token_table = build_strtab(sorted_tokens, count, token_offsets); token_size = TLEN; offset_max = TMAX; if ((count = bucketsort(sorted_nterms, nterms))) { nterm_table = build_strtab(sorted_nterms, count, nterm_offsets); offset_max = token_size + TMAX; nterm_size = TLEN; } else { nterm_table = ""; nterm_size = 0; } print "\tstruct tname_data {"; print "#if YYDEBUG"; print "\t\tuint_least16_t map[" num_tokens + num_nterms "];"; print "\t\tchar tab[" token_size + nterm_size "];"; print "#else" print "\t\tuint_least16_t map[" num_tokens "];"; print "\t\tchar tab[" token_size "];"; print "#endif"; print "\t};"; print "\n\tenum { O = offsetof(struct tname_data, tab) };"; print "\tstatic const struct tname_data data = {"; print "\t\t{"; print_offsets(token_offsets, tokens, num_tokens, log10(offset_max)); if (num_nterms) { print "#if YYDEBUG"; print_offsets(nterm_offsets, nterms, num_nterms, log10(offset_max), token_size); print "#endif"; } sub("\1$", "\"", token_table); gsub("\1", "\"\n\t\t\"\\0\" \"", token_table); gsub("\2", "\\", token_table); print "\t\t}, \"" token_table; if (num_nterms) { print "#if YYDEBUG"; sub("\1$", "\"", nterm_table); gsub("\1", "\"\n\t\t\"\\0\" \"", nterm_table); gsub("\2", "\\", nterm_table); print "\t\t\"\\0\" \"" nterm_table; print "#endif"; } print "\t};"; print "\n\tassert(x < sizeof data.map / sizeof data.map[0]);" print "\treturn (char *)&data + data.map[x];"; print "}"; in_table = 0; } function log10(x) { if (x < 10) return 1; if (x < 100) return 2; if (x < 1000) return 3; if (x < 10000) return 4; return 5; } # build_strtab(strings, count, offsets) # # Generate a string table. Each token in the strings array (indexed from 0 # through count-1) is appended to a string, with a \1 byte terminating each # item. # # The table is suffix-compressed: if the token exists as a suffix of one that # is already in the table, no new entry is needed. The input strings array # must be sorted from longest to shortest in order to find all possible suffix # matches. # # The offsets array is populated with the actual offsets (for C code), # indexed by each token, and the resulting table is returned. # # The caller must transform the \1 characters to NUL terminators in the C code. function build_strtab(in_strings, in_count, out_offsets, i, s, ret) { ret = ""; for (i = TLEN = 0; i < in_count; i++) { s = in_strings[i]; if ((n = index(ret, s "\1")) > 0) { out_offsets[s] = real_length(substr(ret, 1, n-1)); } else { ret = ret s "\1"; out_offsets[s] = TMAX = TLEN; TLEN += real_length(s) + 1; } } return ret; } # print_offsets(offsets, strings, count, w, adj) # # Outputs the initializer portion of the tname offset array. # # Each token in the strings array (indexed from 0 through count-1) is used to # index the offsets array, and the resulting value is printed, with commas # between them. # # The w value specifies the minimum field width for the printed offsets, used # to achieve a visually pleasing alignment of output. # # If adj is nonzero, it is added to all the printed offset values, and a comma # is printed before the first line. function print_offsets(in_offsets, in_strings, in_count, in_w, in_adj, i, t, s) { t = in_adj ? "\t,\t" : "\t\t"; s = ""; for (i = 0; i < in_count; i++) { s = s sprintf("%" (in_w+1) "d+O,", (in_offsets[in_strings[i]] + in_adj)); if (i+1 == in_count) sub(/,$/, "", s); if ((i+1) % 8 == 0) { print t s; t = "\t\t"; s = ""; } } if (i%8 != 0) print t s; } # bucketsort(dst, src) # # Sort the elements of src by descending string length, # placing them into dst[0] ... dst[n]. # # Returns the number of elements. function bucketsort(dst, src, buckets, max, count, i, t) { for (t in src) { i = length(src[t]) if (i > max) { max = i } buckets[i]++ } for (i = max; i > 0; i--) { if (i in buckets) { t = buckets[i] buckets[i] = count count += t } } for (t in src) { i = length(t = src[t]) dst[buckets[i]++] = t } return count } # real_length(s) # # Calculate the length of s with backslash sequences counted as one character. function real_length(s, t) { t = length(s) return t - gsub(/\\./, "&", s) - gsub("\2\2", "&", s); }