From af1f874c4c58d0bd8becd52c892d0b358d08736e Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Thu, 6 Jul 2023 22:46:20 -0400 Subject: [PATCH] libcdecl: Replace yytname array in the Bison parser. Bison generates a list of symbol names as a static array of char pointers initialized with string literals, which is simply horrible. These pointers are two to four times larger than necessary, but with position-independent code this also forces them into an unshareable, writeable segment since they are not compile-time constants and must be initialized by the dynamic loader at runtime. Furthermore, the names of nonterminal symbols are always included but they are not always needed; they should only be output by tracing code which is disabled by default at compile time. Fix this by adding a new build script which postprocesses the output, replacing the yytname array with a function implementing the same lookup with truly constant tables. Inclusion of nonterminals is now conditional on YYDEBUG. This is a big win, reducing the overall library size about 5 to 10 percent (64-bit hosts see the most improvement). --- Makefile.am | 17 +-- src/fix-yytname.awk | 281 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 291 insertions(+), 7 deletions(-) create mode 100755 src/fix-yytname.awk diff --git a/Makefile.am b/Makefile.am index 031c0a8..b8136eb 100644 --- a/Makefile.am +++ b/Makefile.am @@ -261,19 +261,22 @@ DEV_TOOL_ERROR = { \ echo " *** configure, consired setting $$toolvar and re-running configure."; \ echo " *** See config.log for more details."; } >&2; false +DO_BISON = $(BISON) $(BISON_COMPAT) $(BISONFLAGS) + .y.c: ; .y.stamp: if !HAVE_BISON $(BISON_V)tool=bison toolvar=BISON toolsrc=$<; $(DEV_TOOL_ERROR) endif - $(AM_V_at) touch $@.tmp - $(BISON_V) $(BISON) $(BISON_COMPAT) -o $*.c --defines=$*.h.tmp $(BISONFLAGS) $< - $(AM_V_at) if cmp $*.h.tmp $*.h >/dev/null 2>&1; then \ - rm -f $*.h.tmp; \ - else \ - mv -f $*.h.tmp $*.h; \ - fi + $(BISON_V) : >$@.tmp + $(AM_V_at) $(DO_BISON) -o $*.c.tmp --defines=$*.h.tmp $< + $(AM_V_at) $(AWK) -f $(srcdir)/src/fix-yytname.awk $*.c.tmp >$*.c.t2 + $(AM_V_at) mv -f $*.c.t2 $*.c + $(AM_V_at) cmp $*.h.tmp $*.h >/dev/null 2>&1 || mv -f $*.h.tmp $*.h + $(AM_V_at) rm -f $*.c.tmp $*.h.tmp $(AM_V_at) mv -f $@.tmp $@ +src/parse.stamp: $(srcdir)/src/fix-yytname.awk +EXTRA_DIST += src/fix-yytname.awk .l.c: ; .l.stamp: diff --git a/src/fix-yytname.awk b/src/fix-yytname.awk new file mode 100755 index 0000000..8a417b3 --- /dev/null +++ b/src/fix-yytname.awk @@ -0,0 +1,281 @@ +#!/bin/awk -f +# +# Copyright © 2023 Nick Bowler +# +# Hackjob to improve the horrible yytname array generated by Bison. +# +# Bison generates a list of symbol names as a static array of char pointers +# initialized with string literals, which is simply horrible. These pointers +# are two to four times larger than necessary, but with position-independent +# code this also forces them into an unshareable, writeable segment since +# they are not compile-time constants and must be initialized by the dynamic +# loader at runtime. +# +# Furthermore, the names of nonterminal symbols are always included but they +# are not always needed; they should only be output by tracing code which +# is disabled by default at compile time. +# +# This script replaces the definition of the yytname array with a function +# that implements the same lookup using truly constant tables, and makes the +# inclusion of nonterminal symbols in these tables conditional on YYDEBUG. +# +# License WTFPL2: Do What The Fuck You Want To Public License, version 2. +# This is free software: you are free to do what the fuck you want to. +# There is NO WARRANTY, to the extent permitted by law. + +BEGIN { + in_table = 0; + num_tokens = 0; + num_nterms = 0; +} + +# Locate YYNTOKENS definition, needed to distinguish terminal symbols +# from nonterminals. +$1 == "#define" && $2 == "YYNTOKENS" { yyntokens = $3; } + +# Locate the yytname array definition to replace it. +$0 ~ /static.*yytname *\[/ { in_table = 1; } + +# If we are not in the yytname definition, look for direct references to the +# array and replace them with a function call. +!in_table { + left = "" + right = $0; + + while ((m = match(right, /yytname *\[/))) { + left = substr(right, 1, m-1) "tname("; + right = substr(right, m+RLENGTH); + + depth = 1; + n = length(right); + for (i = 1; i <= n; i++) { + c = substr(right, i, 1); + if (c == "]" && --depth == 0) + break; + if (c == "[") + depth++; + } + + left = left substr(right, 1, i-1) ")"; + right = substr(right, i+1); + } + + print left right; +} + +# If we are in the yytname definition, collect all the string literals in the +# array initializer. The first yyntokens strings are terminal symbols, the +# rest are nonterminals. +in_table { + gsub(/\\\\/, "\2\2"); + gsub(/\\"/, "\1"); + + while ($1 ~ /^"/) { + sub(/^[^"]*"/, ""); + + x = index($0, "\""); + s = substr($0, 1, x - 1); + $0 = substr($0, x + 2); + + gsub("\1", "\\\"", s); + + # Bison will remove quotes from the token strings at runtime unless the + # token contains an unescaped backslash, an apostrophe, or a comma. It is + # silly store all these quote characters if they aren't going to be used. + if (s ~ /^\\"[^\'\\,]*\\"$/) + s = substr(s, 3, length(s) - 4); + + if (num_tokens < yyntokens) + tokens[num_tokens++] = s; + else + nterms[num_nterms++] = s; + } +} + +# At the end of the yytname definition, output our replacement function. +in_table && $0 ~ /^};/ { + print "#if !defined(UINT_LEAST8_MAX) || !defined(UINT_LEAST16_MAX)"; + print "# include "; + print "#endif"; + print "#ifndef assert"; + print "# include "; + print "#endif\n"; + + print "static const char *tname(unsigned x)"; + print "{"; + + count = bucketsort(sorted_tokens, tokens); + table = build_strtab(sorted_tokens, count, token_offsets); + token_offset_max = TMAX; + nterm_offset = TLEN; + + sub("\1$", "", table); + gsub("\1", "\"\n\t\t\"\\0\" \"", table); + gsub("\2", "\\", table); + + print "\tstatic const char tname_strings[] ="; + print "\t\t \"" table "\""; + + if ((count = bucketsort(sorted_nterms, nterms))) { + table = build_strtab(sorted_nterms, count, nterm_offsets); + offset_max = nterm_offset + TMAX; + offset_chk = "(YYDEBUG ? " offset_max " : " token_offset_max ")"; + + sub("\1$", "", table); + gsub("\1", "\"\n\t\t\"\\0\" \"", table); + gsub("\2", "\\", table); + + print "#if YYDEBUG"; + if (nterm_offset) + print "\t\t\"\\0\" \"" table "\""; + else + print "\t\t \"" table "\""; + print "#endif"; + } else { + offset_chk = offset_max = token_offset_max; + } + print "\t\t;\n"; + + print "\tstatic const"; + print "#if UINT_LEAST8_MAX >= " offset_chk; + print "\tuint_least8_t"; + print "#elif UINT_LEAST16_MAX >= " offset_chk; + print "\tuint_least16_t"; + print "#else"; + print "\tuint_least32_t"; + print "#endif"; + + print "\ttname_offsets[] = {"; + print_offsets(token_offsets, tokens, num_tokens, log10(offset_max)); + if (num_nterms) { + print "#if YYDEBUG"; + print_offsets(nterm_offsets, nterms, num_nterms, log10(offset_max), nterm_offset); + print "#endif"; + } + print "\t};\n"; + + print "\tassert(x < sizeof tname_offsets / sizeof tname_offsets[0]);"; + print "\treturn tname_strings + tname_offsets[x];"; + print "}"; + + in_table = 0; +} + +function log10(x) +{ + if (x < 10) + return 1; + if (x < 100) + return 2; + if (x < 1000) + return 3; + if (x < 10000) + return 4; + + return 5; +} + +# build_strtab(strings, count, offsets) +# +# Generate a string table. Each token in the strings array (indexed from 0 +# through count-1) is appended to a string, with a \1 byte terminating each +# item. +# +# The table is suffix-compressed: if the token exists as a suffix of one that +# is already in the table, no new entry is needed. The input strings array +# must be sorted from longest to shortest in order to find all possible suffix +# matches. +# +# The offsets array is populated with the actual offsets (for C code), +# indexed by each token, and the resulting table is returned. +# +# The caller must transform the \1 characters to NUL terminators in the C code. +function build_strtab(in_strings, in_count, out_offsets, i, s, ret) +{ + ret = ""; + + for (i = TLEN = 0; i < in_count; i++) { + s = in_strings[i]; + if ((n = index(ret, s "\1")) > 0) { + out_offsets[s] = real_length(substr(ret, 1, n-1)); + } else { + ret = ret s "\1"; + out_offsets[s] = TMAX = TLEN; + TLEN += real_length(s) + 1; + } + } + + return ret; +} + +# print_offsets(offsets, strings, count, w, adj) +# +# Outputs the initializer portion of the tname offset array. +# +# Each token in the strings array (indexed from 0 through count-1) is used to +# index the offsets array, and the resulting value is printed, with commas +# between them. +# +# The w value specifies the minimum field width for the printed offsets, used +# to achieve a visually pleasing alignment of output. +# +# If adj is nonzero, it is added to all the printed offset values, and a comma +# is printed before the first line. +function print_offsets(in_offsets, in_strings, in_count, in_w, in_adj, i, t, s) +{ + t = in_adj ? "\t,\t" : "\t\t"; + s = ""; + + for (i = 0; i < in_count; i++) { + s = s sprintf("%" (in_w+1) "d,", (in_offsets[in_strings[i]] + in_adj)); + if (i+1 == in_count) + sub(/,$/, "", s); + + if ((i+1) % 8 == 0) { + print t s; + t = "\t\t"; + s = ""; + } + } + + print t s; +} + +# bucketsort(dst, src) +# +# Sort the elements of src by descending string length, +# placing them into dst[0] ... dst[n]. +# +# Returns the number of elements. +function bucketsort(dst, src, buckets, max, count, i, t) +{ + for (t in src) { + i = length(src[t]) + if (i > max) { max = i } + buckets[i]++ + } + + for (i = max; i > 0; i--) { + if (i in buckets) { + t = buckets[i] + buckets[i] = count + count += t + } + } + + for (t in src) { + i = length(t = src[t]) + dst[buckets[i]++] = t + } + + return count +} + +# real_length(s) +# +# Calculate the length of s with backslash sequences counted as one character. +function real_length(s, t) +{ + t = length(s) + return t - gsub(/\\./, "&", s) - gsub("\2\2", "&", s); +} -- 2.43.2