#!/bin/awk -f
#
# Copyright © 2023-2024 Nick Bowler
#
# Hackjob to improve the horrible yytname array generated by Bison.
#
# Bison generates a list of symbol names as a static array of char pointers
# initialized with string literals, which is simply horrible.  These pointers
# are two to four times larger than necessary, but with position-independent
# code this also forces them into an unshareable, writeable segment since
# they are not compile-time constants and must be initialized by the dynamic
# loader at runtime.
#
# Furthermore, the names of nonterminal symbols are always included but they
# are not always needed; they should only be output by tracing code which
# is disabled by default at compile time.
#
# This script replaces the definition of the yytname array with a function
# that implements the same lookup using truly constant tables, and makes the
# inclusion of nonterminal symbols in these tables conditional on YYDEBUG.
#
# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
# This is free software: you are free to do what the fuck you want to.
# There is NO WARRANTY, to the extent permitted by law.

BEGIN {
  in_table   = 0;
  num_tokens = 0;
  num_nterms = 0;
}

# Locate YYNTOKENS definition, needed to distinguish terminal symbols
# from nonterminals.
$1 == "#define" && $2 == "YYNTOKENS" { yyntokens = $3; }

# Locate the yytname array definition to replace it.
$0 ~ /static.*yytname *\[/ { in_table = 1; }

# If we are not in the yytname definition, look for direct references to the
# array and replace them with a function call.
!in_table {
  left = ""
  right = $0;

  while ((m = match(right, /yytname *\[/))) {
    left  = substr(right, 1, m-1) "tname(";
    right = substr(right, m+RLENGTH);

    depth = 1;
    n = length(right);
    for (i = 1; i <= n; i++) {
      c = substr(right, i, 1);
      if (c == "]" && --depth == 0)
        break;
      if (c == "[")
        depth++;
    }

    left  = left substr(right, 1, i-1) ")";
    right = substr(right, i+1);
  }

  print left right;
}

# If we are in the yytname definition, collect all the string literals in the
# array initializer.  The first yyntokens strings are terminal symbols, the
# rest are nonterminals.
in_table {
  gsub(/\\\\/, "\2\2");
  gsub(/\\"/,  "\1");

  while ($1 ~ /^"/) {
    sub(/^[^"]*"/, "");

    x  = index($0, "\"");
    s  = substr($0, 1, x - 1);
    $0 = substr($0, x + 2);

    gsub("\1", "\\\"", s);

    # Bison will remove quotes from the token strings at runtime unless the
    # token contains an unescaped backslash, an apostrophe, or a comma.  It is
    # silly store all these quote characters if they aren't going to be used.
    if (s ~ /^\\"[^\'\\,]*\\"$/)
      s = substr(s, 3, length(s) - 4);

    if (num_tokens < yyntokens)
      tokens[num_tokens++] = s;
    else
      nterms[num_nterms++] = s;
  }
}

# At the end of the yytname definition, output our replacement function.
in_table && $0 ~ /^};/ {
  print "#if !defined(UINT_LEAST8_MAX) || !defined(UINT_LEAST16_MAX)";
  print "#  include <inttypes.h>";
  print "#endif";
  print "#ifndef assert";
  print "#  include <assert.h>";
  print "#endif";
  print "#ifndef offsetof";
  print "#  include <stddef.h>";
  print "#endif\n"

  print "static const char *tname(unsigned x)";
  print "{";

  count = bucketsort(sorted_tokens, tokens);
  token_table = build_strtab(sorted_tokens, count, token_offsets);
  token_size = TLEN;
  offset_max = TMAX;

  if ((count = bucketsort(sorted_nterms, nterms))) {
    nterm_table = build_strtab(sorted_nterms, count, nterm_offsets);
    offset_max = token_size + TMAX;
    nterm_size = TLEN;
  } else {
    nterm_table = "";
    nterm_size = 0;
  }

  print "\tstruct tname_data {";
  print "#if YYDEBUG";
  print "\t\tuint_least16_t map[" num_tokens + num_nterms "];";
  print "\t\tchar tab[" token_size + nterm_size "];";
  print "#else"
  print "\t\tuint_least16_t map[" num_tokens "];";
  print "\t\tchar tab[" token_size "];";
  print "#endif";
  print "\t};";

  print "\n\tenum { O = offsetof(struct tname_data, tab) };";
  print "\tstatic const struct tname_data data = {";
  print "\t\t{";
  print_offsets(token_offsets, tokens, num_tokens, log10(offset_max));
  if (num_nterms) {
    print "#if YYDEBUG";
    print_offsets(nterm_offsets, nterms, num_nterms, log10(offset_max), token_size);
    print "#endif";
  }

  sub("\1$", "\"", token_table);
  gsub("\1", "\"\n\t\t\"\\0\" \"", token_table);
  gsub("\2", "\\", token_table);
  print "\t\t},   \"" token_table;

  if (num_nterms) {
    print "#if YYDEBUG";
    sub("\1$", "\"", nterm_table);
    gsub("\1", "\"\n\t\t\"\\0\" \"", nterm_table);
    gsub("\2", "\\", nterm_table);
    print "\t\t\"\\0\" \"" nterm_table;
    print "#endif";
  }

  print "\t};";

  print "\n\tassert(x < sizeof data.map / sizeof data.map[0]);"
  print "\treturn (char *)&data + data.map[x];";
  print "}";

  in_table = 0;
}

function log10(x)
{
  if (x < 10)
    return 1;
  if (x < 100)
    return 2;
  if (x < 1000)
    return 3;
  if (x < 10000)
    return 4;

  return 5;
}

# build_strtab(strings, count, offsets)
#
# Generate a string table.  Each token in the strings array (indexed from 0
# through count-1) is appended to a string, with a \1 byte terminating each
# item.
#
# The table is suffix-compressed: if the token exists as a suffix of one that
# is already in the table, no new entry is needed.  The input strings array
# must be sorted from longest to shortest in order to find all possible suffix
# matches.
#
# The offsets array is populated with the actual offsets (for C code),
# indexed by each token, and the resulting table is returned.
#
# The caller must transform the \1 characters to NUL terminators in the C code.
function build_strtab(in_strings, in_count, out_offsets, i, s, ret)
{
  ret = "";

  for (i = TLEN = 0; i < in_count; i++) {
    s = in_strings[i];
    if ((n = index(ret, s "\1")) > 0) {
      out_offsets[s] = real_length(substr(ret, 1, n-1));
    } else {
      ret = ret s "\1";
      out_offsets[s] = TMAX = TLEN;
      TLEN += real_length(s) + 1;
    }
  }

  return ret;
}

# print_offsets(offsets, strings, count, w, adj)
#
# Outputs the initializer portion of the tname offset array.
#
# Each token in the strings array (indexed from 0 through count-1) is used to
# index the offsets array, and the resulting value is printed, with commas
# between them.
#
# The w value specifies the minimum field width for the printed offsets, used
# to achieve a visually pleasing alignment of output.
#
# If adj is nonzero, it is added to all the printed offset values, and a comma
# is printed before the first line.
function print_offsets(in_offsets, in_strings, in_count, in_w, in_adj, i, t, s)
{
  t = in_adj ? "\t,\t" : "\t\t";
  s = "";

  for (i = 0; i < in_count; i++) {
    s = s sprintf("%" (in_w+1) "d+O,", (in_offsets[in_strings[i]] + in_adj));
    if (i+1 == in_count)
      sub(/,$/, "", s);

    if ((i+1) % 8 == 0) {
      print t s;
      t = "\t\t";
      s = "";
    }
  }

  if (i%8 != 0)
    print t s;
}

# bucketsort(dst, src)
#
# Sort the elements of src by descending string length,
# placing them into dst[0] ... dst[n].
#
# Returns the number of elements.
function bucketsort(dst, src, buckets, max, count, i, t)
{
  for (t in src) {
    i = length(src[t])
    if (i > max) { max = i }
    buckets[i]++
  }

  for (i = max; i > 0; i--) {
    if (i in buckets) {
      t = buckets[i]
      buckets[i] = count
      count += t
    }
  }

  for (t in src) {
    i = length(t = src[t])
    dst[buckets[i]++] = t
  }

  return count
}

# real_length(s)
#
# Calculate the length of s with backslash sequences counted as one character.
function real_length(s, t)
{
  t = length(s)
  return t - gsub(/\\./, "&", s) - gsub("\2\2", "&", s);
}