#!/bin/awk -f
#
# Copyright © 2021-2022 Nick Bowler
#
# Generate one or more C array initializers to encode simple tree structures
# in a compact format.
#
# Each nonempty line of the input file is either a comment, an option
# specification or a tree specification.  Each line is distinguished by
# its first character.  A # character introduces a comment, an @ character
# introduces an option specification (described later), and all other
# nonempty lines are tree nodes.
#
# The first field of a tree specification must be a valid C identifier,
# optionally followed by a comma.  The identifiers used on non-leaf nodes
# must be unique with respect to other non-leaf nodes, but leaf nodes may
# share the same identifier as other nodes.
#
# The tree structure is expressed through indentation.  Lines with no leading
# whitespace designate root nodes.  When the number of white-space characters
# at the beginning of a line is greater than that of the previous line, this
# introduces a new subtree rooted at the previous node.  When the number of
# leading white-space characters is less than the previous line, this concludes
# the definition of the current subtree.
#
# For example, the input on the left describes the tree on the right.
#
#    INPUT: root           TREE: root -- a -- b -- c
#              a                    |    |
#                 b                 |    + -- d
#                    c              |
#                 d                 + -- e -- f
#              e                         |
#                 f                      + -- g
#                 g
#
# For each root node X, the object-like macro X_INITIALIZER is defined.  This
# can be used to initialize an array with static storage duration.  Each line
# of the input is defines the initializer for one array element.  These are
# then ordered topologically beginning with the root, but the root itself is
# not included.  Each subtree is terminated by an "empty" element (with an
# initializer of {0}).
#
# If there is a comma after the identifier which begins a tree element, or
# if the identifier is the only text on the line, then the element initializer
# is constructed by surrounding the entire line by braces.  Otherwise, the
# identifier is removed and the remainder of the line is surrounded by braces
# to construct the initializer.  This allows the exact form of the initializer
# to be customized.
#
# For example, the above input will produce this (or an equivalent topological
# ordering) initializer:
#
#    OUTPUT: #define root_INITIALIZER \
#                 {a}, {e}, {0}, {b}, {d}, {0}, {f}, {g}, {0}, {c}, {0}
#
# Constants representing the array offset for each subtree are additionally
# defined.  For a non-leaf node X, the enumeration constant X_OFFSET gives the
# index of the first node rooted at X.  With the above example, the results
# are:
#
#    OUTPUT: enum { a_OFFSET = 3, b_OFFSET = 9, e_OFFSET = 6 };
#
# By using an array of structures to represent the tree and including these
# constants in initializers, it is possible for the flattened tree to describe
# its own structure.
#
# Finally, each node identifier is defined as an enumeration constant
# representing the nonzero offset into a string table tree_strtab.  These
# constants will normally be in scope when using the root_INITIALIZER macro,
# which means the resulting array initializers will use them.
#
# Options may be used to alter the normal behaviour.  They may be placed
# anywhere in the input file.  The following options are defined:
#
#   @nostrtab
#     Do not output a definition for tree_strtab or its associated
#     enumeration constants.
#
# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
# This is free software: you are free to do what the fuck you want to.
# There is NO WARRANTY, to the extent permitted by law.

END {
  print "/*"
  if (FILENAME) {
    print " * Automatically generated by gen-tree.awk from " FILENAME
  } else {
    print " * Automatically generated by gen-tree.awk"
  }
  print " * Do not edit."
  print " */"
}

BEGIN {
  # Check if "\\\\" in substitutions gives just one backslash.
  bs = "x"; sub(/x/, "\\\\", bs);
  bs = (length(bs) == 1 ? "\\\\" : "\\");

  opts["strtab"] = 1;

  depth = max_depth = 0;
  num_entries = 0;

  tree_name = "";
  entry_name = "";

  indent_stack[0] = 0;
}

# Comments
NF == 0 { next }
$0 ~ /^#/ { next }

# Options
sub(/^@/, "", $0) {
  if (NF == 1) {
    orig=$1
    gsub(/-/, "_", $1);
    val = !sub(/^no_?/, "", $1);
    if ($1 in opts) {
      opts[$1] = val;
    } else {
      print "error: unrecognized option: @" orig | "cat 1>&2"
      exit 1
    }
  }
  next
}

{ indent = index($0, $1) - 1 }

indent > 0 {
  if (indent > indent_stack[depth]) {
    indent_stack[++depth] = indent;
    if (depth > 1) {
      subtree_offsets[entry_name] = level_count[depth];
      subtree_depth[entry_name] = depth;
    }
  } else {
    while (indent < indent_stack[depth]) {
      tree_items[depth] = tree_items[depth] ", \\\n\t{ 0 }";
      level_count[depth]++;
      depth--;
    }
  }

  entry_name = $1; sub(/,$/, "", entry_name);
  all_items[num_entries++] = entry_name;

  # Construct the element initializer for this tree node.
  $1 = " " $1;
  if (NF > 1) {
    # Check if entry name is followed by a comma.  If it is not, the entry
    # name is excluded from the initializer.
    check_str = $1$2;
    gsub(/[ \t]/, "", check_str);
    if (index(check_str, entry_name ",") == 0) {
      $1 = "";
    }
  }
  $1 = ", \\\n\t{" $1; $NF = $NF " }";

  tree_items[depth] = tree_items[depth] $0;
  level_count[depth]++;
}

indent == 0 && tree_identifier != "" {
  trees[tree_identifier] = format_items();
  tree_identifier = "";
}
END {
  if (tree_identifier != "")
    trees[tree_identifier] = format_items()
}
indent == 0 { tree_identifier = $1 }

END {
  prefix = "\nenum {\n";

  if (opts["strtab"]) {
    entry_strtab = "\1";
    bucketsort(sorted_items, all_items);
    for (i = 0; i < num_entries; i++) {
      s = sorted_items[i];
      if ((n = index(entry_strtab, s "\1")) > 0) {
        entry_offsets[s] = n-1;
      } else {
        entry_offsets[s] = length(entry_strtab);
        entry_strtab = entry_strtab s "\1";
      }
    }

    gsub("\1", "\"\n\t\"" bs "0\" \"", entry_strtab);
    sub(/^"/, "", entry_strtab);
    sub(/\n[^\n]*$/, ";", entry_strtab);
    print "\nstatic const char tree_strtab[] =" entry_strtab

    for (item in entry_offsets) {
      printf "%s\t%s = %d", prefix, item, entry_offsets[item];
      prefix = ",\n";
    }
  }

  for (i in subtree_offsets) {
    printf "%s\t%s_OFFSET = %d", prefix, i, subtree_offsets[i];
    prefix = ",\n";
  }
  if (!index(prefix, "enum")) {
    print "\n};";
  }
}

END {
  for (tree in trees) {
    print "\n" trees[tree];
  }
}

function format_items(s, i)
{
  while (depth > 0) {
    tree_items[depth] = tree_items[depth] ", \\\n\t{ 0 }";
    level_count[depth]++;
    depth--;
  }

  for (i = 2; tree_items[i] != ""; i++) {
    level_count[i] += level_count[i-1];
  }

  for (i in subtree_depth) {
    subtree_offsets[i] += level_count[subtree_depth[i]-1];
    delete subtree_depth[i];
  }

  for (i = 1; tree_items[i] != ""; i++) {
    s = s tree_items[i];
    delete tree_items[i];
    delete level_count[i];
  }

  sub(/^,/, "", s);
  return "#define " tree_identifier "_INITIALIZER" s;
}

# bucketsort(dst, src)
#
# Sort the elements of src by descending string length,
# placing them into dst[0] ... dst[n].
#
# Returns the number of elements.
function bucketsort(dst, src, max, count, i, t)
{
  # Note: ULTRIX 4.5 nawk does not support local array parameters
  split("", bucketsort_buckets);

  for (t in src) {
    i = length(src[t])
    if (i > max) { max = i }
    bucketsort_buckets[i]++
  }

  for (i = max; i > 0; i--) {
    if (i in bucketsort_buckets) {
      t = bucketsort_buckets[i]
      bucketsort_buckets[i] = count
      count += t
    }
  }

  for (t in src) {
    i = length(t = src[t])
    dst[bucketsort_buckets[i]++] = t
  }

  return count
}