--- /dev/null
+#!/bin/awk -f
+#
+# Copyright © 2021 Nick Bowler
+#
+# Generate one or more C array initializers to encode simple tree structures
+# in a compact format.
+#
+# Each line of the input file defines a tree node. The first field must be a
+# C identifier, which may be optionally followed by a comma. The identifiers
+# used on non-leaf nodes must be unique with respect to other non-leaf nodes,
+# but leaf nodes may share the same identifier as other nodes.
+#
+# The tree structure is expressed through indentation. Lines with no leading
+# whitespace designate root nodes. When the number of white-space characters
+# at the beginning of a line is greater than that of the previous line, this
+# introduces a new subtree rooted at the previous node. When the number of
+# leading white-space characters is less than the previous line, this concludes
+# the definition of the current subtree.
+#
+# For example, the input on the left describes the tree on the right.
+#
+# INPUT: root TREE: root -- a -- b -- c
+# a | |
+# b | + -- d
+# c |
+# d + -- e -- f
+# e |
+# f + -- g
+# g
+#
+# For each root node X, the object-like macro X_INITIALIZER is defined. This
+# can be used to initialize an array with static storage duration. Each line
+# of the input is surrounded by braces to construct the initializer for one
+# array element. These are then ordered topologically beginning with the root,
+# but the root itself is not included. Each subtree is terminated by an
+# "empty" element (with an initializer of {0}). For example, the above input
+# will produce this (or an equivalent topological ordering) initializer:
+#
+# OUTPUT: #define root_INITIALIZER \
+# {a}, {e}, {0}, {b}, {d}, {0}, {f}, {g}, {0}, {c}, {0}
+#
+# Constants representing the array offset for each subtree are additionally
+# defined. For a non-leaf node X, the enumeration constant X_OFFSET gives the
+# index of the first node rooted at X. With the above example, the results
+# are:
+#
+# OUTPUT: enum { a_OFFSET = 3, b_OFFSET = 9, e_OFFSET = 6 };
+#
+# By using an array of structures to represent the tree and including these
+# constants in initializers, it is possible for the flattened tree to describe
+# its own structure.
+#
+# Finally, each node identifier is defined as an enumeration constant
+# representing the nonzero offset into a string table tree_strtab. These
+# constants will normally be in scope when using the root_INITIALIZER macro,
+# which means the resulting array initializers will use them.
+#
+# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
+# This is free software: you are free to do what the fuck you want to.
+# There is NO WARRANTY, to the extent permitted by law.
+
+END {
+ print "/*"
+ if (FILENAME) {
+ print " * Automatically generated by gen-tree.awk from " FILENAME
+ } else {
+ print " * Automatically generated by gen-tree.awk"
+ }
+ print " * Do not edit."
+ print " */"
+}
+
+BEGIN {
+ depth = max_depth = 0;
+ num_entries = 0;
+
+ tree_name = "";
+ entry_name = "";
+
+ indent_stack[0] = 0;
+}
+
+NF == 0 { next }
+{ indent = index($0, $1) - 1 }
+
+indent > 0 {
+ if (indent > indent_stack[depth]) {
+ indent_stack[++depth] = indent;
+ if (depth > 1) {
+ subtree_offsets[entry_name] = level_count[depth];
+ subtree_depth[entry_name] = depth;
+ }
+ } else {
+ while (indent < indent_stack[depth]) {
+ tree_items[depth] = tree_items[depth] ", \\\n\t{ 0 }";
+ level_count[depth]++;
+ depth--;
+ }
+ }
+
+ entry_name = $1; sub(/,$/, "", entry_name);
+ all_items[num_entries++] = entry_name;
+
+ $1 = ", \\\n\t{ " $1; $NF = $NF " }";
+ tree_items[depth] = tree_items[depth] $0;
+ level_count[depth]++;
+}
+
+indent == 0 && tree_identifier {
+ trees[tree_identifier] = format_items();
+ tree_identifier = "";
+}
+END {
+ if (tree_identifier)
+ trees[tree_identifier] = format_items()
+}
+indent == 0 { tree_identifier = $1 }
+
+END {
+ entry_strtab = "\1";
+ bucketsort(sorted_items, all_items);
+ for (i = 0; i < num_entries; i++) {
+ s = sorted_items[i];
+ if ((n = index(entry_strtab, s "\1")) > 0) {
+ entry_offsets[s] = n-1;
+ } else {
+ entry_offsets[s] = length(entry_strtab);
+ entry_strtab = entry_strtab s "\1";
+ }
+ }
+
+ gsub(/\1/, "\"\n\t\"\\0\" \"", entry_strtab);
+ sub(/^"/, "", entry_strtab);
+ sub(/\n[^\n]*$/, ";", entry_strtab);
+ print "\nstatic const char tree_strtab[] =" entry_strtab
+
+ prefix = "\nenum {\n";
+ for (item in entry_offsets) {
+ printf "%s\t%s = %d", prefix, item, entry_offsets[item];
+ prefix = ",\n";
+ }
+ for (i in subtree_offsets) {
+ printf ",\n\t%s_OFFSET = %d", i, subtree_offsets[i];
+ }
+ print "\n};";
+}
+
+END {
+ for (tree in trees) {
+ print "\n" trees[tree];
+ }
+}
+
+function format_items(s, i)
+{
+ while (depth > 0) {
+ tree_items[depth] = tree_items[depth] ", \\\n\t{ 0 }";
+ level_count[depth]++;
+ depth--;
+ }
+
+ for (i = 2; tree_items[i]; i++) {
+ level_count[i] += level_count[i-1];
+ }
+
+ for (i in subtree_depth) {
+ subtree_offsets[i] += level_count[subtree_depth[i]-1];
+ delete subtree_depth[i];
+ }
+
+ for (i = 1; tree_items[i]; i++) {
+ s = s tree_items[i];
+ delete tree_items[i];
+ delete level_count[i];
+ }
+
+ sub(/^,/, "", s);
+ return "#define " tree_identifier "_INITIALIZER" s;
+}
+
+# bucketsort(dst, src)
+#
+# Sort the elements of src by descending string length,
+# placing them into dst[0] ... dst[n].
+#
+# Returns the number of elements.
+function bucketsort(dst, src, buckets, max, count, i, t)
+{
+ for (t in src) {
+ i = length(src[t])
+ if (i > max) { max = i }
+ buckets[i]++
+ }
+
+ for (i = max; i > 0; i--) {
+ if (i in buckets) {
+ t = buckets[i]
+ buckets[i] = count
+ count += t
+ }
+ }
+
+ for (t in src) {
+ i = length(t = src[t])
+ dst[buckets[i]++] = t
+ }
+
+ return count
+}