#!/bin/awk -f # # Copyright © 2021-2022 Nick Bowler # # Generate one or more C array initializers to encode simple tree structures # in a compact format. # # Each nonempty line of the input file is either a comment, an option # specification or a tree specification. Each line is distinguished by # its first character. A # character introduces a comment, an @ character # introduces an option specification (described later), and all other # nonempty lines are tree nodes. # # The first field of a tree specification must be a valid C identifier, # optionally followed by a comma. The identifiers used on non-leaf nodes # must be unique with respect to other non-leaf nodes, but leaf nodes may # share the same identifier as other nodes. # # The tree structure is expressed through indentation. Lines with no leading # whitespace designate root nodes. When the number of white-space characters # at the beginning of a line is greater than that of the previous line, this # introduces a new subtree rooted at the previous node. When the number of # leading white-space characters is less than the previous line, this concludes # the definition of the current subtree. # # For example, the input on the left describes the tree on the right. # # INPUT: root TREE: root -- a -- b -- c # a | | # b | + -- d # c | # d + -- e -- f # e | # f + -- g # g # # For each root node X, the object-like macro X_INITIALIZER is defined. This # can be used to initialize an array with static storage duration. Each line # of the input is defines the initializer for one array element. These are # then ordered topologically beginning with the root, but the root itself is # not included. Each subtree is terminated by an "empty" element (with an # initializer of {0}). # # If there is a comma after the identifier which begins a tree element, or # if the identifier is the only text on the line, then the element initializer # is constructed by surrounding the entire line by braces. Otherwise, the # identifier is removed and the remainder of the line is surrounded by braces # to construct the initializer. This allows the exact form of the initializer # to be customized. # # For example, the above input will produce this (or an equivalent topological # ordering) initializer: # # OUTPUT: #define root_INITIALIZER \ # {a}, {e}, {0}, {b}, {d}, {0}, {f}, {g}, {0}, {c}, {0} # # Constants representing the array offset for each subtree are additionally # defined. For a non-leaf node X, the enumeration constant X_OFFSET gives the # index of the first node rooted at X. With the above example, the results # are: # # OUTPUT: enum { a_OFFSET = 3, b_OFFSET = 9, e_OFFSET = 6 }; # # By using an array of structures to represent the tree and including these # constants in initializers, it is possible for the flattened tree to describe # its own structure. # # Finally, each node identifier is defined as an enumeration constant # representing the nonzero offset into a string table tree_strtab. These # constants will normally be in scope when using the root_INITIALIZER macro, # which means the resulting array initializers will use them. # # Options may be used to alter the normal behaviour. They may be placed # anywhere in the input file. The following options are defined: # # @nostrtab # Do not output a definition for tree_strtab or its associated # enumeration constants. # # License WTFPL2: Do What The Fuck You Want To Public License, version 2. # This is free software: you are free to do what the fuck you want to. # There is NO WARRANTY, to the extent permitted by law. END { print "/*" if (FILENAME) { print " * Automatically generated by gen-tree.awk from " FILENAME } else { print " * Automatically generated by gen-tree.awk" } print " * Do not edit." print " */" } BEGIN { # Check if "\\\\" in substitutions gives just one backslash. bs = "x"; sub(/x/, "\\\\", bs); bs = (length(bs) == 1 ? "\\\\" : "\\"); opts["strtab"] = 1; depth = max_depth = 0; num_entries = 0; tree_name = ""; entry_name = ""; indent_stack[0] = 0; } # Comments NF == 0 { next } $0 ~ /^#/ { next } # Options sub(/^@/, "", $0) { if (NF == 1) { orig=$1 gsub(/-/, "_", $1); val = !sub(/^no_?/, "", $1); if ($1 in opts) { opts[$1] = val; } else { print "error: unrecognized option: @" orig | "cat 1>&2" exit 1 } } next } { indent = index($0, $1) - 1 } indent > 0 { if (indent > indent_stack[depth]) { indent_stack[++depth] = indent; if (depth > 1) { subtree_offsets[entry_name] = level_count[depth]; subtree_depth[entry_name] = depth; } } else { while (indent < indent_stack[depth]) { tree_items[depth] = tree_items[depth] ", \\\n\t{ 0 }"; level_count[depth]++; depth--; } } entry_name = $1; sub(/,$/, "", entry_name); all_items[num_entries++] = entry_name; # Construct the element initializer for this tree node. $1 = " " $1; if (NF > 1) { # Check if entry name is followed by a comma. If it is not, the entry # name is excluded from the initializer. check_str = $1$2; gsub(/[ \t]/, "", check_str); if (index(check_str, entry_name ",") == 0) { $1 = ""; } } $1 = ", \\\n\t{" $1; $NF = $NF " }"; tree_items[depth] = tree_items[depth] $0; level_count[depth]++; } indent == 0 && tree_identifier != "" { trees[tree_identifier] = format_items(); tree_identifier = ""; } END { if (tree_identifier != "") trees[tree_identifier] = format_items() } indent == 0 { tree_identifier = $1 } END { prefix = "\nenum {\n"; if (opts["strtab"]) { entry_strtab = "\1"; bucketsort(sorted_items, all_items); for (i = 0; i < num_entries; i++) { s = sorted_items[i]; if ((n = index(entry_strtab, s "\1")) > 0) { entry_offsets[s] = n-1; } else { entry_offsets[s] = length(entry_strtab); entry_strtab = entry_strtab s "\1"; } } gsub("\1", "\"\n\t\"" bs "0\" \"", entry_strtab); sub(/^"/, "", entry_strtab); sub(/\n[^\n]*$/, ";", entry_strtab); print "\nstatic const char tree_strtab[] =" entry_strtab for (item in entry_offsets) { printf "%s\t%s = %d", prefix, item, entry_offsets[item]; prefix = ",\n"; } } for (i in subtree_offsets) { printf "%s\t%s_OFFSET = %d", prefix, i, subtree_offsets[i]; prefix = ",\n"; } if (!index(prefix, "enum")) { print "\n};"; } } END { for (tree in trees) { print "\n" trees[tree]; } } function format_items(s, i) { while (depth > 0) { tree_items[depth] = tree_items[depth] ", \\\n\t{ 0 }"; level_count[depth]++; depth--; } for (i = 2; tree_items[i] != ""; i++) { level_count[i] += level_count[i-1]; } for (i in subtree_depth) { subtree_offsets[i] += level_count[subtree_depth[i]-1]; delete subtree_depth[i]; } for (i = 1; tree_items[i] != ""; i++) { s = s tree_items[i]; delete tree_items[i]; delete level_count[i]; } sub(/^,/, "", s); return "#define " tree_identifier "_INITIALIZER" s; } # bucketsort(dst, src) # # Sort the elements of src by descending string length, # placing them into dst[0] ... dst[n]. # # Returns the number of elements. function bucketsort(dst, src, max, count, i, t) { # Note: ULTRIX 4.5 nawk does not support local array parameters split("", bucketsort_buckets); for (t in src) { i = length(src[t]) if (i > max) { max = i } bucketsort_buckets[i]++ } for (i = max; i > 0; i--) { if (i in bucketsort_buckets) { t = bucketsort_buckets[i] bucketsort_buckets[i] = count count += t } } for (t in src) { i = length(t = src[t]) dst[bucketsort_buckets[i]++] = t } return count }