Add script for generating miscellaneous string tables.

author Nick Bowler <nbowler@draconx.ca>

Tue, 2 Mar 2021 04:15:30 +0000 (23:15 -0500)

committer Nick Bowler <nbowler@draconx.ca>

Tue, 2 Mar 2021 04:16:30 +0000 (23:16 -0500)
author Nick Bowler <nbowler@draconx.ca>
Tue, 2 Mar 2021 04:15:30 +0000 (23:15 -0500)
committer Nick Bowler <nbowler@draconx.ca>
Tue, 2 Mar 2021 04:16:30 +0000 (23:16 -0500)
diff --git a/scripts/gen-strtab.awk b/scripts/gen-strtab.awk

new file mode 100755 (executable)

index 0000000..f844d4f
--- /dev/null
+++ b/scripts/gen-strtab.awk
@@ -0,0 +1,204 @@
+#!/bin/awk -f
+#
+# Copyright © 2021 Nick Bowler
+#
+# Generate a C string table based on an input string specification file.
+#
+# A string table is a single large char array containing all of
+# the specified (0-terminated) strings, which is then offset to obtain
+# the desired string.  By storing these offsets instead of string pointers
+# into read-only data structures, this can reduce the need for relocation
+# processing at startup when programs are built in PIC mode.
+#
+# The string specification file is processed line by line.  Comment
+# lines may be included by beginning the line with a # character, which
+# must be the very first character on the line.  If a comment is encountered,
+# processing immediately moves on to the next line and the result is as if
+# the comment line were omitted from the input.
+#
+# A string is defined by beginning a line with an & character, which must
+# be immediately followed by a C identifier.  A nonempty sequence of
+# whitespace (with at most one newline) separates the identifier from the
+# beginning of the string itself.  This whitespace is never included in the
+# output.
+#
+# The string is then interpreted as follows:
+#
+#   - Leading blanks on each line are ignored.
+#   - The sequences \\, \a, \b, \t, \n, \v, \f and \r can be entered and
+#     mean the same as they do in C string literals.  The "\\" sequence
+#     prevents any special interpretation of the second backslash.
+#   - Newlines in the input are included in the output, except for the
+#     case where the entire string (including its identifier) is on one line.
+#   - If this is not desired, a newline which is immediately preceded by an
+#     unescaped backslash will be deleted, along with the backslash.
+#   - All other backslashes are deleted.  This can be used to prevent special
+#     handling of whitespace, # or & characters at the beginning of a line.
+#
+# The output defines a variable, strtab, which contains all of the strings,
+# and each identifier in the input is declared as an enumeration constant
+# whose value is the offset of the associated string within strtab.
+#
+# The object-like macro STRTAB_MAX_OFFSET is defined and expands to the
+# greatest string offset, suitable for use in #if preprocessing directives.
+
+# Emit the banner comment that opens the generated output.  This is the
+# first END rule in the script, so it runs before the END rules which
+# print the table itself.  The input file name is mentioned only when awk
+# has one to report.
+END {
+  print "/*"
+  print " * Automatically generated by gen-strtab.awk" \
+        (FILENAME ? " from " FILENAME : "")
+  print " * Do not edit."
+  print " */"
+}
+
+# Set up the parser state before any input is read.
+BEGIN {
+  # Identifier of the string currently being read ("" when there is none)
+  # and the text gathered for it so far.
+  ident = ""
+  collected = ""
+
+  # Record numbers of the first and most recent lines of that string.
+  startline = 0
+  endline = 0
+
+  # Number of completed identifiers stored in vars[].
+  num_vars = 0
+}
+
+# Comment lines (# in the first column) are dropped before any other rule
+# gets to see them.
+/^#/ { next }
+
+# A line beginning with & opens a new string definition.  Complete the
+# definition that was in progress (if any), record the new identifier, and
+# leave whatever follows the identifier in $0 so that the collection rule
+# later in the script picks it up as the first piece of the string.
+$0 ~ /^[&]/ {
+  if (ident) {
+    finish_string_input(strings, ident, collected)
+    vars[num_vars++] = ident
+  }
+
+  startline = NR
+  ident = $1
+  sub(/^[&]/, "", ident)
+
+  # Strip the identifier by editing the record text directly.  Assigning
+  # to $1 would make awk rebuild $0 from its fields, which collapses every
+  # run of blanks in the rest of the line into a single space and so
+  # silently alters the string being defined.
+  sub(/^[^ \t]*/, "")
+  collected = ""
+}
+
+# While a string definition is open, append the current line (without its
+# leading blanks) to the collected text.  Lines are joined with a newline,
+# except that nothing is inserted while the collected text is still empty,
+# so the line break after a bare identifier does not become part of the
+# string.
+ident {
+  sub(/^[ \t]*/, "")
+
+  # Compare against "" explicitly: a plain truth test treats input text
+  # such as "0" as a number, and would discard a first line consisting
+  # of just a zero when the next line arrives.
+  if (collected != "") {
+    collected = collected "\n" $0
+  } else {
+    collected = $0
+  }
+
+  endline = NR
+}
+
+# Input is exhausted: complete the string definition that was still being
+# collected, if there is one, and record its identifier.
+END {
+  if (ident) {
+    vars[num_vars++] = ident
+    finish_string_input(strings, ident, collected)
+  }
+}
+
+# strtab_find(tab, s)
+#
+# Look for a place in the string table text tab where s can be reused as
+# the tail of an existing entry.  Entries in tab are separated by \1, and
+# literal backslashes are represented by \2, so any backslash remaining in
+# tab is expected to start a two-character escape sequence.
+#
+# A usable match must run to the end of an entry and must not begin on the
+# second character of an escape sequence; otherwise, for example, the
+# string "n" would be found inside the trailing "\n" of another entry.
+#
+# Returns the 1-based position of the match within tab, or 0 if none.
+function strtab_find(tab, s, rest, pos, n)
+{
+  tab = tab "\1"
+  s = s "\1"
+
+  rest = tab
+  pos = 0
+  while ((n = index(rest, s)) > 0) {
+    pos += n
+    if (pos == 1 || substr(tab, pos-1, 1) != "\\") {
+      return pos
+    }
+
+    # Matched in the middle of an escape sequence; keep searching
+    # to the right of this position.
+    rest = substr(tab, pos+1)
+  }
+
+  return 0
+}
+
+# Build the string table from the collected strings and print it, followed
+# by the enumeration of offsets and the STRTAB_MAX_OFFSET macro.
+END {
+  strtab = ""
+  strtab_len = 0
+  count = bucketsort(sorted_strings, strings)
+  max = 0
+
+  print "\n#define STR_L10N_(x)"
+  print "#ifndef N_"
+  print "#  define N_(x) x"
+  print "#endif"
+  print "\nstatic const char strtab[] ="
+
+  # Strings arrive longest first, so a later string that is the tail of an
+  # earlier one can share its storage instead of being added again.
+  for (i = 0; i < count; i++) {
+    s = sorted_strings[i]
+
+    # Hide escaped backslashes so that each one counts as one character
+    # and is not taken for the start of an escape sequence.
+    gsub(/\\\\/, "\2", s)
+    if ((n = strtab_find(strtab, s)) > 0) {
+      offsets[sorted_strings[i]] = real_length(substr(strtab, 1, n-1))
+
+      # The string is not stored separately, so mention it inside a macro
+      # that expands to nothing (presumably so that tools which scan the
+      # output for N_() still see it).
+      print "\tSTR_L10N_(N_(\"" sorted_strings[i] "\"))"
+    } else if (strtab) {
+      # The extra 1 accounts for the terminator of the preceding entry.
+      strtab = strtab "\1" s
+      offsets[sorted_strings[i]] = strtab_len + 1
+      strtab_len += real_length(s) + 1
+    } else {
+      strtab = s
+      offsets[sorted_strings[i]] = 0
+      strtab_len += real_length(s)
+    }
+  }
+
+  # Turn the placeholders back into C source text: \2 becomes an escaped
+  # backslash and each \1 separator ends one literal with an explicit
+  # "\0" and begins the next.
+  gsub(/\2/, "\\\\", strtab)
+  gsub(/\1/, "\")\"\\0\"\n\tN_(\"", strtab)
+  print "\tN_(\"" strtab "\")"
+  print "\t\"\";"
+
+  # Identifiers are listed in input order; max tracks the largest offset.
+  print "enum {"
+  for (i = 0; i < num_vars; i++) {
+    sep = (i+1) != num_vars ? "," : ""
+    s = vars[i]
+    o = offsets[strings[s]]
+    print "\t" s " = " o sep
+    if (o > max) {
+      max = o
+    }
+  }
+  print "};"
+  print "\n#define STRTAB_MAX_OFFSET " max
+}
+
+# finish_string_input(strings, ident, val)
+#
+# Deal with backslash-escapes and special characters in val, then store
+# the result, in the form it takes inside a C string literal, as
+# strings[ident].
+#
+# Reads the globals startline and endline to tell whether the definition
+# spanned more than one input line.
+function finish_string_input(strings, ident, val, n, tmpval)
+{
+  # Hide escaped backslashes behind \1 so that they are not mistaken for
+  # the start of another escape sequence below.
+  gsub(/\\\\/, "\1", val)
+
+  # A definition spanning several lines keeps its final newline; after
+  # that, any newline preceded by an unescaped backslash is removed
+  # together with the backslash.
+  val = val (endline > startline ? "\n" : "")
+  gsub(/\\\n/, "", val)
+
+  # Delete every backslash that does not introduce one of the recognized
+  # C escapes, keeping the character which follows it.
+  tmpval = ""
+  while ((n = match(val, /\\[^abtnvfr]/)) > 0) {
+    tmpval = tmpval substr(val, 1, n-1)
+    val = substr(val, n+1)
+  }
+
+  # A backslash at the very end has no following character and is missed
+  # by the loop above.  Delete it too, otherwise it would escape the
+  # closing quote of the generated C string literal.
+  sub(/\\$/, "", val)
+  tmpval = tmpval val
+
+  # Escape special characters
+  gsub(/"/, "\\\"", tmpval)
+  gsub(/\t/, "\\t", tmpval)
+  gsub(/\n/, "\\n", tmpval)
+
+  # NOTE(review): whether "\\\\" yields one or two backslashes when used
+  # as a gsub replacement differs between awk implementations (gawk
+  # documents a difference under --posix); confirm with the awk in use.
+  gsub(/\1/, "\\\\", tmpval)
+
+  strings[ident] = tmpval
+}
+
+# real_length(s)
+#
+# Return the length of s when every backslash followed by another
+# character is counted as a single character rather than two.
+function real_length(s, t)
+{
+  # Replacing each match with itself leaves s unchanged; only the
+  # number of matches reported by gsub is of interest.
+  t = gsub(/\\./, "&", s)
+  return length(s) - t
+}
+
+# bucketsort(dst, src)
+#
+# Sort the elements of src by descending string length,
+# placing them into dst[0] ... dst[n-1], where n is the number
+# of elements.  Empty strings are included and sort last.
+#
+# Returns the number of elements.
+function bucketsort(dst, src, buckets, max, count, i, t, val)
+{
+  max = count = 0
+
+  # First pass: count how many strings there are of each length.
+  for (t in src) {
+    i = length(src[t])
+    if (i > max) { max = i }
+    buckets[i]++
+  }
+
+  # Convert the counts into the first dst index for each length, longest
+  # first.  The loop must reach length 0, otherwise empty strings are
+  # never assigned a slot and are left out of the returned count.
+  for (i = max; i >= 0; i--) {
+    if (i in buckets) {
+      t = buckets[i]
+      buckets[i] = count
+      count += t
+    }
+  }
+
+  # Second pass: drop each string into the next free slot for its length.
+  for (t in src) {
+    val = src[t]
+    dst[buckets[length(val)]++] = val
+  }
+
+  return count
+}
diff --git a/tests/scripts.at b/tests/scripts.at

index 29471d833482e7782ca18b69f01d857bf26874b7..4784b5994929953114c4c6c11e4cd7a3c97a95d6 100644 (file)
--- a/tests/scripts.at
+++ b/tests/scripts.at
@@ -212,3 +212,84 @@ p
  }' messages.po | LC_ALL=C sort], [0], [expout])
  
  AT_CLEANUP
+
+AT_SETUP([gen-strtab.awk])
+
+# Sample specification with single-line and multi-line definitions,
+# backslash escapes, line continuations, protected leading whitespace
+# and a comment line.
+AT_DATA([test.def],
+[[
+&a world
+&b
+hello world
+&c
+hello
+world
+&d world\n
+&e
+\\not a newline
+&f
+\not a newline
+&g inline
+continued
+&h    no\
+newline\
+&i
+\   leading whitespace
+&j oneline
+# with a comment
+]])
+
+# Generate the header under test from the specification.
+# NOTE(review): the script is looked up under $builddir although it is
+# added to the tree as a source file; confirm that this also resolves in
+# builds where the build and source directories differ.
+AT_CHECK([$AWK -f "$builddir/scripts/gen-strtab.awk" <test.def >test.h])
+
+# Collect the identifiers defined in test.def, one per line, in the
+# order in which they appear.
+sed -n 's/^[[&]]\([[^ ]]*\).*/\1/p' test.def >identifiers
+
+# test 0: sanity test
+# The program prints every string from the table, each followed by a
+# line of three dashes.  The fixed beginning of the source file is
+# written first.
+AT_DATA([test0.c],
+[[#include "test.h"
+#include <stdio.h>
+
+int main(void)
+{
+  printf("---\n");
+]])
+# Append one printf call per identifier and then the end of main.
+# Descriptor 3 reads the identifier list; descriptor 4 appends to test0.c.
+exec 3<identifiers 4>>test0.c
+while read ident <&3; do
+  AS_ECHO(['  printf("%s\n---\n", '"strtab+$ident);"]) >&4
+done
+AS_ECHO(['  return 0;']) >&4
+AS_ECHO(['}']) >&4
+exec 3<&- 4>&-
+
+# Compile and run the program.  Its standard output must match the
+# strings defined in test.def exactly; standard error is ignored.
+AT_CHECK([$CC -o test0$EXEEXT test0.c && ./test0$EXEEXT], [0], [---
+world
+---
+hello world
+
+---
+hello
+world
+
+---
+world
+
+---
+\not a newline
+
+---
+
+ot a newline
+
+---
+inline
+continued
+
+---
+nonewline
+---
+   leading whitespace
+
+---
+oneline
+---
+], [ignore])
+
+AT_CLEANUP
author	Nick Bowler <nbowler@draconx.ca>
	Tue, 2 Mar 2021 04:15:30 +0000 (23:15 -0500)
committer	Nick Bowler <nbowler@draconx.ca>
	Tue, 2 Mar 2021 04:16:30 +0000 (23:16 -0500)
scripts/gen-strtab.awk	[new file with mode: 0755]	patch \| blob
tests/scripts.at		patch \| blob \| history