scripts/gen-strtab.awk

   1 #!/bin/awk -f
   2 #
   3 # Copyright © 2021, 2023 Nick Bowler
   4 #
   5 # Generate a C string table based on an input string specification file.
   6 #
   7 # A string table is a single large char array containing all of
   8 # the specified (0-terminated) strings, which is then offset to obtain
   9 # the desired string.  By storing these offsets instead of string pointers
  10 # into read-only data structures, this can reduce the need for relocation
  11 # processing at startup when programs are built in PIC mode.
  12 #
  13 # The string specification file is processed line by line.  Comment
  14 # lines may be included by beginning the line with a # character, which
  15 # must be the very first character on the line.  If a comment is encountered,
  16 # processing immediately moves on to the next line and the result is as if
  17 # the comment line were omitted from the input.
  18 #
  19 # Options may be used to alter the normal behaviour.  An option is placed
  20 # on a line by itself beginning with an @ character, and may appear anywhere
  21 # in the input file.  The following options are defined:
  22 #
  23 #   @nozero
  24 #     All strings will have a non-zero offset in the strtab.
  25 #
  26 # A string is defined by beginning a line with one or two & characters, which
  27 # must be immediately followed by a C identifier.  Two & characters indicate
  28 # a string that should not be translated, as described below.  A nonempty
  29 # sequence of whitespace (with at most one newline) separates the identifier
  30 # from the beginning of the string itself.  This whitespace is never included
  31 # in the output.
  32 #
  33 # The string is then interpreted as follows:
  34 #
  35 #   - Leading blanks on each line are ignored.
  36 #   - The sequences \\, \a, \b, \t, \n, \v, \f and \r can be entered and
  37 #     mean the same as they do in C string literals.  The "\\" sequence
  38 #     prevents any special interpretation of the second backslash.
  39 #   - Newlines in the input are included in the output, except in the
  40 #     case where the entire string (including its identifier) is on one line.
  41 #   - If this is not desired, a newline which is immediately preceded by an
  42 #     unescaped backslash will be deleted, along with the backslash.
  43 #   - All other backslashes are deleted.  This can be used to prevent special
  44 #     handling of whitespace, # or & characters at the beginning of a line.
  45 #
  46 # The output defines a variable, strtab, which contains all of the strings,
  47 # and each identifier in the input is declared as an enumeration constant
  48 # whose value is the offset of the associated string within strtab.
  49 #
  50 # Normally, the generated source code wraps strings using the identity macro
  51 # N_(x), which has no effect on the resulting data structures but enables tools
  52 # such as xgettext to extract translatable strings from the source code.  An
  53 # identifier preceded by two ampersands (&&) suppresses this output to allow
  54 # a single string table to contain both translatable strings and
  55 # ones that should not be translated.
  56 #
  57 # The object-like macro STRTAB_MAX_OFFSET is defined and expands to the
  58 # greatest string offset, suitable for use in #if preprocessing directives.
  59 #
  60 # License WTFPL2: Do What The Fuck You Want To Public License, version 2.
  61 # This is free software: you are free to do what the fuck you want to.
  62 # There is NO WARRANTY, to the extent permitted by law.
  63
  64 END {
  65   print "/*"
  66   if (FILENAME) {
  67     print " * Automatically generated by gen-strtab.awk from " FILENAME
  68   } else {
  69     print " * Automatically generated by gen-strtab.awk"
  70   }
  71   print " * Do not edit."
  72   print " */"
  73 }
  74
  75 BEGIN {
  76   opts["zero"] = 1
  77   collected = ident = ""
  78   startline = endline = 0
  79   num_vars = 0
  80 }
  81
  82 # Comments
  83 NF == 0 || $0 ~ /^[#]/ { next }
  84
  85 # Options
  86 sub(/^@/, "", $0) {
  87   if (NF == 1) {
  88     orig=$1
  89     gsub(/-/, "_", $1);
  90     val = !sub(/^no_?/, "", $1);
  91     if ($1 in opts) {
  92       opts[$1] = val;
  93     } else {
  94       print "error: unrecognized option: @" orig | "cat 1>&2"
  95       exit 1
  96     }
  97   }
  98   next
  99 }
 100
 101 sub(/^[&]/, "") {
 102   if (ident) {
 103     finish_string_input(strings, ident, collected)
 104     vars[num_vars++] = ident
 105   }
 106
 107   current_l10n = !sub(/^[&]/, "", $1);
 108   startline = NR
 109   ident = $1
 110
 111   $1 = ""
 112   collected = ""
 113 }
 114
 115 ident {
 116   sub(/^[ \t]*/, "")
 117   if (collected) {
 118     collected = collected "\n" $0
 119   } else {
 120     collected = $0
 121   }
 122
 123   endline = NR
 124 }
 125
 126 END {
 127   if (ident) {
 128     finish_string_input(strings, ident, collected)
 129     vars[num_vars++] = ident
 130   }
 131 }
 132
 133 END {
 134   strtab = ""
 135   strtab_len = 0
 136   count = bucketsort(sorted_strings, strings)
 137   max = 0
 138
 139   print "\n#define STR_L10N_(x)"
 140   print "#ifndef N_"
 141   print "#  define N_(x) x"
 142   print "#endif"
 143   print "\nstatic const char strtab[] ="
 144
 145   for (i = 0; i < count; i++) {
 146     s = sorted_strings[i]
 147     gsub(/\\\\/, "\2", s)
 148     if ((n = index(strtab "\1", s "\1")) > 0) {
 149       offsets[sorted_strings[i]] = real_length(substr(strtab, 1, n-1));
 150       if (!(sorted_strings[i] in nol10n))
 151         print "\tSTR_L10N_(N_(\"" sorted_strings[i] "\"))";
 152     } else if (strtab) {
 153       strtab = strtab "\1" s
 154       offsets[sorted_strings[i]] = strtab_len + 1
 155       strtab_len += real_length(s) + 1
 156     } else {
 157       strtab = s
 158       offsets[sorted_strings[i]] = 0
 159       strtab_len += real_length(s)
 160     }
 161   }
 162
 163   gsub(/\2/, "\\\\", strtab);
 164   n = split(strtab, split_strtab, "\1");
 165   for (i = 1; i <= n; i++) {
 166     printf("\t%4s ", i > !!opts["zero"] ? "\"\\0\"" : "");
 167
 168     if (split_strtab[i] in nol10n) {
 169       print "\"" split_strtab[i] "\"";
 170     } else {
 171       print "N_(\"" split_strtab[i] "\")";
 172     }
 173   }
 174   print "\t\"\";";
 175
 176   print "enum {"
 177   for (i = 0; i < num_vars; i++) {
 178     sep = (i+1) != num_vars ? "," : ""
 179     s = vars[i]
 180     o = offsets[strings[s]] + !opts["zero"]
 181     print "\t" s " = " o sep
 182     if (o > max) {
 183       max = o
 184     }
 185   }
 186   print "};"
 187   print "\n#define STRTAB_MAX_OFFSET " max
 188 }
 189
 190 # finish_input_string(strings, ident, val)
 191 #
 192 # Deal with backslash-escapes and special characters in val, then set
 193 # strings[ident] = val.
 194 function finish_string_input(strings, ident, val, n, tmpval)
 195 {
 196   gsub(/\\\\/, "\1", val)
 197   val = val (endline > startline ? "\n" : "")
 198   gsub(/\\\n/, "", val)
 199
 200   tmpval = ""
 201   while ((n = match(val, /\\[^abtnvfr]/)) > 0) {
 202     tmpval = tmpval substr(val, 1, n-1)
 203     val = substr(val, n+1)
 204   }
 205   tmpval = tmpval val
 206
 207   # Escape special characters
 208   gsub(/"/, "\\\"", tmpval)
 209   gsub(/\t/, "\\t", tmpval)
 210   gsub(/\n/, "\\n", tmpval)
 211   gsub(/\1/, "\\\\", tmpval)
 212
 213   strings[ident] = tmpval
 214   if (!current_l10n) {
 215     nol10n[tmpval] = 1;
 216   }
 217 }
 218
 219 function real_length(s, t)
 220 {
 221   t = length(s)
 222   return t - gsub(/\\./, "&", s)
 223 }
 224
 225 # bucketsort(dst, src)
 226 #
 227 # Sort the elements of src by descending string length,
 228 # placing them into dst[0] ... dst[n].
 229 #
 230 # Returns the number of elements.
 231 function bucketsort(dst, src, buckets, max, count, i, t)
 232 {
 233   for (t in src) {
 234     i = length(src[t])
 235     if (i > max) { max = i }
 236     buckets[i]++
 237   }
 238
 239   for (i = max; i > 0; i--) {
 240     if (i in buckets) {
 241       t = buckets[i]
 242       buckets[i] = count
 243       count += t
 244     }
 245   }
 246
 247   for (t in src) {
 248     i = length(t = src[t])
 249     dst[buckets[i]++] = t
 250   }
 251
 252   return count
 253 }