#!/bin/awk -f
#
-# Copyright © 2021 Nick Bowler
+# Copyright © 2021, 2023-2024 Nick Bowler
#
# Generate a C string table based on an input string specification file.
#
# processing immediately moves on to the next line and the result is as if
# the comment line were omitted from the input.
#
-# A string is defined by beginning a line with an & character, which must
-# be immediately followed by a C identifier. A nonempty sequence of
-# whitespace (with at most one newline) separates the identifier from the
-# beginning of the string itself. This whitespace is never included in the
-# output.
+# Options may be used to alter the normal behaviour. An option is placed
+# on a line by itself beginning with an @ character, and may appear anywhere
+# in the input file. The following options are defined:
+#
+# @nozero
+# All strings will have a non-zero offset in the strtab.
+#
+# @macro
+# Instead of a variable declaration, the generated header will define an
+# object-like macro that can be used as the initializer for a char array.
+#
+# A string is defined by beginning a line with one or two & characters, which
+# must be immediately followed by a C identifier. Two & characters indicate
+# a string that should not be translated, as described below. A nonempty
+# sequence of whitespace (with at most one newline) separates the identifier
+# from the beginning of the string itself. This whitespace is never included
+# in the output.
#
# The string is then interpreted as follows:
#
# - All other backslashes are deleted. This can be used to prevent special
# handling of whitespace, # or & characters at the beginning of a line.
#
-# The output defines a variable, strtab, which contains all of the strings,
-# and each identifier in the input is declared as an emumeration constant
-# whose value is the offset of the associated string within strtab.
+# Unless the @macro option is specified, the output defines a variable,
+# strtab, which contains all of the strings, and each identifier in the input
+# is declared as an enumeration constant whose value is the offset of the
+# associated string within strtab. Otherwise, if the @macro option is
+# specified, no variables are defined and the STRTAB_INITIALIZER object-like
+# macro may be used to initialize a char array with static storage duration.
+#
+# Normally, the generated source code wraps strings using the identity macro
+# N_(x), which has no effect on the resulting data structures but enables tools
+# such as xgettext to extract translatable strings from the source code. An
+# identifier preceded by two ampersands (&&) suppresses this wrapping, allowing
+# a single string table to contain both translatable strings and ones that
+# should not be translated.
#
# The object-like macro STRTAB_MAX_OFFSET is defined and expands to the
# greatest string offset, suitable for use in #if preprocessing directives.
}
BEGIN {
+ # Check if "\\\\" in substitutions gives just one backslash.
+ bs = "x"; sub(/x/, "\\\\", bs);
+ bs = (length(bs) == 1 ? "\\\\" : "\\");
+
+ opts["zero"] = 1
+ opts["macro"] = 0
collected = ident = ""
startline = endline = 0
num_vars = 0
}
-$0 ~ /^[#]/ { next }
+# Blank lines and comments
+NF == 0 || $0 ~ /^[#]/ { next }
-$0 ~ /^[&]/ {
- if (ident) {
- finish_string_input(strings, ident, collected)
- vars[num_vars++] = ident
+# Options
+sub(/^@/, "", $0) {
+ if (NF == 1) {
+ orig=$1
+ gsub(/-/, "_", $1);
+ val = !sub(/^no_?/, "", $1);
+ if ($1 in opts) {
+ opts[$1] = val;
+ } else {
+ print "error: unrecognized option: @" orig | "cat 1>&2"
+ exit 1
+ }
+ }
+ next
+}
+
+sub(/^[&]/, "") {
+ if (ident != "") {
+ finish_string_input(strings, ident, collected);
+ vars[num_vars++] = ident;
}
- sub(/^[&]/, "", $1)
- startline = NR
- ident = $1
+ current_l10n = !sub(/^[&]/, "");
+ startline = NR;
+ ident = $1;
- $1 = ""
- collected = ""
+ collected = "";
+ sub(/^[^ \t]*/, "");
}
-ident {
- sub(/^[ \t]*/, "")
- if (collected) {
- collected = collected "\n" $0
- } else {
- collected = $0
- }
+ident != "" {
+ sub(/^[ \t]*/, "");
- endline = NR
+ sep = collected != "" ? "\n" : "";
+ collected = collected sep $0;
+ endline = NR;
}
END {
- if (ident) {
+ if (ident != "") {
finish_string_input(strings, ident, collected)
vars[num_vars++] = ident
}
}
END {
- strtab = ""
+ strtab = cont = ""
strtab_len = 0
count = bucketsort(sorted_strings, strings)
max = 0
print "#ifndef N_"
print "# define N_(x) x"
print "#endif"
- print "\nstatic const char strtab[] ="
+ if (opts["macro"]) {
+ cont = " \\";
+ print "\n#define STRTAB_INITIALIZER" cont;
+ } else {
+ print "\nstatic const char strtab[] =";
+ }
for (i = 0; i < count; i++) {
s = sorted_strings[i]
gsub(/\\\\/, "\2", s)
if ((n = index(strtab "\1", s "\1")) > 0) {
- offsets[sorted_strings[i]] = real_length(substr(strtab, 1, n-1))
- print "\tSTR_L10N_(N_(\"" sorted_strings[i] "\"))"
+ offsets[sorted_strings[i]] = real_length(substr(strtab, 1, n-1));
+ if (!(sorted_strings[i] in nol10n))
+ print "\tSTR_L10N_(N_(\"" sorted_strings[i] "\"))" cont;
} else if (strtab) {
strtab = strtab "\1" s
offsets[sorted_strings[i]] = strtab_len + 1
}
}
- gsub(/\2/, "\\\\", strtab)
- gsub(/\1/, "\")\"\\0\"\n\tN_(\"", strtab)
- print "\tN_(\"" strtab "\")"
- print "\t\"\";"
+ gsub("\2", bs bs, strtab);
+ n = split(strtab, split_strtab, "\1");
+ for (i = 1; i <= n; i++) {
+ printf("\t%4s ", i > !!opts["zero"] ? "\"\\0\"" : "");
+
+ if (split_strtab[i] in nol10n) {
+ print "\"" split_strtab[i] "\"" cont;
+ } else {
+ print "N_(\"" split_strtab[i] "\")" cont;
+ }
+ }
+ print "\t\"\"" substr(";", 1, !opts["macro"]);
print "enum {"
for (i = 0; i < num_vars; i++) {
sep = (i+1) != num_vars ? "," : ""
s = vars[i]
- o = offsets[strings[s]]
+ o = offsets[strings[s]] + (!opts["zero"])
print "\t" s " = " o sep
if (o > max) {
max = o
print "\n#define STRTAB_MAX_OFFSET " max
}
-# finish_input_string(strings, ident, val)
+# finish_string_input(strings, ident, val)
#
# Deal with backslash-escapes and special characters in val, then set
# strings[ident] = val.
function finish_string_input(strings, ident, val, n, tmpval)
{
- gsub(/\\\\/, "\1", val)
- val = val (endline > startline ? "\n" : "")
- gsub(/\\\n/, "", val)
+ gsub(/\\\\/, "\2", val);
+ if (endline > startline)
+ val = val "\n";
+ gsub(/\\\n/, "", val);
tmpval = ""
while ((n = match(val, /\\[^abtnvfr]/)) > 0) {
- tmpval = tmpval substr(val, 1, n-1)
- val = substr(val, n+1)
+ tmpval = tmpval substr(val, 1, n-1);
+ val = substr(val, n+1);
}
- tmpval = tmpval val
+ tmpval = tmpval val;
# Escape special characters
- gsub(/"/, "\\\"", tmpval)
- gsub(/\t/, "\\t", tmpval)
- gsub(/\n/, "\\n", tmpval)
- gsub(/\1/, "\\\\", tmpval)
+ gsub(/"/, bs"\"", tmpval);
+ gsub(/\t/, bs"t", tmpval);
+ gsub(/\n/, bs"n", tmpval);
+ gsub("\2", bs bs, tmpval);
- strings[ident] = tmpval
+ strings[ident] = tmpval;
+ if (!current_l10n) {
+ nol10n[tmpval] = 1;
+ }
}
function real_length(s, t)
# placing them into dst[0] ... dst[n].
#
# Returns the number of elements.
-function bucketsort(dst, src, buckets, max, count, i, t)
+function bucketsort(dst, src, max, count, i, t)
{
+ # Note: ULTRIX 4.5 nawk does not support local array parameters
+ split("", bucketsort_buckets);
+
for (t in src) {
i = length(src[t])
if (i > max) { max = i }
- buckets[i]++
+ bucketsort_buckets[i]++
}
for (i = max; i > 0; i--) {
- if (i in buckets) {
- t = buckets[i]
- buckets[i] = count
+ if (i in bucketsort_buckets) {
+ t = bucketsort_buckets[i]
+ bucketsort_buckets[i] = count
count += t
}
}
for (t in src) {
i = length(t = src[t])
- dst[buckets[i]++] = t
+ dst[bucketsort_buckets[i]++] = t
}
return count