#!/bin/awk -f
#
# Copyright © 2021, 2023-2024 Nick Bowler
#
# Generate a C string table based on an input string specification file.
#
# A string table is a single large char array containing all of
# the specified (0-terminated) strings, which is then offset to obtain
# the desired string. By storing these offsets instead of string pointers
# into read-only data structures, this can reduce the need for relocation
# processing at startup when programs are built in PIC mode.
#
# The string specification file is processed line by line. Comment
# lines may be included by beginning the line with a # character, which
# must be the very first character on the line. If a comment is encountered,
# processing immediately moves on to the next line and the result is as if
# the comment line were omitted from the input.
#
# Options may be used to alter the normal behaviour. An option is placed
# on a line by itself beginning with an @ character, and may appear anywhere
# in the input file. The following options are defined:
#
#   @nozero
#     All strings will have a non-zero offset in the strtab.
#
#   @macro
#     Instead of a variable declaration, the generated header will define an
#     object-like macro that can be used as the initializer for a char array.
#
# A string is defined by beginning a line with one or two & characters, which
# must be immediately followed by a C identifier. Two & characters indicates
# a string that should not be translated, as described below. A nonempty
# sequence of whitespace (with at most one newline) separates the identifier
# from the beginning of the string itself. This whitespace is never included
# in the output.
#
# The string is then interpreted as follows:
#
#   - Leading blanks on each line are ignored.
#   - The sequences \\, \a, \b, \t, \n, \v, \f and \r can be entered and
#     mean the same as they do in C string literals. The "\\" sequence
#     prevents any special interpretation of the second backslash.
#   - Newlines in the input are included in the output, except for the
#     case where the entire string (including its identifier) is on one line.
#   - If this is not desired, a newline which is immediately preceded by an
#     unescaped backslash will be deleted, along with the backslash.
#   - All other backslashes are deleted. This can be used to prevent special
#     handling of whitespace, # or & characters at the beginning of a line.
#
# Unless the @macro option is specified, the output defines a variable,
# strtab, which contains all of the strings, and each identifier in the input
# is declared as an enumeration constant whose value is the offset of the
# associated string within strtab. Otherwise, if the @macro option is
# specified, no variables are defined and the STRTAB_INITIALIZER object-like
# macro may be used to initialize a char array with static storage duration.
#
# Normally, the generated source code wraps strings using the identity macro
# N_(x), which has no effect on the resulting data structures but enables tools
# such as xgettext to extract translatable strings from the source code. An
# identifier preceded by two ampersands (&&) suppresses this output to allow
# a single string table to contain both translatable strings as well as
# ones that should not be translated.
#
# The object-like macro STRTAB_MAX_OFFSET is defined and expands to the
# greatest string offset, suitable for use in #if preprocessing directives.
#
# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
# This is free software: you are free to do what the fuck you want to.
# There is NO WARRANTY, to the extent permitted by law.

# Emit the banner comment.  END rules run in the order they appear in the
# program, so this one (being first) prints before the table itself.  It is
# an END rule rather than BEGIN because it reads FILENAME, which is not
# reliably set before input has been read.
END {
  print "/*"
  if (FILENAME) {
    print " * Automatically generated by gen-strtab.awk from " FILENAME
  } else {
    print " * Automatically generated by gen-strtab.awk"
  }
  print " * Do not edit."
  print " */"
}

BEGIN {
  # awk implementations disagree on whether "\\\\" (two backslashes) used as
  # a sub/gsub replacement yields one backslash or two.  Probe the behaviour
  # and set bs to the replacement text that yields exactly one backslash.
  bs = "x"; sub(/x/, "\\\\", bs);
  bs = (length(bs) == 1 ? "\\\\" : "\\");

  # Option defaults: offset zero is allowed, and a variable (not a macro)
  # is generated.
  opts["zero"] = 1
  opts["macro"] = 0

  # State of the string currently being collected (ident == "" means none).
  collected = ident = ""
  startline = endline = 0
  num_vars = 0
}

# Comments (and lines containing no fields at all) are skipped.
NF == 0 || $0 ~ /^[#]/ { next }

# Options: "@name" sets opts[name] and "@noname" / "@no_name" clears it.
# Hyphens in the name are treated as underscores.  An option line that does
# not consist of exactly one field is ignored.
sub(/^@/, "", $0) {
  if (NF == 1) {
    orig = $1
    gsub(/-/, "_", $1);
    val = !sub(/^no_?/, "", $1);
    if ($1 in opts) {
      opts[$1] = val;
    } else {
      # Note: exit still runs the END rules; the nonzero status is what
      # signals the failure to the caller.
      print "error: unrecognized option: @" orig | "cat 1>&2"
      exit 1
    }
  }
  next
}

# Start of a new string definition ("&ident" or "&&ident").
sub(/^[&]/, "") {
  # Flush the previous string first: finish_string_input reads current_l10n,
  # which must still describe the previous string at that point.
  if (ident != "") {
    finish_string_input(strings, ident, collected);
    vars[num_vars++] = ident;
  }

  # A second & marks the string as not to be translated.
  current_l10n = !sub(/^[&]/, "");
  startline = NR;
  ident = $1;

  # Drop the identifier; whatever remains on the line is handled by the
  # collection rule below.
  collected = "";
  sub(/^[^ \t]*/, "");
}

# Collect one line of string text (this also runs on the defining line).
ident != "" {
  sub(/^[ \t]*/, "");

  sep = collected != "" ? "\n" : "";
  collected = collected sep $0;
  endline = NR;
}

# Flush the final string, if any.
END {
  if (ident != "") {
    finish_string_input(strings, ident, collected)
    vars[num_vars++] = ident
  }
}

# Build and print the string table and the offset enumeration.
#
# While the table is being built, strtab holds the table strings joined by
# "\1" (standing for the 0 terminators), with each escaped backslash pair
# collapsed to the single character "\2" so that every remaining backslash
# introduces a two-character escape sequence.
END {
  strtab = cont = ""
  strtab_len = 0
  count = bucketsort(sorted_strings, strings)
  max = 0

  print "\n#define STR_L10N_(x)"
  print "#ifndef N_"
  print "# define N_(x) x"
  print "#endif"

  if (opts["macro"]) {
    cont = " \\";
    print "\n#define STRTAB_INITIALIZER" cont;
  } else {
    print "\nstatic const char strtab[] =";
  }

  # Strings are visited longest first, so any string which is a tail of
  # another one is seen after it and can share its storage.
  for (i = 0; i < count; i++) {
    s = sorted_strings[i]
    gsub(/\\\\/, "\2", s)
    if ((n = find_tail(strtab, s)) > 0) {
      offsets[sorted_strings[i]] = real_length(substr(strtab, 1, n-1));

      # The shared string is not printed as part of the table, so emit a
      # marker (expanding to nothing) for xgettext to pick up.  The empty
      # string is skipped: an empty msgid is reserved by gettext.
      if (sorted_strings[i] != "" && !(sorted_strings[i] in nol10n))
        print "\tSTR_L10N_(N_(\"" sorted_strings[i] "\"))" cont;
    } else if (strtab != "") {
      strtab = strtab "\1" s
      offsets[sorted_strings[i]] = strtab_len + 1
      strtab_len += real_length(s) + 1
    } else {
      strtab = s
      offsets[sorted_strings[i]] = 0
      strtab_len += real_length(s)
    }
  }

  gsub("\2", bs bs, strtab);
  n = split(strtab, split_strtab, "\1");

  # With @nozero every offset is shifted by one.  Normally the "\0" printed
  # before the first string accounts for that, but if all strings are empty
  # then no strings are printed at all, so emit the padding here to keep
  # offset 1 inside the array.
  if (n == 0 && num_vars > 0 && !opts["zero"])
    print "\t\"\\0\"" cont

  for (i = 1; i <= n; i++) {
    # Every string but the first is preceded by an explicit terminator for
    # the previous one; with @nozero the first string gets one as well.
    printf("\t%4s ", i > !!opts["zero"] ? "\"\\0\"" : "");

    if (split_strtab[i] in nol10n) {
      print "\"" split_strtab[i] "\"" cont;
    } else {
      print "N_(\"" split_strtab[i] "\")" cont;
    }
  }

  # The trailing semicolon is omitted in macro mode.
  print "\t\"\"" substr(";", 1, !opts["macro"]);

  print "enum {"
  for (i = 0; i < num_vars; i++) {
    sep = (i+1) != num_vars ? "," : ""
    s = vars[i]
    o = offsets[strings[s]] + (!opts["zero"])
    print "\t" s " = " o sep
    if (o > max) { max = o }
  }
  print "};"
  print "\n#define STRTAB_MAX_OFFSET " max
}

# finish_string_input(strings, ident, val)
#
# Deal with backslash-escapes and special characters in val, then set
# strings[ident] = val.  The stored value is the text to be placed between
# the quotes of a C string literal.
#
# Reads the globals startline, endline and current_l10n; if current_l10n is
# false, the resulting string is also recorded in nol10n.
function finish_string_input(strings, ident, val, n, tmpval) {
  # Hide escaped backslashes so they do not take part in the processing below.
  gsub(/\\\\/, "\2", val);

  # Multi-line strings end with a newline, which (like any other newline)
  # can be suppressed by a preceding backslash.
  if (endline > startline)
    val = val "\n";
  gsub(/\\\n/, "", val);

  # Delete every backslash which does not introduce a recognized escape
  # sequence, keeping the character that follows it.
  tmpval = ""
  while ((n = match(val, /\\[^abtnvfr]/)) > 0) {
    tmpval = tmpval substr(val, 1, n-1);
    val = substr(val, n+1);
  }
  tmpval = tmpval val;

  # Escape special characters
  gsub(/"/, bs"\"", tmpval);
  gsub(/\t/, bs"t", tmpval);
  gsub(/\n/, bs"n", tmpval);
  gsub("\2", bs bs, tmpval);

  strings[ident] = tmpval;
  if (!current_l10n) {
    nol10n[tmpval] = 1;
  }
}

# real_length(s)
#
# Returns the number of characters that s stands for, counting each
# backslash escape sequence (a backslash plus the following character)
# as a single character.
function real_length(s, t) {
  t = length(s)
  # The "&" replacement leaves s unchanged; only the match count is wanted.
  return t - gsub(/\\./, "&", s)
}

# find_tail(tab, s)
#
# Look for s as the tail of one of the "\1"-separated strings in tab.
# Both must have escaped backslash pairs collapsed to "\2", so that any
# backslash remaining in tab introduces an escape sequence.
#
# A candidate match that starts immediately after a backslash begins in the
# middle of an escape sequence (for example "n" against a table entry ending
# in "\n") and does not denote the same characters, so it is rejected and
# the search continues further along.
#
# Returns the 1-based position of the match within tab, or 0 if none.
function find_tail(tab, s, pos, n) {
  tab = tab "\1"
  s = s "\1"
  pos = 0

  while ((n = index(substr(tab, pos + 1), s)) > 0) {
    pos += n
    if (pos == 1 || substr(tab, pos - 1, 1) != "\\")
      return pos
  }

  return 0
}

# bucketsort(dst, src)
#
# Sort the elements of src by descending string length,
# placing them into dst[0] ... dst[n-1].
#
# Returns the number of elements, n.
function bucketsort(dst, src, max, count, i, t) {
  # Note: ULTRIX 4.5 nawk does not support local array parameters
  split("", bucketsort_buckets);

  # Explicitly numeric, so that i below is always a valid bucket subscript
  # even when src has no nonempty strings.
  max = 0
  count = 0

  # Count how many strings there are of each length.
  for (t in src) {
    i = length(src[t])
    if (i > max) { max = i }
    bucketsort_buckets[i]++
  }

  # Replace each count with the index of the first slot for that length.
  # Length 0 is included so that empty strings are sorted (last) as well.
  for (i = max; i >= 0; i--) {
    if (i in bucketsort_buckets) {
      t = bucketsort_buckets[i]
      bucketsort_buckets[i] = count
      count += t
    }
  }

  # Drop each string into the next free slot for its length.
  for (t in src) {
    i = length(t = src[t])
    dst[bucketsort_buckets[i]++] = t
  }

  return count
}