#!/bin/awk -f
#
# Copyright © 2021, 2023-2024 Nick Bowler
#
# Generate a C string table based on an input string specification file.
#
# A string table is a single large char array containing all of
# the specified (0-terminated) strings, which is then offset to obtain
# the desired string.  By storing these offsets instead of string pointers
# into read-only data structures, this can reduce the need for relocation
# processing at startup when programs are built in PIC mode.
#
# The string specification file is processed line by line.  Comment
# lines may be included by beginning the line with a # character, which
# must be the very first character on the line.  If a comment is encountered,
# processing immediately moves on to the next line and the result is as if
# the comment line were omitted from the input.
#
# Options may be used to alter the normal behaviour.  An option is placed
# on a line by itself beginning with an @ character, and may appear anywhere
# in the input file.  The following options are defined:
#
#   @nozero
#     All strings will have a non-zero offset in the strtab.
#
#   @macro
#     Instead of a variable declaration, the generated header will define an
#     object-like macro that can be used as the initializer for a char array.
#
# A string is defined by beginning a line with one or two & characters, which
# must be immediately followed by a C identifier.  Two & characters indicates
# a string that should not be translated, as described below.  A nonempty
# sequence of whitespace (with at most one newline) separates the identifier
# from the beginning of the string itself.  This whitespace is never included
# in the output.
#
# The string is then interpreted as follows:
#
#   - Leading blanks on each line are ignored.
#   - The sequences \\, \a, \b, \t, \n, \v, \f and \r can be entered and
#     mean the same as they do in C string literals.  The "\\" sequence
#     prevents any special interpretation of the second backslash.
#   - Newlines in the input are included in the output, except for the
#     case where the entire string (including its identifier) is on one line.
#   - If this is not desired, a newline which is immediately preceded by an
#     unescaped backslash will be deleted, along with the backslash.
#   - All other backslashes are deleted.  This can be used to prevent special
#     handling of whitespace, # or & characters at the beginning of a line.
#
# Unless the @macro option is specified, the output defines a variable,
# strtab, which contains all of the strings, and each identifier in the input
# is declared as an enumeration constant whose value is the offset of the
# associated string within strtab.  Otherwise, if the @macro option is
# specified, no variables are defined and the STRTAB_INITIALIZER object-like
# macro may be used to initialize a char array with static storage duration.
#
# Normally, the generated source code wraps strings using the identity macro
# N_(x), which has no effect on the resulting data structures but enables tools
# such as xgettext to extract translatable strings from the source code.  An
# identifier preceded by two ampersands (&&) suppresses this output to allow
# a single string table to contain both translatable strings and ones that
# should not be translated.
#
# The object-like macro STRTAB_MAX_OFFSET is defined and expands to the
# greatest string offset, suitable for use in #if preprocessing directives.
#
# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
# This is free software: you are free to do what the fuck you want to.
# There is NO WARRANTY, to the extent permitted by law.

# Print the "automatically generated" banner comment.  This is an END rule
# rather than a BEGIN rule so that FILENAME can be consulted after the
# input has been read.
END {
  banner = " * Automatically generated by gen-strtab.awk"
  if (FILENAME)
    banner = " * Automatically generated by gen-strtab.awk from " FILENAME

  print "/*"
  print banner
  print " * Do not edit."
  print " */"
}

BEGIN {
  # Parser state: nothing collected yet, no string in progress.
  num_vars = 0
  startline = 0
  endline = 0
  ident = ""
  collected = ""

  # Option defaults; changed by @option lines in the input.
  opts["macro"] = 0
  opts["zero"] = 1

  # Check if "\\\\" in substitutions gives just one backslash, and set bs
  # to whichever replacement text yields a single backslash on this awk.
  bs = "x"
  sub(/x/, "\\\\", bs)
  if (length(bs) == 1) {
    bs = "\\\\"
  } else {
    bs = "\\"
  }
}

# Blank lines (no fields at all) and comment lines are skipped entirely.
NF == 0 { next }
/^[#]/ { next }

# Options: a line starting with @.  The @ is stripped, dashes are treated
# as underscores and a leading "no" (optionally followed by an underscore)
# turns the option off instead of on.  Option lines that do not consist of
# exactly one field are skipped without any diagnostic.
sub(/^@/, "", $0) {
  if (NF != 1)
    next

  orig = $1
  gsub(/-/, "_", $1)

  val = 1
  if (sub(/^no_?/, "", $1))
    val = 0

  if (!($1 in opts)) {
    print "error: unrecognized option: @" orig | "cat 1>&2"
    exit 1
  }

  opts[$1] = val
  next
}

# Start of a new string definition: a line beginning with & or &&.
sub(/^[&]/, "") {
  # Flush the string that was being collected, if any.  This must happen
  # before current_l10n is updated, as it still describes the old string.
  if (ident != "") {
    finish_string_input(strings, ident, collected)
    vars[num_vars] = ident
    num_vars++
  }

  # A second & marks the string as not translatable.
  if (sub(/^[&]/, "")) {
    current_l10n = 0
  } else {
    current_l10n = 1
  }

  ident = $1
  startline = NR
  collected = ""

  # Drop the identifier; whatever remains on the line is string content
  # and is picked up by the collection rule below.
  sub(/^[^ \t]*/, "")
}

# While a string is being defined, append the current line (minus leading
# blanks) to the collected text.  Lines are joined with a newline, except
# that nothing is inserted while the collected text is still empty.
ident != "" {
  sub(/^[ \t]*/, "")

  if (collected == "") {
    sep = ""
  } else {
    sep = "\n"
  }

  collected = collected sep $0
  endline = NR
}

# At end of input, record the identifier of the string still being
# collected (if there is one) and store its final text.
END {
  if (ident != "") {
    vars[num_vars++] = ident
    finish_string_input(strings, ident, collected)
  }
}

# Emit the generated C source: the helper macros, the string table (as the
# strtab variable or, with @macro, as the STRTAB_INITIALIZER macro), the
# enumeration giving each identifier its offset, and STRTAB_MAX_OFFSET.
#
# Strings are visited in the order produced by bucketsort (longest first)
# so that a string which is a suffix of an earlier one can share storage
# with it.  While the table is assembled, \1 separates the strings (each
# separator stands for one "\0" in the output) and \2 stands for an escaped
# backslash, so that every backslash left in the text begins a two-character
# escape sequence.
END {
  strtab = cont = ""
  strtab_len = 0
  count = bucketsort(sorted_strings, strings)
  max = 0

  print "\n#define STR_L10N_(x)"
  print "#ifndef N_"
  print "#  define N_(x) x"
  print "#endif"
  if (opts["macro"]) {
    cont = " \\";
    print "\n#define STRTAB_INITIALIZER" cont;
  } else {
    print "\nstatic const char strtab[] =";
  }

  for (i = 0; i < count; i++) {
    s = sorted_strings[i]
    gsub(/\\\\/, "\2", s)

    # Look for s as the tail of a string already in the table.  A candidate
    # directly preceded by a backslash begins in the middle of an escape
    # sequence (for example "n" matching the end of "\n"); it does not
    # correspond to the same characters in the C string and is skipped.
    n = 0
    hay = strtab "\1"
    skipped = 0
    while ((p = index(hay, s "\1")) > 0) {
      if (skipped + p == 1 || substr(strtab, skipped + p - 1, 1) != "\\") {
        n = skipped + p
        break
      }
      skipped += p
      hay = substr(hay, p + 1)
    }

    if (n > 0) {
      # Shared storage: the offset is the C length of everything before it.
      offsets[sorted_strings[i]] = real_length(substr(strtab, 1, n-1));
      # The string is not emitted on its own, so mention it inside
      # STR_L10N_ (which expands to nothing) to keep it visible to
      # translation tools.
      if (!(sorted_strings[i] in nol10n))
        print "\tSTR_L10N_(N_(\"" sorted_strings[i] "\"))" cont;
    } else if (strtab) {
      strtab = strtab "\1" s
      offsets[sorted_strings[i]] = strtab_len + 1
      strtab_len += real_length(s) + 1
    } else {
      strtab = s
      offsets[sorted_strings[i]] = 0
      strtab_len += real_length(s)
    }
  }

  # Restore escaped backslashes and print one line per table entry.  Every
  # entry but the first is preceded by "\0"; with @nozero the first one is
  # too, which shifts all offsets by one.
  gsub("\2", bs bs, strtab);
  n = split(strtab, split_strtab, "\1");
  for (i = 1; i <= n; i++) {
    printf("\t%4s ", i > !!opts["zero"] ? "\"\\0\"" : "");

    if (split_strtab[i] in nol10n) {
      print "\"" split_strtab[i] "\"" cont;
    } else {
      print "N_(\"" split_strtab[i] "\")" cont;
    }
  }
  # The closing semicolon is only wanted for the variable definition.
  print "\t\"\"" substr(";", 1, !opts["macro"]);

  print "enum {"
  for (i = 0; i < num_vars; i++) {
    sep = (i+1) != num_vars ? "," : ""
    s = vars[i]
    o = offsets[strings[s]] + (!opts["zero"])
    print "\t" s " = " o sep
    if (o > max) {
      max = o
    }
  }
  print "};"
  print "\n#define STRTAB_MAX_OFFSET " max
}

# finish_string_input(strings, ident, val)
#
# Process the backslash escapes in val, convert the result into the body
# of a C string literal and store it as strings[ident].
#
# Besides its arguments, this reads the globals startline, endline,
# current_l10n and bs, and records untranslated strings in the global
# nol10n array.
function finish_string_input(strings, ident, val, n, tmpval)
{
  # Hide escaped backslashes so they are not mistaken for escape prefixes.
  gsub(/\\\\/, "\2", val);

  # A string spanning several input lines keeps its final newline.
  if (endline > startline) {
    val = val "\n";
  }

  # Backslash-newline joins lines.
  gsub(/\\\n/, "", val);

  # Delete every backslash which does not introduce one of the supported
  # escapes, keeping the character that follows it.
  tmpval = "";
  n = match(val, /\\[^abtnvfr]/);
  while (n > 0) {
    tmpval = tmpval substr(val, 1, n-1);
    val = substr(val, n+1);
    n = match(val, /\\[^abtnvfr]/);
  }
  tmpval = tmpval val;

  # Escape special characters
  gsub(/"/,  bs"\"", tmpval);
  gsub(/\t/, bs"t", tmpval);
  gsub(/\n/, bs"n", tmpval);
  gsub("\2", bs bs, tmpval);

  strings[ident] = tmpval;
  if (current_l10n) {
    return;
  }
  nol10n[tmpval] = 1;
}

# real_length(s)
#
# Returns the length of s with every two-character backslash sequence
# counted as a single character.
function real_length(s, t)
{
  # Replacing each match with itself leaves s unchanged; only the
  # number of matches is of interest.
  t = gsub(/\\./, "&", s)
  return length(s) - t
}

# bucketsort(dst, src)
#
# Sort the elements of src by descending string length,
# placing them into dst[0] ... dst[n-1], where n is the number of
# elements in src.  Elements of equal length (including empty strings,
# which sort last) appear in unspecified relative order.
#
# Returns the number of elements.
function bucketsort(dst, src, max, count, i, t)
{
  # Note: ULTRIX 4.5 nawk does not support local array parameters
  split("", bucketsort_buckets);

  max = 0
  count = 0

  # Count how many strings there are of each length.
  for (t in src) {
    i = length(src[t])
    if (i > max) { max = i }
    bucketsort_buckets[i]++
  }

  # Turn the counts into the first dst index for each length, longest
  # first.  Length 0 must be included, otherwise empty strings are not
  # counted and end up at a bogus index.
  for (i = max; i >= 0; i--) {
    if (i in bucketsort_buckets) {
      t = bucketsort_buckets[i]
      bucketsort_buckets[i] = count
      count += t
    }
  }

  # Drop each string into the next free slot of its bucket.
  for (t in src) {
    i = length(src[t])
    dst[bucketsort_buckets[i]++] = src[t]
  }

  return count
}