#!/bin/awk -f
#
# Copyright © 2021 Nick Bowler
#
# Generate a C string table based on an input string specification file.
#
# A string table is a single large char array containing all of
# the specified (0-terminated) strings, which is then offset to obtain
# the desired string.  By storing these offsets instead of string pointers
# into read-only data structures, this can reduce the need for relocation
# processing at startup when programs are built in PIC mode.
#
# The string specification file is processed line by line.  Comment
# lines may be included by beginning the line with a # character, which
# must be the very first character on the line.  If a comment is encountered,
# processing immediately moves on to the next line and the result is as if
# the comment line were omitted from the input.
#
# A string is defined by beginning a line with an & character, which must
# be immediately followed by a C identifier.  A nonempty sequence of
# whitespace (with at most one newline) separates the identifier from the
# beginning of the string itself.  This whitespace is never included in the
# output.
#
# The string is then interpreted as follows:
#
#   - Leading blanks on each line are ignored.
#   - The sequences \\, \a, \b, \t, \n, \v, \f and \r can be entered and
#     mean the same as they do in C string literals.  The "\\" sequence
#     prevents any special interpretation of the second backslash.
#   - Newlines in the input are included in the output, except for the
#     case where the entire string (including its identifier) is on one line.
#   - If this is not desired, a newline which is immediately preceded by an
#     unescaped backslash will be deleted, along with the backslash.
#   - All other backslashes are deleted.  This can be used to prevent special
#     handling of whitespace, # or & characters at the beginning of a line.
#
# The output defines a variable, strtab, which contains all of the strings,
# and each identifier in the input is declared as an enumeration constant
# whose value is the offset of the associated string within strtab.
#
# The object-like macro STRTAB_MAX_OFFSET is defined and expands to the
# greatest string offset, suitable for use in #if preprocessing directives.
#
# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
# This is free software: you are free to do what the fuck you want to.
# There is NO WARRANTY, to the extent permitted by law.

# Emit the "generated file" banner comment.  This is the first END rule in
# the file, so the banner precedes everything the later END rules print.
END {
  print "/*"
  # Name the input file in the banner only when awk recorded one.
  print " * Automatically generated by gen-strtab.awk" \
        (FILENAME ? " from " FILENAME : "")
  print " * Do not edit."
  print " */"
}

# Initialise the parser state shared by the rules below.
BEGIN {
  # Identifier and accumulated text of the string currently being read.
  ident = ""
  collected = ""

  # Input line numbers of the first and last line of the current string.
  endline = 0
  startline = 0

  # Number of identifiers recorded in vars[].
  num_vars = 0
}

# Comment lines (a # in the very first column) are skipped entirely.
/^#/ { next }

# A line beginning with & starts a new string definition: "&identifier text".
$0 ~ /^[&]/ {
  # Flush the definition that was being collected, if any.
  if (ident) {
    finish_string_input(strings, ident, collected)
    vars[num_vars++] = ident
  }

  startline = NR

  # The identifier is the first field without its leading & marker.
  ident = substr($1, 2)

  # Remove the "&identifier" token by editing the record itself.  Assigning
  # to $1 instead would make awk rebuild $0 from its fields, which squeezes
  # every run of blanks in the rest of the line into a single space and
  # drops trailing blanks, unlike what happens on continuation lines.
  sub(/^[^ \t]+/, "")
  collected = ""
}

# While a string definition is open, append the current line to it.
ident {
  # Leading blanks on each line are not part of the string.
  sub(/^[ \t]*/, "")

  # Compare against the empty string explicitly: collected is copied from
  # $0, so text such as "0" may be treated by awk as a numeric string that
  # tests false, which would cause it to be silently overwritten here.
  if (collected != "") {
    collected = collected "\n" $0
  } else {
    collected = $0
  }

  endline = NR
}

# At end of input, record the string definition that was still open, if any.
END {
  if (ident) {
    finish_string_input(strings, ident, collected)
    vars[num_vars] = ident
    num_vars++
  }
}

# Emit the string table and the enumeration of offsets into it.
#
# Strings are visited from longest to shortest so that a string which is a
# suffix of one already in the table can share its storage.  While the table
# is being built, \1 stands for the terminator between strings and \2 for an
# escaped backslash; both are expanded to C source text just before printing.
END {
  strtab = ""
  strtab_len = 0
  count = bucketsort(sorted_strings, strings)
  max = 0

  print "\n#define STR_L10N_(x)"
  print "#ifndef N_"
  print "#  define N_(x) x"
  print "#endif"
  print "\nstatic const char strtab[] ="

  for (i = 0; i < count; i++) {
    s = sorted_strings[i]
    # Collapse each escaped backslash to one placeholder character so that
    # every backslash left in s starts a two-character escape sequence.
    gsub(/\\\\/, "\2", s)
    if ((n = strtab_find_tail(strtab "\1", s "\1")) > 0) {
      # Already present as the tail of an earlier string: reuse it.  The
      # otherwise unused string is still printed inside STR_L10N_, which
      # expands to nothing.
      offsets[sorted_strings[i]] = real_length(substr(strtab, 1, n-1))
      print "\tSTR_L10N_(N_(\"" sorted_strings[i] "\"))"
    } else if (strtab) {
      strtab = strtab "\1" s
      # +1 accounts for the terminator of the preceding string.
      offsets[sorted_strings[i]] = strtab_len + 1
      strtab_len += real_length(s) + 1
    } else {
      strtab = s
      offsets[sorted_strings[i]] = 0
      strtab_len += real_length(s)
    }
  }

  gsub(/\2/, "\\\\", strtab)
  gsub(/\1/, "\")\"\\0\"\n\tN_(\"", strtab)
  print "\tN_(\"" strtab "\")"
  print "\t\"\";"

  print "enum {"
  for (i = 0; i < num_vars; i++) {
    sep = (i+1) != num_vars ? "," : ""
    s = vars[i]
    o = offsets[strings[s]]
    print "\t" s " = " o sep
    if (o > max) {
      max = o
    }
  }
  print "};"
  print "\n#define STRTAB_MAX_OFFSET " max
}

# strtab_find_tail(haystack, needle)
#
# Return the position of the first occurrence of needle in haystack which
# does not begin in the middle of an escape sequence (that is, which is not
# immediately preceded by a backslash), or 0 if there is none.
#
# A plain index() is not enough: the string "n" would otherwise match the
# tail of a string ending in the escape sequence \n and be assigned the
# offset of that string's terminator.
function strtab_find_tail(haystack, needle, pos, n)
{
  pos = 0
  while ((n = index(substr(haystack, pos + 1), needle)) > 0) {
    pos += n
    if (pos == 1 || substr(haystack, pos - 1, 1) != "\\") {
      return pos
    }
  }

  return 0
}

# finish_string_input(strings, ident, val)
#
# Deal with backslash-escapes and special characters in val, then set
# strings[ident] to the result, which is suitable for placing between
# double quotes in C source.
#
# The globals startline and endline are consulted: when the definition
# spanned more than one input line, a trailing newline is appended.
function finish_string_input(strings, ident, val, n, tmpval)
{
  # Hide escaped backslashes behind a placeholder so they cannot be taken
  # for the start of another escape sequence below.
  gsub(/\\\\/, "\1", val)
  val = val (endline > startline ? "\n" : "")

  # Backslash-newline joins lines.
  gsub(/\\\n/, "", val)

  # A backslash left at the very end has nothing to escape.  Drop it like
  # any other stray backslash; otherwise it would escape the closing quote
  # of the generated C string literal.
  sub(/\\$/, "", val)

  # Delete every backslash that does not introduce a recognized escape,
  # keeping the character which follows it.
  tmpval = ""
  while ((n = match(val, /\\[^abtnvfr]/)) > 0) {
    tmpval = tmpval substr(val, 1, n-1)
    val = substr(val, n+1)
  }
  tmpval = tmpval val

  # Escape special characters
  gsub(/"/, "\\\"", tmpval)
  gsub(/\t/, "\\t", tmpval)
  gsub(/\n/, "\\n", tmpval)
  gsub(/\1/, "\\\\", tmpval)

  strings[ident] = tmpval
}

# real_length(s)
#
# Return the length of s, counting each two-character backslash escape
# sequence as a single character.
function real_length(s, escapes)
{
  # Replacing each match with itself leaves s unchanged; only the number
  # of matches reported by gsub is of interest.
  escapes = gsub(/\\./, "&", s)
  return length(s) - escapes
}

# bucketsort(dst, src)
#
# Sort the elements of src by descending string length,
# placing them into dst[0] ... dst[n-1].
#
# Returns n, the number of elements (0 if src is empty).
function bucketsort(dst, src, buckets, max, count, i, t, s)
{
  max = 0
  count = 0

  # Count how many strings there are of each length.
  for (t in src) {
    i = length(src[t])
    if (i > max) { max = i }
    buckets[i]++
  }

  # Turn each count into the index of the first slot for that length.
  # Length 0 must be included so that empty strings receive a slot too,
  # rather than being written to a bogus index and left out of the count.
  for (i = max; i >= 0; i--) {
    if (i in buckets) {
      t = buckets[i]
      buckets[i] = count
      count += t
    }
  }

  # Drop each string into the next free slot for its length.
  for (t in src) {
    s = src[t]
    dst[buckets[length(s)]++] = s
  }

  return count
}