From 9a50128ac12ab834fd97699c7ca543fc040ea8f1 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Wed, 14 Jun 2023 20:34:56 -0400 Subject: [PATCH] libcdecl: Tweak invalid character error from scanner. By adjusting how we format the error message, the format string is changed to be completely identical to a format string used by the parser error reporting, which avoids some wasted code space in the library. Furthermore, make some tweaks to the invalid character pretty-printing which seem to let GCC generate a bit more compact code. --- src/scan.l | 87 +++++++++++++++++++++++++++++++++++------------- tests/general.at | 25 +++++++++++++- 2 files changed, 88 insertions(+), 24 deletions(-) diff --git a/src/scan.l b/src/scan.l index 5610dc3..0a4db93 100644 --- a/src/scan.l +++ b/src/scan.l @@ -53,6 +53,67 @@ } \ strcpy(yylval->strval, yytext); \ } while(0) + +static char *to_octal(char *dst, unsigned val) +{ + unsigned i; + + for (i = 0; i < 3; i++) { + *dst++ = '0' + ((val >> 6) & 7u); + val <<= 3; + } + + return dst; +} + +/* + * Convert a single character to a C-style character constant, including quote + * characters. At most 7 bytes are written to the buffer for the longest + * octal encoding, e.g., '\177' + */ +static void to_readable_ch(char *dst, char c) +{ + unsigned char uc = c; + unsigned i; + char esc; + + /* + * The 7 standard C control characters are contiguous in ASCII, + * permitting a simple and compact lookup table; separating their + * handling from backslash and quote characters hopefully allows + * the compiler to recognize that. + */ + switch (c) { + case '\a': i = 0; break; + case '\b': i = 1; break; + case '\t': i = 2; break; + case '\n': i = 3; break; + case '\v': i = 4; break; + case '\f': i = 5; break; + case '\r': i = 6; break; + default: i = 7; break; + } + esc = "abtnvfr"[i]; + + /* Otherwise printable characters that should still be escaped. 
*/ + switch (c) { + case '\\': case '\'': esc = c; break; + } + + *dst++ = '\''; + if (esc) { + *dst++ = '\\'; + *dst++ = esc; + } else if (isprint(uc)) { + *dst++ = c; + } else { + *dst++ = '\\'; + dst = to_octal(dst, uc); + } + *dst++ = '\''; + *dst++ = 0; +} + %} %s ENGLISH @@ -142,29 +203,9 @@ INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+ [[:space:]]+ . { - char buf[5] = { yytext[0] }; - unsigned char c = buf[0]; - - if (!isprint(c) || c == '\\' || c == '\'') { - /* Encode nonprinting characters with C-style escapes */ - buf[0] = '\\'; - switch (c) { - case '\a': buf[1] = 'a'; break; - case '\b': buf[1] = 'b'; break; - case '\f': buf[1] = 'f'; break; - case '\n': buf[1] = 'n'; break; - case '\r': buf[1] = 'r'; break; - case '\t': buf[1] = 't'; break; - case '\v': buf[1] = 'v'; break; - case '\\': buf[1] = '\\'; break; - case '\'': buf[1] = '\''; break; - default: - buf[1] = '0' + ((c >> 6) & 3); - buf[2] = '0' + ((c >> 3) & 7); - buf[3] = '0' + ((c >> 0) & 7); - } - } + char buf[8]; - cdecl__err(CDECL_ENOPARSE, _("syntax error, unexpected '%s'"), buf); + to_readable_ch(buf, yytext[0]); + cdecl__err(CDECL_ENOPARSE, _("syntax error, unexpected %s"), buf); return T_LEX_ERROR; } diff --git a/tests/general.at b/tests/general.at index 899c7e4..5141902 100644 --- a/tests/general.at +++ b/tests/general.at @@ -97,7 +97,7 @@ AT_SETUP([cdecl99 command error messages]) # This will only get the start of progname if it includes spaces; # so we won't worry too hard about the exact format later. AT_CHECK([LC_ALL=C cdecl99 --help], [0], [stdout]) -progname=`$AWK 'NR == 1 { print $2; }' stdout` +progname=`$AWK 'NR == 1 { print $2; }' stdout`dnl' # every line is erroneous AT_DATA([input], @@ -117,3 +117,26 @@ AT_CHECK([LC_ALL=C cdecl99 --file=input || exit 42], [42], [], [stderr]) AT_CHECK([$AWK -v progname="$progname" -f check.awk stderr]) AT_CLEANUP + +AT_SETUP([cdecl99 invalid character error messages]) + +$AWK -f- >test.dat