From c4d0ba7251867ee9bdb8466357cba5a9fa352b76 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Sun, 9 Jul 2023 14:43:43 -0400 Subject: [PATCH] libcdecl: Use gperf to identify keywords during scanning. Instead of having a flex rule for each keyword, we can use a catch-all rule for identifiers and then use gperf to distinguish keywords from ordinary identifiers. This reduces the size of the scanner much more than the addition of gperf-generated code increases it. --- Makefile.am | 5 ++- src/.gitignore | 1 + src/cdecl-internal.h | 2 + src/keywords.gperf | 90 ++++++++++++++++++++++++++++++++++++++++++++ src/scan.l | 84 ++++++++++++++++------------------------- tests/decl-bad.at | 4 ++ 6 files changed, 133 insertions(+), 53 deletions(-) create mode 100644 src/keywords.gperf diff --git a/Makefile.am b/Makefile.am index 6d2ea65..4fdf566 100644 --- a/Makefile.am +++ b/Makefile.am @@ -43,7 +43,7 @@ libcdecl_la_LDFLAGS = -export-symbols-regex '^cdecl_([[:lower:]]|_gl_)' \ -no-undefined -version-info 1:0:0 libcdecl_la_SOURCES = src/scan.c src/parse.c src/parse-decl.c src/output.c \ src/explain.c src/declare.c src/error.c src/normalize.c \ - src/cdecl-internal.h src/errmsg.h + src/keywords.c src/cdecl-internal.h src/errmsg.h libcdecl_la_LIBADD = $(shared_gl_objects) $(LTLIBINTL) $(LIBTHREAD) EXTRA_libcdecl_la_DEPENDENCIES = $(shared_gl_objects) $(libcdecl_la_OBJECTS): $(gnulib_headers) @@ -101,6 +101,7 @@ $(t_rng_test_OBJECTS): $(gnulib_headers) EXTRA_DIST += t/xos256p.c src/error.lo: src/errmsg.h +src/keywords.lo: src/parse.h src/output.lo: src/parse.h src/specstr.h src/parse-decl.lo: src/scan.h src/parse.h src/typemap.h src/errmsg.h src/parse.lo: src/scan.h src/errmsg.h @@ -309,7 +310,7 @@ V_GPERF = $(V_GPERF_@AM_V@) V_GPERF_ = $(V_GPERF_@AM_DEFAULT_V@) V_GPERF_0 = @printf ' %$(DX_ALIGN_V)s %s\n' 'GPERF ' $@; -GPERFFILES = src/execute.gperf +GPERFFILES = src/execute.gperf src/keywords.gperf .gperf.c: $(V_GPERF) $(GPERF) $< >$@.tmp $(AM_V_at) mv $@.tmp $@ diff --git 
a/src/.gitignore b/src/.gitignore index 3786172..d32bd40 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -3,6 +3,7 @@ /commands.h /errmsg.h /execute.c +/keywords.c /options.h /parse.[ch] /scan.[ch] diff --git a/src/cdecl-internal.h b/src/cdecl-internal.h index 74c2654..cf7fc14 100644 --- a/src/cdecl-internal.h +++ b/src/cdecl-internal.h @@ -53,4 +53,6 @@ const char *cdecl__emit_specs(struct output_state *dst, struct cdecl_declspec *s, unsigned mask); +int cdecl__to_keyword(const char *s, int len, int english_mode); + #endif diff --git a/src/keywords.gperf b/src/keywords.gperf new file mode 100644 index 0000000..88b2476 --- /dev/null +++ b/src/keywords.gperf @@ -0,0 +1,90 @@ +%{ +/* + * Copyright © 2023 Nick Bowler + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */ + +#include <config.h> +#include <string.h> +#include <inttypes.h> +#include "cdecl-internal.h" +#include "parse.h" + +static const struct keyword *in_word_set(); +%} + +%struct-type +%readonly-tables +%language=ANSI-C +%global-table +%pic + +struct keyword { + int_least16_t name; + uint_least8_t token; +}; +%% +_Bool, (T_BOOL & 0x7f) +_Complex, (T_COMPLEX & 0x7f) +_Imaginary, (T_IMAGINARY & 0x7f) +auto, (T_AUTO & 0x7f) +char, (T_CHAR & 0x7f) +const, (T_CONST & 0x7f) +double, (T_DOUBLE & 0x7f) +enum, (T_ENUM & 0x7f) +extern, (T_EXTERN & 0x7f) +float, (T_FLOAT & 0x7f) +inline, (T_INLINE & 0x7f) +int, (T_INT & 0x7f) +long, (T_LONG & 0x7f) +register, (T_REGISTER & 0x7f) +restrict, (T_RESTRICT & 0x7f) +short, (T_SHORT & 0x7f) +signed, (T_SIGNED & 0x7f) +static, (T_STATIC & 0x7f) +struct, (T_STRUCT & 0x7f) +typedef, (T_TYPEDEF & 0x7f) +union, (T_UNION & 0x7f) +unsigned, (T_UNSIGNED & 0x7f) +void, (T_VOID & 0x7f) +volatile, (T_VOLATILE & 0x7f) +# english keywords +array, (T_ARRAY & 0x7f) | 0x80 +as, (T_AS & 0x7f) | 0x80 +declare, (T_DECLARE & 0x7f) | 0x80 +function, (T_FUNCTION & 0x7f) | 0x80 +of, (T_OF & 0x7f) | 0x80 +pointer, (T_POINTER & 0x7f) | 0x80 +returning, (T_RETURNING & 0x7f) | 0x80 +to, (T_TO & 0x7f) | 0x80 +type, (T_TYPE & 0x7f) | 0x80 +variable-length, (T_VLA & 0x7f) | 0x80 +%% +int cdecl__to_keyword(const char *s, int len, int english_mode) +{ + const struct keyword *k; + + if ((k = in_word_set(s, len))) { + unsigned x = (k->token & 0x7fu); + + if (english_mode || !(k->token & ~0x7fu)) { + if (T_VOID >= 256) + x += 256; + return x; + } + } + + return T_IDENT; +} diff --git a/src/scan.l b/src/scan.l index 0a4db93..1342e5f 100644 --- a/src/scan.l +++ b/src/scan.l @@ -22,7 +22,7 @@ } %option nodefault noyywrap bison-locations reentrant never-interactive -%option extra-type="_Bool" +%option extra-type="int" %option prefix="cdecl__yy" %{ @@ -51,7 +51,8 @@ cdecl__errmsg(CDECL__ENOMEM); \ return T_LEX_ERROR; \ } \ - strcpy(yylval->strval, yytext); \ + memcpy(yylval->strval, yytext, yyleng); \
+ yylval->strval[yyleng] = 0; \ } while(0) static char *to_octal(char *dst, unsigned val) @@ -116,17 +117,16 @@ static void to_readable_ch(char *dst, char c) %} -%s ENGLISH - -IDENT [_[:alpha:]][_[:alnum:]]* +IDENT [_[:alpha:]][-_[:alnum:]]* INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+ %% %{ - if (yyextra) { - yyextra = 0; - BEGIN(ENGLISH); + char *c; + + if (yyextra > 0) { + yyextra = -yyextra; return T_ENGLISH; } %} @@ -140,35 +140,6 @@ INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+ "]" return T_RBRACKET; "," return T_COMMA; -"typedef" return T_TYPEDEF; -"extern" return T_EXTERN; -"static" return T_STATIC; -"auto" return T_AUTO; -"register" return T_REGISTER; - -"restrict" return T_RESTRICT; -"volatile" return T_VOLATILE; -"const" return T_CONST; - -"inline" return T_INLINE; - -"void" return T_VOID; -"char" return T_CHAR; -"short" return T_SHORT; -"int" return T_INT; -"long" return T_LONG; -"float" return T_FLOAT; -"double" return T_DOUBLE; -"signed" return T_SIGNED; -"unsigned" return T_UNSIGNED; -"_Bool" return T_BOOL; -"_Complex" return T_COMPLEX; -"_Imaginary" return T_IMAGINARY; - -"struct" return T_STRUCT; -"union" return T_UNION; -"enum" return T_ENUM; - {INTEGER} { char *end; @@ -186,26 +157,37 @@ INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+ return T_UINT; } -<ENGLISH>{ - "variable-length" return T_VLA; - "type" return T_TYPE; - "declare" return T_DECLARE; - "pointer" return T_POINTER; - "function" return T_FUNCTION; - "returning" return T_RETURNING; - "array" return T_ARRAY; - "to" return T_TO; - "of" return T_OF; - "as" return T_AS; +{IDENT} { + int ret = cdecl__to_keyword(yytext, yyleng, yyextra); + if (ret == T_IDENT) { + /* + * Our IDENT pattern includes hyphens so we can match + * "variable-length" as a keyword. In all other cases a + * hyphen is an error. + * + * We could use yyless to re-scan the hyphen and hit the + * error catch-all, but jumping straight to the error code + * seems to produce better results with gcc with no obvious + * downsides.
+ */ +#if 1 + if ((c = strchr(yytext, '-'))) + goto invalid_char; +#else + yyless(strcspn(yytext, "-")); +#endif + dup_token(); + } + return ret; } -{IDENT} { dup_token(); return T_IDENT; } - [[:space:]]+ . { char buf[8]; - to_readable_ch(buf, yytext[0]); + c = yytext; +invalid_char: + to_readable_ch(buf, *c); cdecl__err(CDECL_ENOPARSE, _("syntax error, unexpected %s"), buf); return T_LEX_ERROR; } diff --git a/tests/decl-bad.at b/tests/decl-bad.at index 96c54a3..62af91c 100644 --- a/tests/decl-bad.at +++ b/tests/decl-bad.at @@ -150,3 +150,7 @@ SIMPLE_BADDECL([Reject multiple declarators in type names], SIMPLE_BADDECL([Error recovery on multiple object declaration], [explain int inline x, y]) + +SIMPLE_BADDECL([Reject hyphens in identifiers], + [explain int ac-dc], + [explain int variable-length]) -- 2.43.2