git.draconx.ca Git - cdecl99.git/commitdiff
libcdecl: Use gperf to identify keywords during scanning.
author Nick Bowler <nbowler@draconx.ca>
Sun, 9 Jul 2023 18:43:43 +0000 (14:43 -0400)
committer Nick Bowler <nbowler@draconx.ca>
Mon, 10 Jul 2023 06:01:52 +0000 (02:01 -0400)
Instead of having a flex rule for each keyword, we can use a catch-all
rule for identifiers and then use gperf to distinguish keywords from
ordinary identifiers.

This reduces the size of the scanner much more than the addition of
gperf-generated code increases it.

Makefile.am
src/.gitignore
src/cdecl-internal.h
src/keywords.gperf [new file with mode: 0644]
src/scan.l
tests/decl-bad.at

index 6d2ea6574b9f349502b1c73de4b1fab95605b9af..4fdf5662b06bf10d3c425629bedd2ee443b0a853 100644 (file)
@@ -43,7 +43,7 @@ libcdecl_la_LDFLAGS = -export-symbols-regex '^cdecl_([[:lower:]]|_gl_)' \
                       -no-undefined -version-info 1:0:0
 libcdecl_la_SOURCES = src/scan.c src/parse.c src/parse-decl.c src/output.c \
                       src/explain.c src/declare.c src/error.c src/normalize.c \
-                      src/cdecl-internal.h src/errmsg.h
+                      src/keywords.c src/cdecl-internal.h src/errmsg.h
 libcdecl_la_LIBADD = $(shared_gl_objects) $(LTLIBINTL) $(LIBTHREAD)
 EXTRA_libcdecl_la_DEPENDENCIES = $(shared_gl_objects)
 $(libcdecl_la_OBJECTS): $(gnulib_headers)
@@ -101,6 +101,7 @@ $(t_rng_test_OBJECTS): $(gnulib_headers)
 EXTRA_DIST += t/xos256p.c
 
 src/error.lo: src/errmsg.h
+src/keywords.lo: src/parse.h
 src/output.lo: src/parse.h src/specstr.h
 src/parse-decl.lo: src/scan.h src/parse.h src/typemap.h src/errmsg.h
 src/parse.lo: src/scan.h src/errmsg.h
@@ -309,7 +310,7 @@ V_GPERF   = $(V_GPERF_@AM_V@)
 V_GPERF_  = $(V_GPERF_@AM_DEFAULT_V@)
 V_GPERF_0 = @printf '  %$(DX_ALIGN_V)s %s\n' 'GPERF   ' $@;
 
-GPERFFILES = src/execute.gperf
+GPERFFILES = src/execute.gperf src/keywords.gperf
 .gperf.c:
        $(V_GPERF) $(GPERF) $< >$@.tmp
        $(AM_V_at) mv $@.tmp $@
index 3786172701ae5b360db3d757695858caf4e790de..d32bd40f927834523805efe19de3fdba4a2fa3b8 100644 (file)
@@ -3,6 +3,7 @@
 /commands.h
 /errmsg.h
 /execute.c
+/keywords.c
 /options.h
 /parse.[ch]
 /scan.[ch]
index 74c2654b76501016c2761d4138c7654fbdaa4994..cf7fc1487469fa9881dce071111146c9f1102369 100644 (file)
@@ -53,4 +53,6 @@ const char *cdecl__emit_specs(struct output_state *dst,
                               struct cdecl_declspec *s,
                               unsigned mask);
 
+int cdecl__to_keyword(const char *s, int len, int english_mode);
+
 #endif
diff --git a/src/keywords.gperf b/src/keywords.gperf
new file mode 100644 (file)
index 0000000..88b2476
--- /dev/null
@@ -0,0 +1,90 @@
+%{
+/*
+ * Copyright © 2023 Nick Bowler
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <string.h>
+#include <inttypes.h>
+#include "cdecl-internal.h"
+#include "parse.h"
+
+static const struct keyword *in_word_set();
+%}
+
+%struct-type
+%readonly-tables
+%language=ANSI-C
+%global-table
+%pic
+
+struct keyword {
+       int_least16_t  name;
+       uint_least8_t token;
+};
+%%
+_Bool,           (T_BOOL      & 0x7f)
+_Complex,        (T_COMPLEX   & 0x7f)
+_Imaginary,      (T_IMAGINARY & 0x7f)
+auto,            (T_AUTO      & 0x7f)
+char,            (T_CHAR      & 0x7f)
+const,           (T_CONST     & 0x7f)
+double,          (T_DOUBLE    & 0x7f)
+enum,            (T_ENUM      & 0x7f)
+extern,          (T_EXTERN    & 0x7f)
+float,           (T_FLOAT     & 0x7f)
+inline,          (T_INLINE    & 0x7f)
+int,             (T_INT       & 0x7f)
+long,            (T_LONG      & 0x7f)
+register,        (T_REGISTER  & 0x7f)
+restrict,        (T_RESTRICT  & 0x7f)
+short,           (T_SHORT     & 0x7f)
+signed,          (T_SIGNED    & 0x7f)
+static,          (T_STATIC    & 0x7f)
+struct,          (T_STRUCT    & 0x7f)
+typedef,         (T_TYPEDEF   & 0x7f)
+union,           (T_UNION     & 0x7f)
+unsigned,        (T_UNSIGNED  & 0x7f)
+void,            (T_VOID      & 0x7f)
+volatile,        (T_VOLATILE  & 0x7f)
+# english keywords
+array,           (T_ARRAY     & 0x7f) | 0x80
+as,              (T_AS        & 0x7f) | 0x80
+declare,         (T_DECLARE   & 0x7f) | 0x80
+function,        (T_FUNCTION  & 0x7f) | 0x80
+of,              (T_OF        & 0x7f) | 0x80
+pointer,         (T_POINTER   & 0x7f) | 0x80
+returning,       (T_RETURNING & 0x7f) | 0x80
+to,              (T_TO        & 0x7f) | 0x80
+type,            (T_TYPE      & 0x7f) | 0x80
+variable-length, (T_VLA       & 0x7f) | 0x80
+%%
+int cdecl__to_keyword(const char *s, int len, int english_mode) /* Map the len-byte word at s to its keyword token, or T_IDENT. */
+{
+       const struct keyword *k;
+
+       if ((k = in_word_set(s, len))) { /* keyword-table lookup; a null result falls through to T_IDENT */
+               unsigned x = (k->token & 0x7fu); /* low 7 bits hold the stored (T_xxx & 0x7f) value */
+
+               if (english_mode || !(k->token & ~0x7fu)) { /* entries with bits above 0x7f set (the "english keywords") match only when english_mode is nonzero */
+                       if (T_VOID >= 256) /* NOTE(review): presumably every keyword token lies in 256..383 so adding 256 restores the masked-off bits -- confirm against parse.h */
+                               x += 256;
+                       return x;
+               }
+       }
+
+       return T_IDENT; /* not in the table, or an English-only keyword while english_mode is zero */
+}
index 0a4db93bb2ec74e4b64b7ca8085cd82873c7225b..1342e5fd3dbbac0f095686e1bebd5ec2dea59d96 100644 (file)
@@ -22,7 +22,7 @@
 }
 
 %option nodefault noyywrap bison-locations reentrant never-interactive
-%option extra-type="_Bool"
+%option extra-type="int"
 %option prefix="cdecl__yy"
 
 %{
@@ -51,7 +51,8 @@
                cdecl__errmsg(CDECL__ENOMEM); \
                return T_LEX_ERROR; \
        } \
-       strcpy(yylval->strval, yytext); \
+       memcpy(yylval->strval, yytext, yyleng); \
+       yylval->strval[yyleng] = 0; \
 } while(0)
 
 static char *to_octal(char *dst, unsigned val)
@@ -116,17 +117,16 @@ static void to_readable_ch(char *dst, char c)
 
 %}
 
-%s ENGLISH
-
-IDENT [_[:alpha:]][_[:alnum:]]*
+IDENT [_[:alpha:]][-_[:alnum:]]*
 INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+
 
 %%
 
 %{
-       if (yyextra) {
-               yyextra = 0;
-               BEGIN(ENGLISH);
+       char *c;
+
+       if (yyextra > 0) {
+               yyextra = -yyextra;
                return T_ENGLISH;
        }
 %}
@@ -140,35 +140,6 @@ INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+
 "]"   return T_RBRACKET;
 ","   return T_COMMA;
 
-"typedef"    return T_TYPEDEF;
-"extern"     return T_EXTERN;
-"static"     return T_STATIC;
-"auto"       return T_AUTO;
-"register"   return T_REGISTER;
-
-"restrict"   return T_RESTRICT;
-"volatile"   return T_VOLATILE;
-"const"      return T_CONST;
-
-"inline"     return T_INLINE;
-
-"void"       return T_VOID;
-"char"       return T_CHAR;
-"short"      return T_SHORT;
-"int"        return T_INT;
-"long"       return T_LONG;
-"float"      return T_FLOAT;
-"double"     return T_DOUBLE;
-"signed"     return T_SIGNED;
-"unsigned"   return T_UNSIGNED;
-"_Bool"      return T_BOOL;
-"_Complex"   return T_COMPLEX;
-"_Imaginary" return T_IMAGINARY;
-
-"struct"     return T_STRUCT;
-"union"      return T_UNION;
-"enum"       return T_ENUM;
-
 {INTEGER} {
        char *end;
 
@@ -186,26 +157,37 @@ INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+
        return T_UINT;
 }
 
-<ENGLISH>{
-       "variable-length" return T_VLA;
-       "type"            return T_TYPE;
-       "declare"         return T_DECLARE;
-       "pointer"         return T_POINTER;
-       "function"        return T_FUNCTION;
-       "returning"       return T_RETURNING;
-       "array"           return T_ARRAY;
-       "to"              return T_TO;
-       "of"              return T_OF;
-       "as"              return T_AS;
+{IDENT} {
+       int ret = cdecl__to_keyword(yytext, yyleng, yyextra);
+       if (ret == T_IDENT) {
+               /*
+                * Our IDENT pattern includes hyphens so we can match
+                * "variable-length" as a keyword.  In all other cases a
+                * hyphen is an error.
+                *
+                * We could use yyless to re-scan the hyphen and hit the
+                * error catch-all, but jumping straight to the error code
+                * seems to produce better results with gcc with no obvious
+                * downsides.
+                */
+#if 1
+               if ((c = strchr(yytext, '-')))
+                       goto invalid_char;
+#else
+               yyless(strcspn(yytext, "-"));
+#endif
+               dup_token();
+       }
+       return ret;
 }
 
-{IDENT} { dup_token(); return T_IDENT; }
-
 [[:space:]]+
 . {
        char buf[8];
 
-       to_readable_ch(buf, yytext[0]);
+       c = yytext;
+invalid_char:
+       to_readable_ch(buf, *c);
        cdecl__err(CDECL_ENOPARSE, _("syntax error, unexpected %s"), buf);
        return T_LEX_ERROR;
 }
index 96c54a3f7fce4323440a4cdf770932439e9e4f5e..62af91ccd1240b0bdcd5c7efd73e513f93db8c3b 100644 (file)
@@ -150,3 +150,7 @@ SIMPLE_BADDECL([Reject multiple declarators in type names],
 
 SIMPLE_BADDECL([Error recovery on multiple object declaration],
   [explain int inline x, y])
+
+SIMPLE_BADDECL([Reject hyphens in identifiers],
+  [explain int ac-dc],
+  [explain int variable-length])