git.draconx.ca Git - cdecl99.git/blobdiff - src/scan.l
libcdecl: Combine scanner rules for punctuation.
[cdecl99.git] / src / scan.l
index f2e92a8703509d32c734abfa7a42391ab7d9559c..7e3e365af90f99102b3fc11adf198ed09e9f423d 100644 (file)
@@ -1,7 +1,7 @@
 %top{
 /*
  *  Scanner for C declarations.
- *  Copyright © 2011 Nick Bowler
+ *  Copyright © 2011, 2021, 2023 Nick Bowler
  *
  *  This program is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
- #include "parse.h"
+#include <config.h>
+#include "parse.h"
 }
 
-%option noyywrap bison-locations
+%option nodefault noyywrap bison-locations reentrant never-interactive
+%option extra-type="int"
+%option prefix="cdecl__yy"
 
 %{
-#define lex_error(msg) do { \
-       yyerror(yylloc, NULL, (msg)); \
-       return T_LEX_ERROR; \
+#include <ctype.h>
+#include "cdecl-internal.h"
+#include "cdecl.h"
+#include "errmsg.h"
+/*
+ * Pick the conversion function behind STRTOUMAX, used by the {INTEGER}
+ * rule: the first available of strtoumax, strtoull, __strtoull, and
+ * finally strtoul.  The HAVE_* feature macros are presumably supplied by
+ * <config.h> -- they are not defined in this file.
+ */
+#if HAVE_STRTOUMAX
+/* Best case, implementation provides strtoumax. */
+#  define STRTOUMAX strtoumax
+#elif HAVE_STRTOULL
+/* Fall back to strtoull, with possibly reduced range. */
+#define STRTOUMAX strtoull
+#elif HAVE___STRTOULL
+/* HP-UX 11 has __strtoull in <inttypes.h> */
+#define STRTOUMAX __strtoull
+#else
+/* Fall back to strtoul, with possibly reduced range. */
+#define STRTOUMAX strtoul
+#endif
+/*
+ * Copy the current token text (yyleng bytes of yytext) into a freshly
+ * malloc'd, NUL-terminated string and store it in yylval->strval.
+ *
+ * If the allocation fails, CDECL__ENOMEM is reported via cdecl__errmsg
+ * and the macro executes "return T_LEX_ERROR", so it may only be used
+ * inside a scanner action where returning a token code is valid.
+ *
+ * NOTE(review): nothing in this file frees strval; presumably ownership
+ * passes to the parser -- confirm against the grammar.
+ */
+#define dup_token() do { \
+       yylval->strval = malloc(yyleng+1); \
+       if (!yylval->strval) { \
+               cdecl__errmsg(CDECL__ENOMEM); \
+               return T_LEX_ERROR; \
+       } \
+       memcpy(yylval->strval, yytext, yyleng); \
+       yylval->strval[yyleng] = 0; \
 } while(0)
+/*
+ * Write the low 9 bits of val to dst as exactly three octal digits, most
+ * significant digit first, and return a pointer just past the last digit
+ * written.  No NUL terminator is stored; higher bits of val are ignored.
+ */
+static char *to_octal(char *dst, unsigned val)
+{
+       unsigned i;
+       /*
+        * Each pass emits bits 6..8 as one digit and then shifts val left by
+        * three, which moves the next lower octal digit into that position.
+        */
+       for (i = 0; i < 3; i++) {
+               *dst++ = '0' + ((val >> 6) & 7u);
+               val <<= 3;
+       }
+
+       return dst;
+}
+
+/*
+ * Convert a single character to a C-style character constant, including quote
+ * characters.  At most 7 bytes are written to the buffer for the longest
+ * octal encoding, e.g., '\177'
+ *
+ * dst receives the NUL-terminated result and must have room for 7 bytes.
+ * c is the character to render.
+ *
+ * The seven standard control characters, backslash and single quote are
+ * written as two-character escapes; any other character for which isprint
+ * is true is copied unchanged; everything else becomes a backslash
+ * followed by three octal digits.
+ */
+static void to_readable_ch(char *dst, char c)
+{
+       unsigned char uc = c;
+       unsigned i;
+       char esc;
+
+       /*
+        * The 7 standard C control characters are contiguous in ASCII,
+        * permitting a simple and compact lookup table; separating their
+        * handling from backslash and quote characters hopefully allows
+        * the compiler to recognize that.
+        */
+       switch (c) {
+       case '\a': i = 0; break;
+       case '\b': i = 1; break;
+       case '\t': i = 2; break;
+       case '\n': i = 3; break;
+       case '\v': i = 4; break;
+       case '\f': i = 5; break;
+       case '\r': i = 6; break;
+       default:   i = 7; break;
+       }
+       esc = "abtnvfr"[i]; /* i == 7 picks the string's NUL: no escape */
+
+       /* Otherwise printable characters that should still be escaped. */
+       switch (c) {
+       case '\\': case '\'': esc = c; break;
+       }
+
+       *dst++ = '\'';
+       if (esc) {
+               *dst++ = '\\';
+               *dst++ = esc;
+       } else if (isprint(uc)) { /* uc, not c: isprint needs a non-negative value */
+               *dst++ = c;
+       } else {
+               *dst++ = '\\';
+               dst = to_octal(dst, uc);
+       }
+       *dst++ = '\'';
+       *dst++ = 0;
+}
+
 %}
 
-IDENT [_[:alpha:]][_[:alnum:]]*
+IDENT [_[:alpha:]][-_[:alnum:]]*
+INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+
 
 %%
 
-";" return T_SEMICOLON;
-"*" return T_ASTERISK;
-"(" return T_LPAREN;
-")" return T_RPAREN;
-"[" return T_LBRACKET;
-"]" return T_RBRACKET;
-"," return T_COMMA;
-
-"typedef"  return T_TYPEDEF;
-"extern"   return T_EXTERN;
-"static"   return T_STATIC;
-"auto"     return T_AUTO;
-"register" return T_REGISTER;
-
-"restrict" return T_RESTRICT;
-"volatile" return T_VOLATILE;
-"const"    return T_CONST;
-
-"inline"   return T_INLINE;
-
-"void"     return T_VOID;
-"char"     return T_CHAR;
-"short"    return T_SHORT;
-"int"      return T_INT;
-"long"     return T_LONG;
-"float"    return T_FLOAT;
-"double"   return T_DOUBLE;
-"signed"   return T_SIGNED;
-"unsigned" return T_UNSIGNED;
-"_Bool"    return T_BOOL;
-"_Complex" return T_COMPLEX;
-
-"struct"   return T_STRUCT;
-"union"    return T_UNION;
-"enum"     return T_ENUM;
+%{
+       char *c; /* character to report; set before reaching invalid_char */
 
-{IDENT} {
-       yylval->strval = malloc(yyleng+1);
-       if (!yylval->strval)
-               lex_error("failed to allocate memory");
+       if (yyextra > 0) { /* A positive yyextra is consumed here: its sign
+                           * is flipped so this test fails from then on, and
+                           * T_ENGLISH is returned without matching input. */
+               yyextra = -yyextra;
+               return T_ENGLISH;
+       }
+%}
+
+"..."|[][;*(),] {
+       static const unsigned char tab[2][8] = {
+               "*[](),.;",
+               {
+                       T_ASTERISK  & 0xff,
+                       T_LBRACKET  & 0xff,
+                       T_RBRACKET  & 0xff,
+                       T_LPAREN    & 0xff,
+                       T_RPAREN    & 0xff,
+                       T_COMMA     & 0xff,
+                       T_ELLIPSIS  & 0xff,
+                       T_SEMICOLON & 0xff
+               }
+       };
+       /*
+        * tab[0] holds the first character of each punctuation token (the
+        * 8-character literal fills the row exactly, so no NUL is stored)
+        * and tab[1] holds the low 8 bits of the matching token code at the
+        * same index.  memchr locates yytext[0] in row 0; indexing the hit
+        * by sizeof tab[0] lands on the same column of row 1.  Only the
+        * first character is examined, so '.' stands for the "..." token.
+        */
+       unsigned char *match;
+       int x;
+
+       match = memchr(&tab, yytext[0], sizeof tab[0]);
+       x = match[sizeof tab[0]];
+       /*
+        * The table keeps only 8 bits per token code; add bit 8 back when
+        * T_VOID shows that token codes are 256 or more.
+        * NOTE(review): this presumably relies on every token listed above
+        * lying in the same 256-value range as T_VOID -- confirm against
+        * the generated parse.h.
+        */
+       if (T_VOID >= 256)
+               x += 256;
+       return x;
+}
+
+{INTEGER} {
+       char *end;
 
-       strcpy(yylval->strval, yytext);
-       return T_IDENT;
+       errno = 0; /* so ERANGE below can only come from this conversion */
+       yylval->uintval = STRTOUMAX(yytext, &end, 0); /* base from prefix */
+       if (errno == ERANGE) {
+               cdecl__errmsg(CDECL__ERANGE);
+               return T_LEX_ERROR;
+       }
+       if (*end) { /* e.g. "08": matched as digits, but not valid octal */
+               cdecl__errmsg(CDECL__EBADINT);
+               return T_LEX_ERROR;
+       }
+
+       return T_UINT;
+}
+
+{IDENT} {
+       int ret = cdecl__to_keyword(yytext, yyleng, yyextra);
+       if (ret == T_IDENT) {
+               /*
+                * Our IDENT pattern includes hyphens so we can match
+                * "variable-length" as a keyword.  In all other cases a
+                * hyphen is an error.
+                *
+                * We could use yyless to re-scan the hyphen and hit the
+                * error catch-all, but jumping straight to the error code
+                * seems to produce better results with gcc with no obvious
+                * downsides.
+                *
+                * The invalid_char label lives inside the catch-all rule's
+                * action further down; it reads the offending character
+                * through c, so c is set to the first hyphen before the jump.
+                */
+#if 1
+               if ((c = strchr(yytext, '-')))
+                       goto invalid_char;
+#else
+               yyless(strcspn(yytext, "-"));
+#endif
+               dup_token(); /* yylval->strval = malloc'd copy of yytext */
+       }
+       return ret;
 }
 
 [[:space:]]+
 . {
-       char buf[] = "syntax error, unexpected #";
-       *strchr(buf, '#') = *yytext;
-       lex_error(buf);
+       char buf[8]; /* to_readable_ch writes at most 7 bytes */
+
+       c = yytext;
+invalid_char: /* also reached by goto from the {IDENT} rule, c at a hyphen */
+       to_readable_ch(buf, *c);
+       cdecl__err(CDECL_ENOPARSE, _("syntax error, unexpected %s"), buf);
+       return T_LEX_ERROR;
 }