X-Git-Url: https://git.draconx.ca/gitweb/cdecl99.git/blobdiff_plain/363f15bd4b53bd90b86610b143695762037ae706..05d0ced1aa6c1e685e3228e7c43aab3fbb1d88c3:/src/scan.l diff --git a/src/scan.l b/src/scan.l index 75657fb..8a6572e 100644 --- a/src/scan.l +++ b/src/scan.l @@ -1,7 +1,7 @@ %top{ /* * Scanner for C declarations. - * Copyright © 2011 Nick Bowler + * Copyright © 2011, 2021, 2023 Nick Bowler * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,86 +17,186 @@ * along with this program. If not, see . */ - #include "parse.h" +#include +#include "parse.h" } -%option noyywrap bison-locations reentrant +%option nodefault noyywrap bison-locations reentrant never-interactive +%option extra-type="int" +%option prefix="cdecl__yy" %{ -#define lex_error(msg) do { \ - yyerror(yylloc, NULL, NULL, (msg)); \ - return T_LEX_ERROR; \ +#include +#include "cdecl-internal.h" +#include "cdecl.h" +#include "errmsg.h" + +#if HAVE_STRTOUMAX +/* Best case, implementation provides strtoumax. */ +# define STRTOUMAX strtoumax +#elif HAVE_STRTOULL +/* Fall back to strtoull, with possibly reduced range. */ +#define STRTOUMAX strtoull +#elif HAVE___STRTOULL +/* HP-UX 11 has __strtoull in */ +#define STRTOUMAX __strtoull +#else +/* Fall back to strtoul, with possibly reduced range. */ +#define STRTOUMAX strtoul +#endif + /* dup_token: copy the yyleng bytes of yytext into newly malloc-ed, NUL-terminated storage at yylval->strval; on allocation failure report CDECL__ENOMEM and return T_LEX_ERROR from the enclosing function. */ +#define dup_token() do { \ + yylval->strval = malloc(yyleng+1); \ + if (!yylval->strval) { \ + cdecl__errmsg(CDECL__ENOMEM); \ + return T_LEX_ERROR; \ + } \ + memcpy(yylval->strval, yytext, yyleng); \ + yylval->strval[yyleng] = 0; \ } while(0) + /* to_octal: store exactly three octal digits at dst, most significant first, taken from the low nine bits of val; returns the position just past the last digit and writes no terminator. */ +static char *to_octal(char *dst, unsigned val) +{ + unsigned i; + + for (i = 0; i < 3; i++) { + *dst++ = '0' + ((val >> 6) & 7u); + val <<= 3; + } + + return dst; +} + +/* + * Convert a single character to a C-style character constant, including quote + * characters. 
At most 7 bytes are written to the buffer for the longest + * octal encoding, e.g., '\177' + */ +static void to_readable_ch(char *dst, char c) +{ + unsigned char uc = c; + unsigned i; + char esc; + + /* + * The 7 standard C control characters are contiguous in ASCII, + * permitting a simple and compact lookup table; separating their + * handling from backslash and quote characters hopefully allows + * the compiler to recognize that. + */ + switch (c) { + case '\a': i = 0; break; + case '\b': i = 1; break; + case '\t': i = 2; break; + case '\n': i = 3; break; + case '\v': i = 4; break; + case '\f': i = 5; break; + case '\r': i = 6; break; + default: i = 7; break; + } + esc = "abtnvfr"[i]; /* index 7 selects the terminating NUL of the literal, so esc is 0 when c is none of the seven control characters */ + + /* Otherwise printable characters that should still be escaped. */ + switch (c) { + case '\\': case '\'': esc = c; break; + } + + *dst++ = '\''; + if (esc) { + *dst++ = '\\'; + *dst++ = esc; + } else if (isprint(uc)) { + *dst++ = c; + } else { + *dst++ = '\\'; + dst = to_octal(dst, uc); + } + *dst++ = '\''; + *dst++ = 0; +} + %} -IDENT [_[:alpha:]][_[:alnum:]]* +IDENT [_[:alpha:]][-_[:alnum:]]* INTEGER 0x[[:xdigit:]]+|0[0-7]+|[[:digit:]]+ %% -"..." 
return T_ELLIPSIS; -";" return T_SEMICOLON; -"*" return T_ASTERISK; -"(" return T_LPAREN; -")" return T_RPAREN; -"[" return T_LBRACKET; -"]" return T_RBRACKET; -"," return T_COMMA; - -"typedef" return T_TYPEDEF; -"extern" return T_EXTERN; -"static" return T_STATIC; -"auto" return T_AUTO; -"register" return T_REGISTER; - -"restrict" return T_RESTRICT; -"volatile" return T_VOLATILE; -"const" return T_CONST; - -"inline" return T_INLINE; - -"void" return T_VOID; -"char" return T_CHAR; -"short" return T_SHORT; -"int" return T_INT; -"long" return T_LONG; -"float" return T_FLOAT; -"double" return T_DOUBLE; -"signed" return T_SIGNED; -"unsigned" return T_UNSIGNED; -"_Bool" return T_BOOL; -"_Complex" return T_COMPLEX; -"_Imaginary" return T_IMAGINARY; - -"struct" return T_STRUCT; -"union" return T_UNION; -"enum" return T_ENUM; +%{ + char *c; /* offending character for the invalid_char error path, set by the identifier rule or by the catch-all rule */ +%} + +"..."|[][;*(),] { /* tab[0] lists the first character of every punctuator and tab[1] the packed token at the same index, so the byte found by memchr, offset by sizeof tab[0], is the token for this match; the ellipsis is looked up by its leading dot. */ + unsigned char *match; + static const unsigned char tab[2][8] = { + "*[](),.;", + { + PACK_TOKEN(T_ASTERISK), + PACK_TOKEN(T_LBRACKET), + PACK_TOKEN(T_RBRACKET), + PACK_TOKEN(T_LPAREN), + PACK_TOKEN(T_RPAREN), + PACK_TOKEN(T_COMMA), + PACK_TOKEN(T_ELLIPSIS), + PACK_TOKEN(T_SEMICOLON) + } + }; + + match = memchr(&tab, yytext[0], sizeof tab[0]); + return UNPACK_TOKEN(match[sizeof tab[0]]); +} {INTEGER} { char *end; errno = 0; - yylval->uintval = strtoumax(yytext, &end, 0); - if (errno == ERANGE) - lex_error("integer constant out of range"); - if (*end) - lex_error("invalid integer constant"); + yylval->uintval = STRTOUMAX(yytext, &end, 0); /* base 0: the prefix of the text selects hexadecimal, octal or decimal */ + if (errno == ERANGE) { + cdecl__errmsg(CDECL__ERANGE); + return T_LEX_ERROR; + } + if (*end) { /* the conversion stopped before the end of the matched text */ + cdecl__errmsg(CDECL__EBADINT); + return T_LEX_ERROR; + } return T_UINT; } {IDENT} { - yylval->strval = malloc(yyleng+1); - if (!yylval->strval) - lex_error("failed to allocate memory"); - - strcpy(yylval->strval, yytext); - return T_IDENT; + unsigned x = cdecl__to_keyword(yytext, yyleng, yyextra); + int tok; /* x carries the specifier type in its low 8 bits and the packed token in the bits above them */ + + yylval->spectype = UNPACK_SPEC(x & 0xff); + if ((tok = (x >> 8)) == 
PACK_TOKEN(T_IDENT)) { + /* + * Our IDENT pattern includes hyphens so we can match + * "variable-length" as a keyword. In all other cases a + * hyphen is an error. + * + * We could use yyless to re-scan the hyphen and hit the + * error catch-all, but jumping straight to the error code + * seems to produce better results with gcc with no obvious + * downsides. + */ +#if 1 + if ((c = strchr(yytext, '-'))) + goto invalid_char; +#else + yyless(strcspn(yytext, "-")); +#endif + dup_token(); + } + return UNPACK_TOKEN(tok); } [[:space:]]+ . { - char buf[] = "syntax error, unexpected #"; - *strchr(buf, '#') = *yytext; - lex_error(buf); + char buf[8]; /* sized for the longest to_readable_ch output: 7 bytes including the terminator */ + + c = yytext; +invalid_char: /* also reached by goto from the identifier rule, with c pointing at a hyphen */ + to_readable_ch(buf, *c); + cdecl__err(CDECL_ENOPARSE, _("syntax error, unexpected %s"), buf); + return T_LEX_ERROR; }