From: jordan@hack_attack
Date: Fri, 30 Sep 2016 00:31:58 +0000 (-0700)
Subject: ragel state machine for lexer/scanner
X-Git-Url: https://git.kengrimes.com/?p=henge%2Fwebcc.git;a=commitdiff_plain;h=dc6e33a26d6488e388817d4dd255dcdff22b4a2a

ragel state machine for lexer/scanner
---

diff --git a/src/apc/lexer.c b/src/apc/lexer.c
index 67aeb23..4d3ab0a 100644
--- a/src/apc/lexer.c
+++ b/src/apc/lexer.c
@@ -30,8 +30,9 @@
 /* Public */
 int lexer_init(void);
 int lexer(void);
-int lexer_lex(const char*);
-void lexer_pushtok(int, int);
+void lexer_pushtok(int, YYSTYPE);
+extern //ragel
+int lexer_lex(const char*);
 struct dirent* lexer_direntpa[DE_STACKSIZE];
 /* Private */
 extern //scanner.c
@@ -44,13 +45,14 @@ extern //bison
 YYSTYPE yylval;
 static
 struct tok
-{ int lval;
-  int tok;
+{ union YYSTYPE val; //token val
+  int tt; //token type
 } token_stack[TK_STACKSIZE];
 static
 union tokp
-{ int* i;
-  struct tok* t;
+{ int* tpt; //token pointer type
+  struct tok* tok;
+  union YYSTYPE* tvp; //token value pointer
 } tks, tkx;
 static
 struct dirent** dps;
@@ -76,14 +78,16 @@ struct dirent** dps;
    times in a sequence!
 */
 #define TK_STACK (token_stack)
-#define TK_STACKP (tks.t)
-#define TK_STACKPI (tks.i)
-#define TK_STACKX (tkx.t)
-#define TK_STACKXI (tkx.i)
+#define TK_STACKP (tks.tok)
+#define TK_STACKPI (tks.tpt)
+#define TK_STACKPL (tks.tvp)
+#define TK_STACKX (tkx.tok)
+#define TK_STACKXI (tkx.tpt)
 #define TK_LEN() (TK_STACKP - TK_STACKX)
 #define TK_INIT() (TK_STACKP = TK_STACKX = TK_STACK)
 #define TK_POP() (*TK_STACKP++)
 #define TK_POPI() (*TK_STACKPI++);
+#define TK_POPL() (*TK_STACKPL++);
 #define TK_PUSH(T,L) (*TK_STACKX++ = (struct tok){L,T})
 
 /* Initializer
@@ -114,20 +118,12 @@ int lexer
       return 0;
     }
   }
-  yylval.val = TK_POPI();
-  return TK_POPI();
+  { struct tok t = TK_POP(); //pop value and type as one struct tok: stepping
+    yylval = t.val;          //tks by YYSTYPE and then by int would not skip
+    return t.tt;             //any padding at the end of struct tok
+  }
 }
 
-/* Lexical Analysis
-   Ragel state machine for tokenizing text.
-*/
-int lexer_lex
-(const char* str)
-{ lexer_pushtok(1, 2);
-  printf (str);
-  return 1;
-}
-
 /* Token Receiver
    This receiver takes a struct tok and pushes it to the FIFO
    stack.
@@ -135,7 +131,7 @@ int lexer_lex
 void lexer_pushtok
 #define S(S)#S //stringifier
 #define ERR_TK "Fatal: Generated over " S(TK_STACKSIZE) " tokens in one pass."
-( int tok, int lval )
+( int tok, YYSTYPE lval )
 { if (TK_LEN() >= TK_STACKSIZE)
     { fprintf(stderr, ERR_TK);
       exit(EXIT_FAILURE);
diff --git a/src/apc/lexer_lex.c b/src/apc/lexer_lex.c
new file mode 100644
index 0000000..ca2b477
--- /dev/null
+++ b/src/apc/lexer_lex.c
@@ -0,0 +1,318 @@
+
+#line 1 "lexer_lex.rl"
+/* Ragel State Machine for tokenizing text */
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+/* NOTE(review): include names reconstructed from usage; YYSTYPE, yylval and the REF/NUM/NAME tokens presumably need the bison header too -- confirm */
+extern void lexer_pushtok(int, YYSTYPE);
+
+int lexer_lex(const char*);
+int ipow(int, int);
+int ttov(const char* str, int);
+uint64_t ttor(const char* str, int);
+char* ttos(const char* str, int);
+
+
+#define MAX_TOK_LEN 64
+#define MAX_TOKENS 16
+#define MAX_STR_SIZE (MAX_TOK_LEN * MAX_TOKENS)
+
+
+
+#line 47 "lexer_lex.rl"
+
+
+
+
+#line 29 "lexer_lex.c"
+static const char _token_matcher_actions[] = {
+	0, 1, 0, 1, 1, 1, 2
+};
+
+static const char _token_matcher_key_offsets[] = {
+	0, 0, 7, 13, 17, 20, 27
+};
+
+static const char _token_matcher_trans_keys[] = {
+	48, 49, 57, 65, 90, 97, 122, 48,
+	57, 65, 70, 97, 102, 95, 120, 48,
+	57, 95, 48, 57, 95, 48, 57, 65,
+	70, 97, 102, 95, 65, 90, 97, 122,
+	0
+};
+
+static const char _token_matcher_single_lengths[] = {
+	0, 1, 0, 2, 1, 1, 1
+};
+
+static const char _token_matcher_range_lengths[] = {
+	0, 3, 3, 1, 1, 3, 2
+};
+
+static const char _token_matcher_index_offsets[] = {
+	0, 0, 5, 9, 13, 16, 21
+};
+
+static const char _token_matcher_indicies[] = {
+	0, 2, 3, 3, 1, 4, 4, 4,
+	1, 5, 6, 2, 1, 5, 2, 1,
+	7, 4, 4, 4, 1, 8, 3, 3,
+	1, 0
+};
+
+static const char _token_matcher_trans_targs[] = {
+	3, 0, 4, 6, 5, 1, 2, 1,
+	1
+};
+
+static const char _token_matcher_trans_actions[] = {
+	0, 0, 0, 0, 0, 3, 0, 1,
+	5
+};
+
+static const char _token_matcher_eof_actions[] = {
+	0, 0, 0, 3, 3, 1, 5
+};
+
+static const int token_matcher_start = 1;
+static const int token_matcher_first_final = 3;
+static const int token_matcher_error = 0;
+
+static const int token_matcher_en_main = 1;
+
+
+#line 51 "lexer_lex.rl"
+
+/* 0x xdigit+ => tok_t REF, yylval.ref = uint64_t
+   [0-9]+ => tok_t NUM, yylval.val = int
+   [a-zA-Z]+ => tok_t NAME, yylval.str = char* (heap copy made by ttos) */
+
+/* Scan a '_'-separated filename and push each of its tokens onto the stack
+   with lexer_pushtok().  Always returns 1; malformed input is not reported. */
+int lexer_lex (const char* str)
+{
+  const char *p, *pe, *ts, *eof;
+  int cs, tok_t ; //tok_t == token type
+
+  p = ts = str;
+  pe = eof = p + strlen(str); //stop before the NUL so the EOF actions run
+
+#line 102 "lexer_lex.c"
+	{
+	cs = token_matcher_start;
+	}
+
+#line 66 "lexer_lex.rl"
+
+#line 109 "lexer_lex.c"
+	{
+	int _klen;
+	unsigned int _trans;
+	const char *_acts;
+	unsigned int _nacts;
+	const char *_keys;
+
+	if ( p == pe )
+		goto _test_eof;
+	if ( cs == 0 )
+		goto _out;
+_resume:
+	_keys = _token_matcher_trans_keys + _token_matcher_key_offsets[cs];
+	_trans = _token_matcher_index_offsets[cs];
+
+	_klen = _token_matcher_single_lengths[cs];
+	if ( _klen > 0 ) {
+		const char *_lower = _keys;
+		const char *_mid;
+		const char *_upper = _keys + _klen - 1;
+		while (1) {
+			if ( _upper < _lower )
+				break;
+
+			_mid = _lower + ((_upper-_lower) >> 1);
+			if ( (*p) < *_mid )
+				_upper = _mid - 1;
+			else if ( (*p) > *_mid )
+				_lower = _mid + 1;
+			else {
+				_trans += (unsigned int)(_mid - _keys);
+				goto _match;
+			}
+		}
+		_keys += _klen;
+		_trans += _klen;
+	}
+
+	_klen = _token_matcher_range_lengths[cs];
+	if ( _klen > 0 ) {
+		const char *_lower = _keys;
+		const char *_mid;
+		const char *_upper = _keys + (_klen<<1) - 2;
+		while (1) {
+			if ( _upper < _lower )
+				break;
+
+			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
+			if ( (*p) < _mid[0] )
+				_upper = _mid - 2;
+			else if ( (*p) > _mid[1] )
+				_lower = _mid + 2;
+			else {
+				_trans += (unsigned int)((_mid - _keys)>>1);
+				goto _match;
+			}
+		}
+		_trans += _klen;
+	}
+
+_match:
+	_trans = _token_matcher_indicies[_trans];
+	cs = _token_matcher_trans_targs[_trans];
+
+	if ( _token_matcher_trans_actions[_trans] == 0 )
+		goto _again;
+
+	_acts = _token_matcher_actions + _token_matcher_trans_actions[_trans];
+	_nacts = (unsigned int) *_acts++;
+	while ( _nacts-- > 0 )
+	{
+		switch ( *_acts++ )
+		{
+	case 0:
+#line 24 "lexer_lex.rl"
+	{
+    tok_t = REF; \
+    yylval.ref = ttor(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+	break;
+	case 1:
+#line 30 "lexer_lex.rl"
+	{ tok_t = NUM; \
+    yylval.val = ttov(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+	break;
+	case 2:
+#line 35 "lexer_lex.rl"
+	{ tok_t = NAME; \
+    yylval.str = ttos(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+	break;
+#line 205 "lexer_lex.c"
+		}
+	}
+
+_again:
+	if ( cs == 0 )
+		goto _out;
+	if ( ++p != pe )
+		goto _resume;
+	_test_eof: {}
+	if ( p == eof )
+	{
+	const char *__acts = _token_matcher_actions + _token_matcher_eof_actions[cs];
+	unsigned int __nacts = (unsigned int) *__acts++;
+	while ( __nacts-- > 0 ) {
+		switch ( *__acts++ ) {
+	case 0:
+#line 24 "lexer_lex.rl"
+	{
+    tok_t = REF; \
+    yylval.ref = ttor(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+	break;
+	case 1:
+#line 30 "lexer_lex.rl"
+	{ tok_t = NUM; \
+    yylval.val = ttov(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+	break;
+	case 2:
+#line 35 "lexer_lex.rl"
+	{ tok_t = NAME; \
+    yylval.str = ttos(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+	break;
+#line 243 "lexer_lex.c"
+		}
+	}
+	}
+
+	_out: {}
+	}
+
+#line 67 "lexer_lex.rl"
+
+  printf("%s", str);
+  return 1;
+}
+
+/* Integer power: BASE raised to EXP; an EXP <= 0 yields 1. */
+int ipow(int base, int exp)
+{
+  int result = 1;
+  while (exp > 0)
+    {
+      if (exp & 1)
+        result = result * base;
+      exp = exp >> 1;
+      if (exp) /* skip the final, unused squaring (it can overflow) */
+        base *= base;
+    }
+
+  return result;
+}
+
+/* Token to Value: convert LEN decimal digits starting at STR to an int. */
+int ttov(const char* str, int len)
+{
+  int i, val = 0;
+
+  for (i = 0; i < len; i++)
+    {
+      val = val * 10 + (str[i] - '0');
+    }
+
+  return val;
+}
+
+/* Token to Reference: convert a "0x"-prefixed hex token of LEN chars at STR. */
+uint64_t ttor(const char* str, int len)
+{
+  int i = 0;
+  uint64_t num = 0;
+
+  if (len >= 2 && str[0] == '0' && str[1] == 'x')
+    i = 2; /* skip the "0x" prefix matched by the ref machine */
+
+  for (; i < len; i++)
+    {
+      char c = str[i];
+      unsigned digit;
+
+      if (c >= '0' && c <= '9')
+        digit = c - '0';
+      else if (c >= 'a' && c <= 'f')
+        digit = c - 'a' + 10;
+      else if (c >= 'A' && c <= 'F')
+        digit = c - 'A' + 10;
+      else
+        break; /* not a hex digit: stop converting */
+      num = (num << 4) | digit;
+    }
+
+  return num;
+}
+
+/* Token to String: heap-allocated, NUL-terminated copy of LEN chars at STR.
+   The caller owns the result and must free() it; NULL if allocation fails. */
+char* ttos(const char* str, int len)
+{
+  return strndup(str, len > 0 ? (size_t) len : 0);
+}
diff --git a/src/apc/lexer_lex.rl b/src/apc/lexer_lex.rl
new file mode 100644
index 0000000..cb068d6
--- /dev/null
+++ b/src/apc/lexer_lex.rl
@@ -0,0 +1,134 @@
+/* Ragel State Machine for tokenizing text */
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+/* NOTE(review): include names reconstructed from usage; YYSTYPE, yylval and the REF/NUM/NAME tokens presumably need the bison header too -- confirm */
+extern void lexer_pushtok(int, YYSTYPE);
+
+int lexer_lex(const char*);
+int ipow(int, int);
+int ttov(const char* str, int);
+uint64_t ttor(const char* str, int);
+char* ttos(const char* str, int);
+
+
+#define MAX_TOK_LEN 64
+#define MAX_TOKENS 16
+#define MAX_STR_SIZE (MAX_TOK_LEN * MAX_TOKENS)
+
+
+%%{
+  machine token_matcher;
+
+  # push the finished token, then move ts past the separator that ended it
+  action set_ref {
+    tok_t = REF; \
+    yylval.ref = ttor(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+
+  action set_val { tok_t = NUM; \
+    yylval.val = ttov(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+
+  action set_name { tok_t = NAME; \
+    yylval.str = ttos(ts, p-ts); \
+    lexer_pushtok(tok_t, yylval); \
+    ts = p + 1; }
+
+  # instantiate machines for each possible token
+  ref = '0x' xdigit+ %set_ref;
+  val = digit+ %set_val;
+  name = alpha+ %set_name;
+  tok = ref | val | name;
+
+  main := (tok . '_')* . tok;
+}%%
+
+
+%%write data;
+
+/* 0x xdigit+ => tok_t REF, yylval.ref = uint64_t
+   [0-9]+ => tok_t NUM, yylval.val = int
+   [a-zA-Z]+ => tok_t NAME, yylval.str = char* (heap copy made by ttos) */
+
+/* Scan a '_'-separated filename and push each of its tokens onto the stack
+   with lexer_pushtok().  Always returns 1; malformed input is not reported. */
+int lexer_lex (const char* str)
+{
+  const char *p, *pe, *ts, *eof;
+  int cs, tok_t ; //tok_t == token type
+
+  p = ts = str;
+  pe = eof = p + strlen(str); //stop before the NUL so the EOF actions run
+  %%write init;
+  %%write exec;
+
+  printf("%s", str);
+  return 1;
+}
+
+/* Integer power: BASE raised to EXP; an EXP <= 0 yields 1. */
+int ipow(int base, int exp)
+{
+  int result = 1;
+  while (exp > 0)
+    {
+      if (exp & 1)
+        result = result * base;
+      exp = exp >> 1;
+      if (exp) /* skip the final, unused squaring (it can overflow) */
+        base *= base;
+    }
+
+  return result;
+}
+
+/* Token to Value: convert LEN decimal digits starting at STR to an int. */
+int ttov(const char* str, int len)
+{
+  int i, val = 0;
+
+  for (i = 0; i < len; i++)
+    {
+      val = val * 10 + (str[i] - '0');
+    }
+
+  return val;
+}
+
+/* Token to Reference: convert a "0x"-prefixed hex token of LEN chars at STR. */
+uint64_t ttor(const char* str, int len)
+{
+  int i = 0;
+  uint64_t num = 0;
+
+  if (len >= 2 && str[0] == '0' && str[1] == 'x')
+    i = 2; /* skip the "0x" prefix matched by the ref machine */
+
+  for (; i < len; i++)
+    {
+      char c = str[i];
+      unsigned digit;
+
+      if (c >= '0' && c <= '9')
+        digit = c - '0';
+      else if (c >= 'a' && c <= 'f')
+        digit = c - 'a' + 10;
+      else if (c >= 'A' && c <= 'F')
+        digit = c - 'A' + 10;
+      else
+        break; /* not a hex digit: stop converting */
+      num = (num << 4) | digit;
+    }
+
+  return num;
+}
+
+/* Token to String: heap-allocated, NUL-terminated copy of LEN chars at STR.
+   The caller owns the result and must free() it; NULL if allocation fails. */
+char* ttos(const char* str, int len)
+{
+  return strndup(str, len > 0 ? (size_t) len : 0);
+}