ragel state machine for lexer/scanner

[henge/webcc.git] / src / apc / lexer_lex.c
diff --git a/src/apc/lexer_lex.c b/src/apc/lexer_lex.c

new file mode 100644 (file)

index 0000000..ca2b477
--- /dev/null
+++ b/src/apc/lexer_lex.c
@@ -0,0 +1,307 @@
+
+#line 1 "lexer_lex.rl"
+/* Ragel State Machine for tokenizing text */
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <apc/parser.tab.h>
+
+/* Token sink, declared here and not defined in the visible part of this
+   file: called with a token type and the semantic value that goes with it. */
+extern void lexer_pushtok(int tok_t, YYSTYPE lval);
+
+/* Scanner entry point and the helpers defined further down in this file. */
+int      lexer_lex(const char *str);
+int      ipow(int base, int exp);
+int      ttov(const char *str, int len);
+uint64_t ttor(const char *str, int len);
+char    *ttos(const char *str, int len);
+
+
+/* Token size limits; MAX_STR_SIZE is derived from the other two. */
+#define MAX_TOK_LEN 64
+#define MAX_TOKENS 16
+#define MAX_STR_SIZE (MAX_TOK_LEN * MAX_TOKENS)
+
+
+
+#line 47 "lexer_lex.rl"
+
+
+
+
+#line 29 "lexer_lex.c"
+/* Tables for the token_matcher state machine, produced by Ragel from
+   lexer_lex.rl.  The per-state tables are indexed by the current state
+   `cs` inside lexer_lex(). */
+
+/* Packed action lists: at each entry offset, a count followed by that many
+   action ids.  lexer_lex() handles id 0 as REF, 1 as NUM and 2 as NAME;
+   offset 0 is the empty list. */
+static const char _token_matcher_actions[] = {
+       0, 1, 0, 1, 1, 1, 2
+};
+
+/* Per state: offset of the state's first key in _token_matcher_trans_keys. */
+static const char _token_matcher_key_offsets[] = {
+       0, 0, 7, 13, 17, 20, 27
+};
+
+/* Character codes compared against *p.  For each state: its single keys
+   first, then its (low, high) range pairs. */
+static const char _token_matcher_trans_keys[] = {
+       48, 49, 57, 65, 90, 97, 122, 48, 
+       57, 65, 70, 97, 102, 95, 120, 48, 
+       57, 95, 48, 57, 95, 48, 57, 65, 
+       70, 97, 102, 95, 65, 90, 97, 122, 
+       0
+};
+
+/* Per state: number of single keys. */
+static const char _token_matcher_single_lengths[] = {
+       0, 1, 0, 2, 1, 1, 1
+};
+
+/* Per state: number of (low, high) range pairs. */
+static const char _token_matcher_range_lengths[] = {
+       0, 3, 3, 1, 1, 3, 2
+};
+
+/* Per state: offset of the state's first slot in _token_matcher_indicies. */
+static const char _token_matcher_index_offsets[] = {
+       0, 0, 5, 9, 13, 16, 21
+};
+
+/* Per state: one slot for each single key and each range, in key order,
+   followed by one slot that is used when no key matches.  Each slot holds
+   a transition id. */
+static const char _token_matcher_indicies[] = {
+       0, 2, 3, 3, 1, 4, 4, 4, 
+       1, 5, 6, 2, 1, 5, 2, 1, 
+       7, 4, 4, 4, 1, 8, 3, 3, 
+       1, 0
+};
+
+/* Per transition id: the state entered by that transition. */
+static const char _token_matcher_trans_targs[] = {
+       3, 0, 4, 6, 5, 1, 2, 1, 
+       1
+};
+
+/* Per transition id: offset into _token_matcher_actions of the actions to
+   run, or 0 for none. */
+static const char _token_matcher_trans_actions[] = {
+       0, 0, 0, 0, 0, 3, 0, 1, 
+       5
+};
+
+/* Per state: offset into _token_matcher_actions of the actions run when
+   the input ends (p == eof) in that state. */
+static const char _token_matcher_eof_actions[] = {
+       0, 0, 0, 3, 3, 1, 5
+};
+
+/* State assigned to cs before scanning starts. */
+static const int token_matcher_start = 1;
+/* Not referenced by the code visible in this file. */
+static const int token_matcher_first_final = 3;
+/* Error state: lexer_lex() stops scanning once cs == 0. */
+static const int token_matcher_error = 0;
+
+/* Not referenced by the code visible in this file. */
+static const int token_matcher_en_main = 1;
+
+
+#line 51 "lexer_lex.rl"
+
+/*  "0x" xdigit+  => tok_t REF,  yylval.ref = uint64_t
+    [0-9]+        => tok_t NUM,  yylval.val = int
+    [a-zA-Z]+     => tok_t NAME, yylval.str = char*
+    Tokens are separated by '_'.                       */
+
+/* Scan a filename and push its tokens onto the token stack.
+ *
+ * STR is split on '_' by the token_matcher state machine.  Each piece is
+ * classified as REF ("0x" followed by hex digits), NUM (decimal digits) or
+ * NAME (letters), converted with ttor() / ttov() / ttos() and handed to
+ * lexer_pushtok().  Scanning stops at the first character the machine does
+ * not accept (cs becomes the error state 0); tokens completed before that
+ * point have already been pushed.
+ *
+ * The scanned string is echoed to stdout.
+ *
+ * Returns 0 when STR is NULL, otherwise 1 (also when scanning stopped
+ * early on an unexpected character).
+ *
+ * NOTE(review): this body is Ragel output corrected by hand; the same
+ * corrections belong in lexer_lex.rl so they survive regeneration.
+ */
+int lexer_lex (const char* str)
+{
+  const char *p, *pe, *ts, *eof;
+  int  cs, tok_t ; //tok_t == token type
+
+  if (str == NULL)
+    return 0;
+
+  p = ts = str;
+  /* pe stops before the NUL terminator: feeding the terminator to the
+     machine would force the error state and drop the final token. */
+  pe = p + strlen(str);
+  /* The whole input is available at once, so end of buffer is end of
+     input; this is what lets the EOF actions below run. */
+  eof = pe;
+  
+#line 102 "lexer_lex.c"
+       {
+       cs = token_matcher_start;
+       }
+
+#line 66 "lexer_lex.rl"
+  
+#line 109 "lexer_lex.c"
+       {
+       int _klen;
+       unsigned int _trans;
+       const char *_acts;
+       unsigned int _nacts;
+       const char *_keys;
+
+       if ( p == pe )
+               goto _test_eof;
+       if ( cs == 0 )
+               goto _out;
+_resume:
+       _keys = _token_matcher_trans_keys + _token_matcher_key_offsets[cs];
+       _trans = _token_matcher_index_offsets[cs];
+
+       /* Binary search over the state's single keys. */
+       _klen = _token_matcher_single_lengths[cs];
+       if ( _klen > 0 ) {
+               const char *_lower = _keys;
+               const char *_mid;
+               const char *_upper = _keys + _klen - 1;
+               while (1) {
+                       if ( _upper < _lower )
+                               break;
+
+                       _mid = _lower + ((_upper-_lower) >> 1);
+                       if ( (*p) < *_mid )
+                               _upper = _mid - 1;
+                       else if ( (*p) > *_mid )
+                               _lower = _mid + 1;
+                       else {
+                               _trans += (unsigned int)(_mid - _keys);
+                               goto _match;
+                       }
+               }
+               _keys += _klen;
+               _trans += _klen;
+       }
+
+       /* Binary search over the state's (low, high) range pairs. */
+       _klen = _token_matcher_range_lengths[cs];
+       if ( _klen > 0 ) {
+               const char *_lower = _keys;
+               const char *_mid;
+               const char *_upper = _keys + (_klen<<1) - 2;
+               while (1) {
+                       if ( _upper < _lower )
+                               break;
+
+                       _mid = _lower + (((_upper-_lower) >> 1) & ~1);
+                       if ( (*p) < _mid[0] )
+                               _upper = _mid - 2;
+                       else if ( (*p) > _mid[1] )
+                               _lower = _mid + 2;
+                       else {
+                               _trans += (unsigned int)((_mid - _keys)>>1);
+                               goto _match;
+                       }
+               }
+               /* No key matched: fall through to the default slot. */
+               _trans += _klen;
+       }
+
+_match:
+       _trans = _token_matcher_indicies[_trans];
+       cs = _token_matcher_trans_targs[_trans];
+
+       if ( _token_matcher_trans_actions[_trans] == 0 )
+               goto _again;
+
+       /* Transition actions only fire on the '_' separator, so p points at
+          the separator here and the next token starts at p + 1. */
+       _acts = _token_matcher_actions + _token_matcher_trans_actions[_trans];
+       _nacts = (unsigned int) *_acts++;
+       while ( _nacts-- > 0 )
+       {
+               switch ( *_acts++ )
+               {
+       case 0:
+#line 24 "lexer_lex.rl"
+       {
+                   tok_t = REF;
+                   yylval.ref = ttor(ts, p-ts);
+                   lexer_pushtok(tok_t, yylval);
+                   ts = p + 1;   }
+       break;
+       case 1:
+#line 30 "lexer_lex.rl"
+       { tok_t = NUM;
+                   yylval.val = ttov(ts, p-ts);
+                   lexer_pushtok(tok_t, yylval);
+                   ts = p + 1;   }
+       break;
+       case 2:
+#line 35 "lexer_lex.rl"
+       { tok_t = NAME;
+                    yylval.str = ttos(ts, p-ts);
+                    lexer_pushtok(tok_t, yylval);
+                    ts = p + 1;   }
+       break;
+#line 205 "lexer_lex.c"
+               }
+       }
+
+_again:
+       if ( cs == 0 )
+               goto _out;
+       if ( ++p != pe )
+               goto _resume;
+       _test_eof: {}
+       if ( p == eof )
+       {
+       /* End of input: push the token that was still open, if any. */
+       const char *__acts = _token_matcher_actions + _token_matcher_eof_actions[cs];
+       unsigned int __nacts = (unsigned int) *__acts++;
+       while ( __nacts-- > 0 ) {
+               switch ( *__acts++ ) {
+       case 0:
+#line 24 "lexer_lex.rl"
+       {
+                   tok_t = REF;
+                   yylval.ref = ttor(ts, p-ts);
+                   lexer_pushtok(tok_t, yylval);
+                   ts = p;   }
+       break;
+       case 1:
+#line 30 "lexer_lex.rl"
+       { tok_t = NUM;
+                   yylval.val = ttov(ts, p-ts);
+                   lexer_pushtok(tok_t, yylval);
+                   ts = p;   }
+       break;
+       case 2:
+#line 35 "lexer_lex.rl"
+       { tok_t = NAME;
+                    yylval.str = ttos(ts, p-ts);
+                    lexer_pushtok(tok_t, yylval);
+                    ts = p;   }
+       break;
+#line 243 "lexer_lex.c"
+               }
+       }
+       }
+
+       _out: {}
+       }
+
+#line 67 "lexer_lex.rl"
+
+  /* Every token is pushed by the actions above; nothing is left to push
+     here.  Echo the input through a fixed format so that '%' characters in
+     a filename are printed literally. */
+  printf ("%s", str);
+  return 1;
+}
+
+/* Integer power by repeated squaring.
+ *
+ * Returns BASE raised to EXP.  A negative EXP has no integer result and
+ * yields 0.  The caller must ensure the result fits in an int.
+ */
+int ipow(int base, int exp)
+{
+  int result = 1;
+
+  if (exp < 0)
+    return 0;
+
+  while (exp > 0)
+    {
+      if (exp & 1)
+        result *= base;
+      exp >>= 1;
+      /* Square only while exponent bits remain: one more squaring after
+         the last bit can overflow even though the result itself fits. */
+      if (exp > 0)
+        base *= base;
+    }
+
+  return result;
+}
+
+/* Token to Value: convert the decimal digits in STR[0..LEN) to an int.
+ *
+ * Leading characters that are not decimal digits are skipped, and
+ * conversion stops at the first non-digit after the number.  Returns 0
+ * when no digit is found (including LEN <= 0) and INT_MAX when the value
+ * does not fit in an int.
+ */
+int ttov(const char* str, int len)
+{
+  int i = 0, digit, val = 0;
+
+  while (i < len && (str[i] < '0' || str[i] > '9'))
+    i++;
+
+  for (; i < len && str[i] >= '0' && str[i] <= '9'; i++)
+    {
+      digit = str[i] - '0';
+      /* Saturate instead of overflowing: val * 10 + digit > INT_MAX. */
+      if (val > (INT_MAX - digit) / 10)
+        return INT_MAX;
+      val = val * 10 + digit;
+    }
+
+  return val;
+}
+
+/* Token to Ref: convert the number in STR[0..LEN) to a uint64_t.
+ *
+ * Leading characters that are not decimal digits are skipped.  A "0x" or
+ * "0X" prefix selects hexadecimal (upper- or lower-case digits); without
+ * the prefix the digits are read as decimal.  Conversion stops at the
+ * first character that is not a digit of the selected base.  Returns 0
+ * when no digit is found; values wider than 64 bits wrap modulo 2^64.
+ */
+uint64_t ttor(const char* str, int len)
+{
+  int i = 0;
+  unsigned digit, base = 10;
+  uint64_t num = 0;
+
+  while (i < len && (str[i] < '0' || str[i] > '9'))
+    i++;
+
+  if (len - i >= 2 && str[i] == '0' && (str[i + 1] == 'x' || str[i + 1] == 'X'))
+    {
+      base = 16;
+      i += 2;
+    }
+
+  for (; i < len; i++)
+    {
+      char c = str[i];
+
+      if (c >= '0' && c <= '9')
+        digit = (unsigned)(c - '0');
+      else if (base == 16 && c >= 'a' && c <= 'f')
+        digit = (unsigned)(c - 'a') + 10u;
+      else if (base == 16 && c >= 'A' && c <= 'F')
+        digit = (unsigned)(c - 'A') + 10u;
+      else
+        break;
+
+      num = num * base + digit;
+    }
+
+  return num;
+}
+
+/* Token to String: return a newly allocated, NUL-terminated copy of the
+ * LEN bytes starting at STR.
+ *
+ * The copy is sized to the token, so there is no upper limit on LEN.
+ * Returns NULL when STR is NULL, LEN is negative or allocation fails.
+ * The caller owns the returned buffer and releases it with free().
+ */
+char* ttos(const char* str, int len)
+{
+  char *token;
+
+  if (str == NULL || len < 0)
+    return NULL;
+
+  token = malloc((size_t)len + 1);
+  if (token == NULL)
+    return NULL;
+
+  memcpy(token, str, (size_t)len);
+  token[len] = '\0';
+
+  return token;
+}