improved expression parser (can now detect literals and identifiers) - TODO: tokenizer must not split strings and "backtick-identifiers"

Wed, 01 Apr 2015 11:42:38 +0200

author
Mike Becker <universe@uap-core.de>
date
Wed, 01 Apr 2015 11:42:38 +0200
changeset 83
7d20ce5d235b
parent 82
0567444f2d76
child 84
7fca3788261d

improved expression parser (can now detect literals and identifiers) - TODO: tokenizer must not split strings and "backtick-identifiers"

libidav/davqlparser.c file | annotate | diff | comparison | revisions
libidav/davqlparser.h file | annotate | diff | comparison | revisions
--- a/libidav/davqlparser.c	Tue Mar 31 13:00:17 2015 +0200
+++ b/libidav/davqlparser.c	Wed Apr 01 11:42:38 2015 +0200
@@ -61,13 +61,17 @@
 static const char* _map_operator(davqloperator_t op) {
     // don't use string array, because enum values may change
     switch(op) {
-    case ADD: return "+"; case SUB: return "-"; case MUL: return "*";
-    case DIV: return "/"; case AND: return "&"; case OR: return "|";
-    case XOR: return "^"; case NEG: return "~"; case NOT: return "NOT";
-    case LAND: return "AND"; case LOR: return "OR"; case LXOR: return "XOR";
-    case EQ: return "="; case NEQ: return "!="; case LT: return "<";
-    case GT: return ">"; case LE: return "<="; case GE: return ">=";
-    case LIKE: return "LIKE"; case UNLIKE: return "UNLIKE";
+    case DAVQL_NOOP: return "no operator";
+    case DAVQL_ADD: return "+"; case DAVQL_SUB: return "-";
+    case DAVQL_MUL: return "*"; case DAVQL_DIV: return "/";
+    case DAVQL_AND: return "&"; case DAVQL_OR: return "|";
+    case DAVQL_XOR: return "^"; case DAVQL_NEG: return "~";
+    case DAVQL_NOT: return "NOT"; case DAVQL_LAND: return "AND";
+    case DAVQL_LOR: return "OR"; case DAVQL_LXOR: return "XOR";
+    case DAVQL_EQ: return "="; case DAVQL_NEQ: return "!=";
+    case DAVQL_LT: return "<"; case DAVQL_GT: return ">";
+    case DAVQL_LE: return "<="; case DAVQL_GE: return ">=";
+    case DAVQL_LIKE: return "LIKE"; case DAVQL_UNLIKE: return "UNLIKE";
     default: return "unknown";
     }
 }
@@ -86,7 +90,7 @@
     UCX_FOREACH(elm, stmt->fields) {
         DavQLExpression* expr = (DavQLExpression*)elm->data;
         if (expr->type == DAVQL_IDENTIFIER &&
-            expr->srctext.length == 1 && *(expr->srctext.ptr) == '*') {
+            expr->srctext.length == 1 && expr->srctext.ptr[0] == '*') {
             wildcard = 1;
         }
     }
@@ -265,12 +269,61 @@
         tokens = ucx_list_append(tokens, token);
     }
     
+    // now find quotes and backticks and merge enclosed tokens
+    // TODO: make it so or disable tokenization in such cases in above code
+    
     return tokens;
 }
 
-static DavQLExpression* dav_parse_expression(sstr_t src) {
+#define token_sstr(listelem) ((sstr_t*)(listelem)->data)
+static DavQLExpression* dav_parse_expression(UcxList* starttoken, size_t n) {
+    if (n == 0) {
+        return NULL;
+    }
+    
     DavQLExpression *expr = calloc(1, sizeof(DavQLExpression));
-    expr->srctext = src;
+    
+    // set pointer for source text
+    expr->srctext.ptr = token_sstr(starttoken)->ptr;
+    
+    // special case - only one token
+    if (n == 1) {
+        expr->srctext.length = token_sstr(starttoken)->length;
+        char firstchar = expr->srctext.ptr[0];
+        if (firstchar == '\'' || isdigit(firstchar)) {
+            expr->type = DAVQL_LITERAL;
+        } else {
+            expr->type = DAVQL_IDENTIFIER;
+        }
+    } else {
+        UcxList* token = starttoken;
+        
+        // check, if first token is (
+        // if so, verify that last token is ) and throw both away
+        if (!sstrcmp(*token_sstr(token), S("("))) {
+            if (!sstrcmp(*token_sstr(ucx_list_get(token, n-1)), S(")"))) {
+                token = token->next;
+                n -= 2;
+            } else {
+                // TODO: throw syntax error
+            }
+        }
+
+        // process tokens        
+        for (size_t i = 0 ; i < n ; i++) {
+            
+            // TODO: make it so
+
+            // go to next token (if this is not the last token)
+            if (i < n-1) {
+                token = token->next;
+            }
+        }
+
+        // compute length of source text (including delimiters)
+        expr->srctext.length = token_sstr(token)->ptr +
+            token_sstr(token)->length - expr->srctext.ptr;
+    }
     
     return expr;
 }
@@ -290,8 +343,13 @@
      */
     int step = 10;
     
+    // Variables for token sublists for expressions
+    UcxList *exprstart = NULL;
+    size_t exprlen = 0;
+    
+    // Process tokens
     UCX_FOREACH(token, tokens) {
-        sstr_t tokendata = *(sstr_t*)token->data;
+        sstr_t tokendata = *token_sstr(token);
         
         switch (step) {
         // optional clauses
@@ -306,17 +364,35 @@
             }
             break;
         // field list
-        case 10:
-            if (!sstrcasecmp(tokendata, S("from"))) {
-                step = 20;
+        case 10: {
+            _Bool fromkeyword = !sstrcasecmp(tokendata, S("from"));
+            if (fromkeyword || !sstrcmp(tokendata, S(","))) {
+                if (exprstart) {
+                    stmt->fields = ucx_list_append(stmt->fields,
+                        dav_parse_expression(exprstart, exprlen));
+                    exprstart = NULL;
+                    exprlen = 0;
+                } else {
+                    // TODO: throw syntax error
+                }
+                
+                if (fromkeyword) {
+                    step = 20;
+                }
             } else {
-                stmt->fields = ucx_list_append(stmt->fields,
-                    dav_parse_expression(tokendata));
+                // collect tokens for field expression
+                if (exprstart) {
+                    exprlen++;
+                } else {
+                    exprstart = token;
+                    exprlen = 1;
+                }
             }
             break;
+        }
         // from clause
         case 20:
-            stmt->path = dav_parse_expression(tokendata);
+            stmt->path = dav_parse_expression(token, 1);
             step = 520;
             break;
         // where clause
@@ -328,8 +404,6 @@
             step = 500;
             break;
         }
-        
-        free(token->data);
     }
     
     if (step < 500) {
@@ -342,11 +416,8 @@
     stmt->type = DAVQL_SET;
     
     UCX_FOREACH(token, tokens) {
-        sstr_t tokendata = *(sstr_t*)token->data;
+        sstr_t tokendata = *token_sstr(token);
         
-        // just free the tokens, until the function is implemented
-        
-        free(token->data);
     }
 }
 
@@ -363,11 +434,12 @@
     // tokenization
     UcxList* tokens = dav_parse_tokenize(stmt->srctext);
     
-    // use first token to determine query type
     if (tokens) {
-        sstr_t token = *(sstr_t*)tokens->data;
+        // use first token to determine query type
+        sstr_t token = *token_sstr(tokens);
         free(tokens->data);
         tokens = ucx_list_remove(tokens, tokens);
+        
         if (!sstrcasecmp(token, S("get"))) {
             dav_parse_get_statement(stmt, tokens);
         } else if (!sstrcasecmp(token, S("set"))) {
@@ -375,6 +447,11 @@
         } else {
             stmt->type = DAVQL_ERROR;
         }
+        
+        // free token data
+        UCX_FOREACH(token, tokens) {
+            free(token->data);
+        }
         ucx_list_free(tokens);
     } else {
         stmt->type = DAVQL_ERROR;
--- a/libidav/davqlparser.h	Tue Mar 31 13:00:17 2015 +0200
+++ b/libidav/davqlparser.h	Wed Apr 01 11:42:38 2015 +0200
@@ -54,11 +54,12 @@
  * Enumeration of possible expression operators.
  */
 typedef enum {
-    ADD, SUB, MUL, DIV,
-    AND, OR, XOR, NEG,
-    NOT, LAND, LOR, LXOR,
-    EQ, NEQ, LT, GT, LE, GE,
-    LIKE, UNLIKE
+    DAVQL_NOOP,
+    DAVQL_ADD, DAVQL_SUB, DAVQL_MUL, DAVQL_DIV,
+    DAVQL_AND, DAVQL_OR, DAVQL_XOR, DAVQL_NEG,
+    DAVQL_NOT, DAVQL_LAND, DAVQL_LOR, DAVQL_LXOR,
+    DAVQL_EQ, DAVQL_NEQ, DAVQL_LT, DAVQL_GT, DAVQL_LE, DAVQL_GE,
+    DAVQL_LIKE, DAVQL_UNLIKE
 } davqloperator_t;
 
 /**
@@ -110,7 +111,7 @@
  *            | "(", Expression, ")";
  * 
  * FunctionCall    = Identifier, "(", Expression, ")";
- * Identifier      = IdentifierChar, {IdentifierChar}
+ * Identifier      = IdentifierChar - ?Digit?, {IdentifierChar}
  *                 | "`", ?Character?, {?Character?}, "`";
  * IdentifierChar  = ?Character - (" "|",")?;
  * Literal         = ?Digit?, {?Digit?} | String;

mercurial