tokenizer now correctly handles quoted tokens

Sat, 04 Apr 2015 20:37:03 +0200

author
Mike Becker <universe@uap-core.de>
date
Sat, 04 Apr 2015 20:37:03 +0200
changeset 88
4d6b03bd7034
parent 87
ed21d95984bb
child 89
785f6007a0c1
child 100
f4127c4d1018

tokenizer now correctly handles quoted tokens

libidav/davqlparser.c file | annotate | diff | comparison | revisions
libidav/davqlparser.h file | annotate | diff | comparison | revisions
--- a/libidav/davqlparser.c	Sat Apr 04 19:45:58 2015 +0200
+++ b/libidav/davqlparser.c	Sat Apr 04 20:37:03 2015 +0200
@@ -245,14 +245,39 @@
 #define _unexpected_end_msg "unexpected end of statement"
 #define _invalid_msg "invalid statement"
 #define _unexpected_token "unexpected token (%.*s [->]%.*s %.*s)"
+#define _missing_quote "missing closing quote symbol (%.*s)"
 
 static UcxList* dav_parse_tokenize(sstr_t src) {
     UcxList *tokens = NULL;
     
     // Delimiters: whitespace and dead whitespace around commas
     sstr_t *token = NULL;
+    char insequence = '\0';
     for (size_t i = 0 ; i < src.length ; i++) {
-        if (isspace(src.ptr[i])) {
+        // quoted strings / identifiers are a single token
+        if (src.ptr[i] == '\'' || src.ptr[i] == '`') {
+            if (src.ptr[i] == insequence) {
+                // add quoted token to list
+                token->length++;
+                tokens = ucx_list_append(tokens, token);
+                token = NULL;
+                insequence = '\0';
+            } else if (insequence == '\0') {
+                insequence = src.ptr[i];
+                // always create new token for quoted strings
+                if (token) {
+                    tokens = ucx_list_append(tokens, token);
+                }
+                token = malloc(sizeof(sstr_t));
+                token->ptr = src.ptr + i;
+                token->length = 1;
+            } else {
+                // the other quote kind is plain content of this token
+                token->length++;
+            }
+        } else if (insequence) {
+            token->length++;
+        } else if (isspace(src.ptr[i])) {
             // add token before spaces to list (if any)
             if (token) {
                 tokens = ucx_list_append(tokens, token);
@@ -287,14 +312,12 @@
         tokens = ucx_list_append(tokens, token);
     }
     
-    // now find quotes and backsticks and merge enclosed tokens
-    // TODO: make it so or disable tokenization in such cases in above code
-    
     return tokens;
 }
 
 #define token_sstr(listelem) ((sstr_t*)(listelem)->data)
-static DavQLExpression* dav_parse_expression(UcxList* starttoken, size_t n) {
+static DavQLExpression* dav_parse_expression(
+        DavQLStatement* stmt, UcxList* starttoken, size_t n) {
     if (n == 0) {
         return NULL;
     }
@@ -308,11 +331,26 @@
     if (n == 1) {
         expr->srctext.length = token_sstr(starttoken)->length;
         char firstchar = expr->srctext.ptr[0];
+        char lastchar = expr->srctext.ptr[expr->srctext.length-1];
         if (firstchar == '\'' || isdigit(firstchar)) {
             expr->type = DAVQL_LITERAL;
         } else {
             expr->type = DAVQL_IDENTIFIER;
         }
+        // remove quotes (if any)
+        if (firstchar == '\'' || firstchar == '`') {
+            if (lastchar != firstchar) {
+                stmt->errorcode = DAVQL_ERROR_MISSING_QUOTE;
+                stmt->errormessage =
+                    ucx_sprintf(_missing_quote, sfmtarg(expr->srctext)).ptr;
+            }
+            expr->srctext.ptr++;
+            if (expr->srctext.length > 2) {
+                expr->srctext.length -= 2;
+            } else {
+                expr->srctext.length = 0;
+            }
+        }
     } else {
         UcxList* token = starttoken;
         
@@ -388,6 +426,10 @@
     
     // Process tokens
     UCX_FOREACH(token, tokens) {
+        if (stmt->errorcode) {
+            ultrabreak: break;
+        }
+        
         sstr_t tokendata = *token_sstr(token);
         
         switch (step) {
@@ -402,7 +444,7 @@
                 step = 40;
             } else {
                 dav_parse_unexpected_token(stmt, token);
-                step = 999;
+                goto ultrabreak;
             }
             break;
         // field list
@@ -411,7 +453,7 @@
             if (fromkeyword || !sstrcmp(tokendata, S(","))) {
                 if (exprstart) {
                     stmt->fields = ucx_list_append(stmt->fields,
-                        dav_parse_expression(exprstart, exprlen));
+                        dav_parse_expression(stmt, exprstart, exprlen));
                     exprstart = NULL;
                     exprlen = 0;
                 } else {
@@ -434,7 +476,7 @@
         }
         // from clause
         case 20: {
-            DavQLExpression *expr = dav_parse_expression(token, 1);
+            DavQLExpression *expr = dav_parse_expression(stmt, token, 1);
             stmt->path = expr->srctext;
             dav_free_expression(expr);
             step = 520;
--- a/libidav/davqlparser.h	Sat Apr 04 19:45:58 2015 +0200
+++ b/libidav/davqlparser.h	Sat Apr 04 20:37:03 2015 +0200
@@ -207,11 +207,14 @@
 /** Infinity recursion depth for a DavQLStatement. */
 #define DAV_DEPTH_INFINITY -1
 
+/** A closing quote symbol (' or `) is missing. */
+#define DAVQL_ERROR_MISSING_QUOTE 50
+
 /** No more tokens to parse, but the parser expected more. */
-#define DAVQL_ERROR_UNEXPECTED_END 1000
+#define DAVQL_ERROR_UNEXPECTED_END 100
 
 /** A token was found, which has not been expected. */
-#define DAVQL_ERROR_UNEXPECTED_TOKEN 1010
+#define DAVQL_ERROR_UNEXPECTED_TOKEN 101
 
 /** Nothing about the statement seems legit. */
 #define DAVQL_ERROR_INVALID -1

mercurial