src/ucx/string.c

branch
config
changeset 254
4784c14aa639
parent 135
471e28cca288
child 260
4779a6fb4fbe
--- a/src/ucx/string.c	Sun Aug 23 22:02:01 2020 +0200
+++ b/src/ucx/string.c	Sun Aug 23 23:04:17 2020 +0200
@@ -1,7 +1,7 @@
 /*
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
  *
- * Copyright 2016 Olaf Wintermann. All rights reserved.
+ * Copyright 2017 Mike Becker, Olaf Wintermann All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -26,13 +26,19 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "ucx/string.h"
+
+#include "ucx/allocator.h"
+
 #include <stdlib.h>
 #include <string.h>
 #include <stdarg.h>
+#include <stdint.h>
 #include <ctype.h>
 
-#include "string.h"
-#include "allocator.h"
+#ifndef _WIN32
+#include <strings.h> /* for strncasecmp() */
+#endif /* _WIN32 */
 
 sstr_t sstr(char *cstring) {
     sstr_t string;
@@ -48,13 +54,35 @@
     return string;
 }
 
-size_t sstrnlen(size_t n, sstr_t s, ...) {
+scstr_t scstr(const char *cstring) {
+    /* Wrap a NUL-terminated C string without copying it; the stored
+     * length is strlen(cstring), so the terminator is not counted. */
+    scstr_t wrapped = { cstring, strlen(cstring) };
+    return wrapped;
+}
+
+scstr_t scstrn(const char *cstring, size_t length) {
+    /* Wrap cstring without copying it; the caller-supplied length is
+     * stored as given and is not checked against the actual data. */
+    scstr_t wrapped = { cstring, length };
+    return wrapped;
+}
+
+
+size_t scstrnlen(size_t n, ...) {
+    if (n == 0) return 0;
+    
     va_list ap;
-    size_t size = s.length;
-    va_start(ap, s);
+    va_start(ap, n);
+    
+    size_t size = 0;
 
-    for (size_t i = 1 ; i < n ; i++) {
-        sstr_t str = va_arg(ap, sstr_t);
+    for (size_t i = 0 ; i < n ; i++) {
+        scstr_t str = va_arg(ap, scstr_t);
+        if(SIZE_MAX - str.length < size) {
+            size = SIZE_MAX;
+            break;
+        }
         size += str.length;
     }
     va_end(ap);
@@ -65,8 +93,7 @@
 static sstr_t sstrvcat_a(
         UcxAllocator *a,
         size_t count,
-        sstr_t s1,
-        sstr_t s2,
+        scstr_t s1,
         va_list ap) {
     sstr_t str;
     str.ptr = NULL;
@@ -75,7 +102,13 @@
         return str;
     }
     
-    sstr_t *strings = (sstr_t*) calloc(count, sizeof(sstr_t));
+    scstr_t s2 = va_arg (ap, scstr_t);
+    
+    if(((size_t)-1) - s1.length < s2.length) {
+        return str;
+    }
+    
+    scstr_t *strings = (scstr_t*) calloc(count, sizeof(scstr_t));
     if(!strings) {
         return str;
     }
@@ -83,16 +116,25 @@
     // get all args and overall length
     strings[0] = s1;
     strings[1] = s2;
-    size_t strlen = s1.length + s2.length;
+    size_t slen = s1.length + s2.length;
+    int error = 0;
     for (size_t i=2;i<count;i++) {
-        sstr_t s = va_arg (ap, sstr_t);
+        scstr_t s = va_arg (ap, scstr_t);
         strings[i] = s;
-        strlen += s.length;
+        if(((size_t)-1) - s.length < slen) {
+            error = 1;
+            break;
+        }
+        slen += s.length;
+    }
+    if(error) {
+        free(strings);
+        return str;
     }
     
     // create new string
-    str.ptr = (char*) almalloc(a, strlen + 1);
-    str.length = strlen;
+    str.ptr = (char*) almalloc(a, slen + 1);
+    str.length = slen;
     if(!str.ptr) {
         free(strings);
         str.length = 0;
@@ -102,7 +144,7 @@
     // concatenate strings
     size_t pos = 0;
     for (size_t i=0;i<count;i++) {
-        sstr_t s = strings[i];
+        scstr_t s = strings[i];
         memcpy(str.ptr + pos, s.ptr, s.length);
         pos += s.length;
     }
@@ -114,20 +156,42 @@
     return str;
 }
 
-sstr_t sstrcat(size_t count, sstr_t s1, sstr_t s2, ...) {
+sstr_t scstrcat(size_t count, scstr_t s1, ...) {
     va_list ap;
-    va_start(ap, s2);
-    sstr_t s = sstrvcat_a(ucx_default_allocator(), count, s1, s2, ap);
+    va_start(ap, s1);
+    sstr_t s = sstrvcat_a(ucx_default_allocator(), count, s1, ap);
+    va_end(ap);
+    return s;
+}
+
+sstr_t scstrcat_a(UcxAllocator *a, size_t count, scstr_t s1, ...) {
+    va_list ap;
+    va_start(ap, s1);
+    sstr_t s = sstrvcat_a(a, count, s1, ap);
     va_end(ap);
     return s;
 }
 
-sstr_t sstrcat_a(UcxAllocator *a, size_t count, sstr_t s1, sstr_t s2, ...) {
-    va_list ap;
-    va_start(ap, s2);
-    sstr_t s = sstrvcat_a(a, count, s1, s2, ap);
-    va_end(ap);
-    return s;
+static int ucx_substring(
+        size_t str_length,
+        size_t start,
+        size_t length,
+        size_t *newlen,
+        size_t *newpos)
+{
+    *newlen = 0;
+    *newpos = 0;
+    
+    if(start > str_length) {
+        return 0;
+    }
+    
+    if(length > str_length - start) {
+        length = str_length - start;
+    }
+    *newlen = length;
+    *newpos = start;
+    return 1;
 }
 
 sstr_t sstrsubs(sstr_t s, size_t start) {
@@ -135,132 +199,301 @@
 }
 
 sstr_t sstrsubsl(sstr_t s, size_t start, size_t length) {
-    sstr_t new_sstr;
-    if (start >= s.length) {
-        new_sstr.ptr = NULL;
-        new_sstr.length = 0;
-    } else {
-        if (length > s.length-start) {
-            length = s.length-start;
+    size_t pos;
+    sstr_t ret = { NULL, 0 };
+    if(ucx_substring(s.length, start, length, &ret.length, &pos)) {
+        ret.ptr = s.ptr + pos;
+    }
+    return ret;
+}
+
+scstr_t scstrsubs(scstr_t string, size_t start) {
+    return scstrsubsl(string, start, string.length-start);
+}
+
+scstr_t scstrsubsl(scstr_t s, size_t start, size_t length) {
+    size_t pos;
+    scstr_t ret = { NULL, 0 };
+    if(ucx_substring(s.length, start, length, &ret.length, &pos)) {
+        ret.ptr = s.ptr + pos;
+    }
+    return ret;
+}
+
+
+static int ucx_strchr(const char *str, size_t length, int chr, size_t *pos) {
+    for(size_t i=0;i<length;i++) {
+        if(str[i] == chr) {
+            *pos = i;
+            return 1;
         }
-        new_sstr.ptr = &s.ptr[start];
-        new_sstr.length = length;
     }
-    return new_sstr;
+    return 0;
+}
+
+static int ucx_strrchr(const char *str, size_t length, int chr, size_t *pos) {
+    if(length > 0) {
+        for(size_t i=length ; i>0 ; i--) {
+            if(str[i-1] == chr) {
+                *pos = i-1;
+                return 1;
+            }
+        }
+    }
+    return 0;
 }
 
 sstr_t sstrchr(sstr_t s, int c) {
-    for(size_t i=0;i<s.length;i++) {
-        if(s.ptr[i] == c) {
-            return sstrsubs(s, i);
-        }
+    size_t pos = 0;
+    if(ucx_strchr(s.ptr, s.length, c, &pos)) {
+        return sstrsubs(s, pos);
     }
-    sstr_t n;
-    n.ptr = NULL;
-    n.length = 0;
-    return n;
+    return sstrn(NULL, 0);
 }
 
 sstr_t sstrrchr(sstr_t s, int c) {
-    if (s.length > 0) {
-        for(size_t i=s.length;i>0;i--) {
-            if(s.ptr[i-1] == c) {
-                return sstrsubs(s, i-1);
-            }
-        }
+    size_t pos = 0;
+    if(ucx_strrchr(s.ptr, s.length, c, &pos)) {
+        return sstrsubs(s, pos);
     }
-    sstr_t n;
-    n.ptr = NULL;
-    n.length = 0;
-    return n;
+    return sstrn(NULL, 0);
+}
+
+scstr_t scstrchr(scstr_t s, int c) {
+    size_t pos = 0;
+    if(ucx_strchr(s.ptr, s.length, c, &pos)) {
+        return scstrsubs(s, pos);
+    }
+    return scstrn(NULL, 0);
 }
 
-sstr_t sstrstr(sstr_t string, sstr_t match) {
-    if (match.length == 0) {
-        return string;
+scstr_t scstrrchr(scstr_t s, int c) {
+    size_t pos = 0;
+    if(ucx_strrchr(s.ptr, s.length, c, &pos)) {
+        return scstrsubs(s, pos);
+    }
+    return scstrn(NULL, 0);
+}
+/* Prefix-table slot access: uint8_t cells unless useheap, then size_t. */
+#define ptable_r(dest, useheap, ptable, index) ((dest) = (useheap) ? \
+    ((size_t*)(ptable))[(index)] : (size_t) ((uint8_t*)(ptable))[(index)])
+
+#define ptable_w(useheap, ptable, index, src) do {\
+    if (!(useheap)) ((uint8_t*)(ptable))[(index)] = (uint8_t) (src);\
+    else ((size_t*)(ptable))[(index)] = (src);\
+    } while (0)
+
+
+static const char* ucx_strstr(
+        const char *str,
+        size_t length,
+        const char *match,
+        size_t matchlen,
+        size_t *newlen)
+{
+    *newlen = length;
+    if (matchlen == 0) {
+        return str;
     }
     
-    for (size_t i = 0 ; i < string.length ; i++) {
-        sstr_t substr = sstrsubs(string, i);
-        if (sstrprefix(substr, match)) {
-            return substr;
+    const char *result = NULL;
+    size_t resultlen = 0;
+    
+    /*
+     * IMPORTANT:
+     * Our prefix table stores each prefix length PLUS ONE.
+     * This is deliberate: we want to use the full range of size_t,
+     * whereas the original algorithm needs a (-1) at one single place,
+     * and we want to avoid that.
+     */
+    
+    /* static prefix table */
+    static uint8_t s_prefix_table[256];
+    
+    /* check the pattern length and use the appropriate prefix table */
+    /* if the pattern exceeds the static prefix table, allocate on the heap */
+    register int useheap = matchlen > 255;
+    register void* ptable = useheap ?
+        calloc(matchlen+1, sizeof(size_t)): s_prefix_table;
+    
+    /* keep counter in registers */
+    register size_t i, j;
+    
+    /* fill prefix table */
+    i = 0; j = 0;
+    ptable_w(useheap, ptable, i, j);
+    while (i < matchlen) {
+        while (j >= 1 && match[j-1] != match[i]) {
+            ptable_r(j, useheap, ptable, j-1);
         }
+        i++; j++;
+        ptable_w(useheap, ptable, i, j);
+    }
+
+    /* search */
+    i = 0; j = 1;
+    while (i < length) {
+        while (j >= 1 && str[i] != match[j-1]) {
+            ptable_r(j, useheap, ptable, j-1);
+        }
+        i++; j++;
+        if (j-1 == matchlen) {
+            size_t start = i - matchlen;
+            result = str + start;
+            resultlen = length - start;
+            break;
+        }
+    }
+
+    /* if prefix table was allocated on the heap, free it */
+    if (ptable != s_prefix_table) {
+        free(ptable);
     }
     
-    sstr_t emptystr;
-    emptystr.length = 0;
-    emptystr.ptr = NULL;
-    return emptystr;
+    *newlen = resultlen;
+    return result;
+}
+
+sstr_t scstrsstr(sstr_t string, scstr_t match) {
+    sstr_t result;
+    
+    size_t reslen;
+    const char *resstr = ucx_strstr(string.ptr, string.length, match.ptr, match.length, &reslen);
+    if(!resstr) {
+        result.ptr = NULL;
+        result.length = 0;
+        return result;
+    }
+    
+    size_t pos = resstr - string.ptr;
+    result.ptr = string.ptr + pos;
+    result.length = reslen;
+    
+    return result;
 }
 
-sstr_t* sstrsplit(sstr_t s, sstr_t d, ssize_t *n) {
-    return sstrsplit_a(ucx_default_allocator(), s, d, n);
+scstr_t scstrscstr(scstr_t string, scstr_t match) {
+    scstr_t result;
+    
+    size_t reslen;
+    const char *resstr = ucx_strstr(string.ptr, string.length, match.ptr, match.length, &reslen);
+    if(!resstr) {
+        result.ptr = NULL;
+        result.length = 0;
+        return result;
+    }
+    
+    size_t pos = resstr - string.ptr;
+    result.ptr = string.ptr + pos;
+    result.length = reslen;
+    
+    return result;
 }
 
-sstr_t* sstrsplit_a(UcxAllocator *allocator, sstr_t s, sstr_t d, ssize_t *n) {
+#undef ptable_r
+#undef ptable_w
+
+sstr_t* scstrsplit(scstr_t s, scstr_t d, ssize_t *n) {
+    return scstrsplit_a(ucx_default_allocator(), s, d, n);
+}
+
+sstr_t* scstrsplit_a(UcxAllocator *allocator, scstr_t s, scstr_t d, ssize_t *n) {
     if (s.length == 0 || d.length == 0) {
         *n = -1;
         return NULL;
     }
-
-    sstr_t* result;
-    ssize_t nmax = *n;
-    *n = 1;
-
-    /* special case: exact match - no processing needed */
-    if (sstrcmp(s, d) == 0) {
-        *n = 0;
-        return NULL;
+    
+    /* special cases: delimiter is at least as large as the string */
+    if (d.length >= s.length) {
+        /* exact match */
+        if (sstrcmp(s, d) == 0) {
+            *n = 0;
+            return NULL;
+        } else /* no match possible */ {
+            *n = 1;
+            sstr_t *result = (sstr_t*) almalloc(allocator, sizeof(sstr_t));
+            if(result) {
+                *result = sstrdup_a(allocator, s);
+            } else {
+                *n = -2;
+            }
+            return result;
+        }
     }
-    sstr_t sv = sstrdup(s);
-    if (sv.length == 0) {
-        *n = -2;
-        return NULL;
-    }
-
-    for (size_t i = 0 ; i < s.length ; i++) {
-        sstr_t substr = sstrsubs(sv, i);
-        if (sstrprefix(substr, d)) {
-            (*n)++;
-            for (size_t j = 0 ; j < d.length ; j++) {
-                sv.ptr[i+j] = 0;
-            }
-            i += d.length - 1; // -1, because the loop will do a i++
-        }
-        if ((*n) == nmax) break;
-    }
-    result = (sstr_t*) almalloc(allocator, sizeof(sstr_t)*(*n));
+    
+    ssize_t nmax = *n;
+    size_t arrlen = 16;
+    sstr_t* result = (sstr_t*) alcalloc(allocator, arrlen, sizeof(sstr_t));
 
     if (result) {
-        char *pptr = sv.ptr;
-        for (ssize_t i = 0 ; i < *n ; i++) {
-            size_t l = strlen(pptr);
-            char* ptr = (char*) almalloc(allocator, l + 1);
-            if (ptr) {
-                memcpy(ptr, pptr, l);
-                ptr[l] = 0;
+        scstr_t curpos = s;
+        ssize_t j = 1;
+        while (1) {
+            scstr_t match;
+            /* optimize for one byte delimiters */
+            if (d.length == 1) {
+                match = curpos;
+                for (size_t i = 0 ; i < curpos.length ; i++) {
+                    if (curpos.ptr[i] == *(d.ptr)) {
+                        match.ptr = curpos.ptr + i;
+                        break;
+                    }
+                    match.length--;
+                }
+            } else {
+                match = scstrscstr(curpos, d);
+            }
+            if (match.length > 0) {
+                /* not yet at the nmax limit (nmax == 0: no limit)? */
+                if (nmax == 0 || j < nmax) {
+                    /* copy the current string to the array */
+                    scstr_t item = scstrn(curpos.ptr, match.ptr - curpos.ptr);
+                    result[j-1] = sstrdup_a(allocator, item);
+                    size_t processed = item.length + d.length;
+                    curpos.ptr += processed;
+                    curpos.length -= processed;
 
-                result[i] = sstrn(ptr, l);
-                pptr += l + d.length;
+                    /* allocate memory for the next string */
+                    j++;
+                    if (j > arrlen) {
+                        arrlen *= 2;
+                        size_t reallocsz;
+                        sstr_t* reallocated = NULL;
+                        if(!ucx_szmul(arrlen, sizeof(sstr_t), &reallocsz)) {
+                            reallocated = (sstr_t*) alrealloc(
+                                    allocator, result, reallocsz);
+                        }
+                        if (reallocated) {
+                            result = reallocated;
+                        } else {
+                            for (ssize_t i = 0 ; i < j-1 ; i++) {
+                                alfree(allocator, result[i].ptr);
+                            }
+                            alfree(allocator, result);
+                            *n = -2;
+                            return NULL;
+                        }
+                    }
+                } else {
+                    /* nmax reached, copy the _full_ remaining string */
+                    result[j-1] = sstrdup_a(allocator, curpos);
+                    break;
+                }
             } else {
-                for (ssize_t j = i-1 ; j >= 0 ; j--) {
-                    alfree(allocator, result[j].ptr);
-                }
-                alfree(allocator, result);
-                *n = -2;
+                /* no more matches, copy last string */
+                result[j-1] = sstrdup_a(allocator, curpos);
                 break;
             }
         }
+        *n = j;
     } else {
         *n = -2;
     }
-    
-    free(sv.ptr);
 
     return result;
 }
 
-int sstrcmp(sstr_t s1, sstr_t s2) {
+int scstrcmp(scstr_t s1, scstr_t s2) {
     if (s1.length == s2.length) {
         return memcmp(s1.ptr, s2.ptr, s1.length);
     } else if (s1.length > s2.length) {
@@ -270,7 +503,7 @@
     }
 }
 
-int sstrcasecmp(sstr_t s1, sstr_t s2) {
+int scstrcasecmp(scstr_t s1, scstr_t s2) {
     if (s1.length == s2.length) {
 #ifdef _WIN32
         return _strnicmp(s1.ptr, s2.ptr, s1.length);
@@ -284,11 +517,11 @@
     }
 }
 
-sstr_t sstrdup(sstr_t s) {
+sstr_t scstrdup(scstr_t s) {
     return sstrdup_a(ucx_default_allocator(), s);
 }
 
-sstr_t sstrdup_a(UcxAllocator *allocator, sstr_t s) {
+sstr_t scstrdup_a(UcxAllocator *allocator, scstr_t s) {
     sstr_t newstring;
     newstring.ptr = (char*)almalloc(allocator, s.length + 1);
     if (newstring.ptr) {
@@ -303,21 +536,38 @@
     return newstring;
 }
 
-sstr_t sstrtrim(sstr_t string) {
-    sstr_t newstr = string;
+/* Trim isspace() bytes at both ends: returns start offset, sets *newlen. */
+static size_t ucx_strtrim(const char *s, size_t len, size_t *newlen) {
+    const char *newptr = s;
+    size_t length = len;
     
-    while (newstr.length > 0 && isspace(*newstr.ptr)) {
-        newstr.ptr++;
-        newstr.length--;
+    while(length > 0 && isspace((unsigned char) *newptr)) {
+        newptr++;
+        length--;
     }
-    while (newstr.length > 0 && isspace(newstr.ptr[newstr.length-1])) {
-        newstr.length--;
+    while(length > 0 && isspace((unsigned char) newptr[length-1])) {
+        length--;
     }
     
+    *newlen = length;
+    return (size_t) (newptr - s);
+}
+
+sstr_t sstrtrim(sstr_t string) {
+    sstr_t newstr;
+    newstr.ptr = string.ptr
+                 + ucx_strtrim(string.ptr, string.length, &newstr.length);
     return newstr;
 }
 
-int sstrprefix(sstr_t string, sstr_t prefix) {
+scstr_t scstrtrim(scstr_t string) {
+    scstr_t newstr;
+    newstr.ptr = string.ptr
+                 + ucx_strtrim(string.ptr, string.length, &newstr.length);
+    return newstr;
+}
+
+int scstrprefix(scstr_t string, scstr_t prefix) {
     if (string.length == 0) {
         return prefix.length == 0;
     }
@@ -332,7 +582,7 @@
     }
 }
 
-int sstrsuffix(sstr_t string, sstr_t suffix) {
+int scstrsuffix(scstr_t string, scstr_t suffix) {
     if (string.length == 0) {
         return suffix.length == 0;
     }
@@ -348,7 +598,39 @@
     }
 }
 
-sstr_t sstrlower(sstr_t string) {
+int scstrcaseprefix(scstr_t string, scstr_t prefix) {
+    /* Case-insensitive prefix test; returns 1 on match, 0 otherwise. */
+    /* An empty prefix matches every string, including the empty one. */
+    if (prefix.length == 0) {
+        return 1;
+    }
+    /* Also covers string.length == 0, since prefix.length > 0 here. */
+    if (prefix.length > string.length) {
+        return 0;
+    }
+    
+    /* Compare only the leading prefix.length bytes of string. */
+    scstr_t head = scstrsubsl(string, 0, prefix.length);
+    return scstrcasecmp(head, prefix) == 0;
+}
+
+int scstrcasesuffix(scstr_t string, scstr_t suffix) {
+    /* Case-insensitive suffix test; returns 1 on match, 0 otherwise. */
+    /* An empty suffix matches every string, including the empty one. */
+    if (suffix.length == 0) {
+        return 1;
+    }
+    /* Also covers string.length == 0, since suffix.length > 0 here. */
+    if (suffix.length > string.length) {
+        return 0;
+    }
+    
+    /* Compare only the trailing suffix.length bytes of string. */
+    scstr_t tail = scstrsubs(string, string.length - suffix.length);
+    return scstrcasecmp(tail, suffix) == 0;
+}
+
+sstr_t scstrlower(scstr_t string) {
     sstr_t ret = sstrdup(string);
     for (size_t i = 0; i < ret.length ; i++) {
         ret.ptr[i] = tolower(ret.ptr[i]);
@@ -356,7 +638,7 @@
     return ret;
 }
 
-sstr_t sstrlower_a(UcxAllocator *allocator, sstr_t string) {
+sstr_t scstrlower_a(UcxAllocator *allocator, scstr_t string) {
     sstr_t ret = sstrdup_a(allocator, string);
     for (size_t i = 0; i < ret.length ; i++) {
         ret.ptr[i] = tolower(ret.ptr[i]);
@@ -364,7 +646,7 @@
     return ret;
 }
 
-sstr_t sstrupper(sstr_t string) {
+sstr_t scstrupper(scstr_t string) {
     sstr_t ret = sstrdup(string);
     for (size_t i = 0; i < ret.length ; i++) {
         ret.ptr[i] = toupper(ret.ptr[i]);
@@ -372,10 +654,24 @@
     return ret;
 }
 
-sstr_t sstrupper_a(UcxAllocator *allocator, sstr_t string) {
+sstr_t scstrupper_a(UcxAllocator *allocator, scstr_t string) {
     sstr_t ret = sstrdup_a(allocator, string);
     for (size_t i = 0; i < ret.length ; i++) {
         ret.ptr[i] = toupper(ret.ptr[i]);
     }
     return ret;
 }
+
+// type adjustment functions
+scstr_t ucx_sc2sc(scstr_t str) {
+    return str;
+}
+scstr_t ucx_ss2sc(sstr_t str) {
+    /* Re-wrap the same pointer and length as an scstr_t; no bytes are
+     * copied, so the result refers to the caller's original buffer. */
+    scstr_t view = { str.ptr, str.length };
+    return view;
+}
+scstr_t ucx_ss2c_s(scstr_t c) {
+    return c;
+}

mercurial