ucx/string.c

Sat, 05 Dec 2020 11:54:58 +0100

author
Olaf Wintermann <olaf.wintermann@gmail.com>
date
Sat, 05 Dec 2020 11:54:58 +0100
changeset 162
18892c0a9adc
parent 157
0b33b9396851
permissions
-rw-r--r--

ucx update

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 2017 Mike Becker, Olaf Wintermann All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "ucx/string.h"

#include "ucx/allocator.h"

#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <stdint.h>
#include <ctype.h>

#ifndef _WIN32
#include <strings.h> /* for strncasecmp() */
#endif /* _WIN32 */

sstr_t sstr(char *cstring) {
    sstr_t string;
    string.ptr = cstring;
    string.length = strlen(cstring);
    return string;
}

sstr_t sstrn(char *cstring, size_t length) {
    sstr_t string;
    string.ptr = cstring;
    string.length = length;
    return string;
}

scstr_t scstr(const char *cstring) {
    scstr_t string;
    string.ptr = cstring;
    string.length = strlen(cstring);
    return string;
}

scstr_t scstrn(const char *cstring, size_t length) {
    scstr_t string;
    string.ptr = cstring;
    string.length = length;
    return string;
}


size_t scstrnlen(size_t n, ...) {
    if (n == 0) return 0;
    
    va_list ap;
    va_start(ap, n);
    
    size_t size = 0;

    for (size_t i = 0 ; i < n ; i++) {
        scstr_t str = va_arg(ap, scstr_t);
        if(SIZE_MAX - str.length < size) {
            size = SIZE_MAX;
            break;
        }
        size += str.length;
    }
    va_end(ap);

    return size;
}

static sstr_t sstrvcat_a(
        UcxAllocator *a,
        size_t count,
        scstr_t s1,
        va_list ap) {
    sstr_t str;
    str.ptr = NULL;
    str.length = 0;
    if(count < 2) {
        return str;
    }
    
    scstr_t s2 = va_arg (ap, scstr_t);
    
    if(((size_t)-1) - s1.length < s2.length) {
        return str;
    }
    
    scstr_t *strings = (scstr_t*) calloc(count, sizeof(scstr_t));
    if(!strings) {
        return str;
    }
    
    // get all args and overall length
    strings[0] = s1;
    strings[1] = s2;
    size_t slen = s1.length + s2.length;
    int error = 0;
    for (size_t i=2;i<count;i++) {
        scstr_t s = va_arg (ap, scstr_t);
        strings[i] = s;
        if(((size_t)-1) - s.length < slen) {
            error = 1;
            break;
        }
        slen += s.length;
    }
    if(error) {
        free(strings);
        return str;
    }
    
    // create new string
    str.ptr = (char*) almalloc(a, slen + 1);
    str.length = slen;
    if(!str.ptr) {
        free(strings);
        str.length = 0;
        return str;
    }
    
    // concatenate strings
    size_t pos = 0;
    for (size_t i=0;i<count;i++) {
        scstr_t s = strings[i];
        memcpy(str.ptr + pos, s.ptr, s.length);
        pos += s.length;
    }
    
    str.ptr[str.length] = '\0';
    
    free(strings);
    
    return str;
}

sstr_t scstrcat(size_t count, scstr_t s1, ...) {
    va_list ap;
    va_start(ap, s1);
    sstr_t s = sstrvcat_a(ucx_default_allocator(), count, s1, ap);
    va_end(ap);
    return s;
}

sstr_t scstrcat_a(UcxAllocator *a, size_t count, scstr_t s1, ...) {
    va_list ap;
    va_start(ap, s1);
    sstr_t s = sstrvcat_a(a, count, s1, ap);
    va_end(ap);
    return s;
}

static int ucx_substring(
        size_t str_length,
        size_t start,
        size_t length,
        size_t *newlen,
        size_t *newpos)
{
    *newlen = 0;
    *newpos = 0;
    
    if(start > str_length) {
        return 0;
    }
    
    if(length > str_length - start) {
        length = str_length - start;
    }
    *newlen = length;
    *newpos = start;
    return 1;
}

sstr_t sstrsubs(sstr_t s, size_t start) {
    return sstrsubsl (s, start, s.length-start);
}

sstr_t sstrsubsl(sstr_t s, size_t start, size_t length) {
    size_t pos;
    sstr_t ret = { NULL, 0 };
    if(ucx_substring(s.length, start, length, &ret.length, &pos)) {
        ret.ptr = s.ptr + pos;
    }
    return ret;
}

scstr_t scstrsubs(scstr_t string, size_t start) {
    return scstrsubsl(string, start, string.length-start);
}

scstr_t scstrsubsl(scstr_t s, size_t start, size_t length) {
    size_t pos;
    scstr_t ret = { NULL, 0 };
    if(ucx_substring(s.length, start, length, &ret.length, &pos)) {
        ret.ptr = s.ptr + pos;
    }
    return ret;
}


static int ucx_strchr(const char *str, size_t length, int chr, size_t *pos) {
    for(size_t i=0;i<length;i++) {
        if(str[i] == chr) {
            *pos = i;
            return 1;
        }
    }
    return 0;
}

static int ucx_strrchr(const char *str, size_t length, int chr, size_t *pos) {
    if(length > 0) {
        for(size_t i=length ; i>0 ; i--) {
            if(str[i-1] == chr) {
                *pos = i-1;
                return 1;
            }
        }
    }
    return 0;
}

sstr_t sstrchr(sstr_t s, int c) {
    size_t pos = 0;
    if(ucx_strchr(s.ptr, s.length, c, &pos)) {
        return sstrsubs(s, pos);
    }
    return sstrn(NULL, 0);
}

sstr_t sstrrchr(sstr_t s, int c) {
    size_t pos = 0;
    if(ucx_strrchr(s.ptr, s.length, c, &pos)) {
        return sstrsubs(s, pos);
    }
    return sstrn(NULL, 0);
}

scstr_t scstrchr(scstr_t s, int c) {
    size_t pos = 0;
    if(ucx_strchr(s.ptr, s.length, c, &pos)) {
        return scstrsubs(s, pos);
    }
    return scstrn(NULL, 0);
}

scstr_t scstrrchr(scstr_t s, int c) {
    size_t pos = 0;
    if(ucx_strrchr(s.ptr, s.length, c, &pos)) {
        return scstrsubs(s, pos);
    }
    return scstrn(NULL, 0);
}

#define ptable_r(dest, useheap, ptable, index) (dest = useheap ? \
    ((size_t*)ptable)[index] : (size_t) ((uint8_t*)ptable)[index])

#define ptable_w(useheap, ptable, index, src) do {\
    if (!useheap) ((uint8_t*)ptable)[index] = (uint8_t) src;\
    else ((size_t*)ptable)[index] = src;\
    } while (0);


static const char* ucx_strstr(
        const char *str,
        size_t length,
        const char *match,
        size_t matchlen,
        size_t *newlen)
{
    *newlen = length;
    if (matchlen == 0) {
        return str;
    }
    
    const char *result = NULL;
    size_t resultlen = 0;
    
    /*
     * IMPORTANT:
     * our prefix table contains the prefix length PLUS ONE
     * this is our decision, because we want to use the full range of size_t
     * the original algorithm needs a (-1) at one single place
     * and we want to avoid that
     */
    
    /* static prefix table */
    static uint8_t s_prefix_table[256];
    
    /* check pattern length and use appropriate prefix table */
    /* if the pattern exceeds static prefix table, allocate on the heap */
    register int useheap = matchlen > 255;
    register void* ptable = useheap ?
        calloc(matchlen+1, sizeof(size_t)): s_prefix_table;
    
    /* keep counter in registers */
    register size_t i, j;
    
    /* fill prefix table */
    i = 0; j = 0;
    ptable_w(useheap, ptable, i, j);
    while (i < matchlen) {
        while (j >= 1 && match[j-1] != match[i]) {
            ptable_r(j, useheap, ptable, j-1);
        }
        i++; j++;
        ptable_w(useheap, ptable, i, j);
    }

    /* search */
    i = 0; j = 1;
    while (i < length) {
        while (j >= 1 && str[i] != match[j-1]) {
            ptable_r(j, useheap, ptable, j-1);
        }
        i++; j++;
        if (j-1 == matchlen) {
            size_t start = i - matchlen;
            result = str + start;
            resultlen = length - start;
            break;
        }
    }

    /* if prefix table was allocated on the heap, free it */
    if (ptable != s_prefix_table) {
        free(ptable);
    }
    
    *newlen = resultlen;
    return result;
}

sstr_t scstrsstr(sstr_t string, scstr_t match) {
    sstr_t result;
    
    size_t reslen;
    const char *resstr = ucx_strstr(string.ptr, string.length, match.ptr, match.length, &reslen);
    if(!resstr) {
        result.ptr = NULL;
        result.length = 0;
        return result;
    }
    
    size_t pos = resstr - string.ptr;
    result.ptr = string.ptr + pos;
    result.length = reslen;
    
    return result;
}

scstr_t scstrscstr(scstr_t string, scstr_t match) {
    scstr_t result;
    
    size_t reslen;
    const char *resstr = ucx_strstr(string.ptr, string.length, match.ptr, match.length, &reslen);
    if(!resstr) {
        result.ptr = NULL;
        result.length = 0;
        return result;
    }
    
    size_t pos = resstr - string.ptr;
    result.ptr = string.ptr + pos;
    result.length = reslen;
    
    return result;
}

#undef ptable_r
#undef ptable_w

sstr_t* scstrsplit(scstr_t s, scstr_t d, ssize_t *n) {
    return scstrsplit_a(ucx_default_allocator(), s, d, n);
}

sstr_t* scstrsplit_a(UcxAllocator *allocator, scstr_t s, scstr_t d, ssize_t *n) {
    if (s.length == 0 || d.length == 0) {
        *n = -1;
        return NULL;
    }
    
    /* special cases: delimiter is at least as large as the string */
    if (d.length >= s.length) {
        /* exact match */
        if (sstrcmp(s, d) == 0) {
            *n = 0;
            return NULL;
        } else /* no match possible */ {
            *n = 1;
            sstr_t *result = (sstr_t*) almalloc(allocator, sizeof(sstr_t));
            if(result) {
                *result = sstrdup_a(allocator, s);
            } else {
                *n = -2;
            }
            return result;
        }
    }
    
    ssize_t nmax = *n;
    size_t arrlen = 16;
    sstr_t* result = (sstr_t*) alcalloc(allocator, arrlen, sizeof(sstr_t));

    if (result) {
        scstr_t curpos = s;
        ssize_t j = 1;
        while (1) {
            scstr_t match;
            /* optimize for one byte delimiters */
            if (d.length == 1) {
                match = curpos;
                for (size_t i = 0 ; i < curpos.length ; i++) {
                    if (curpos.ptr[i] == *(d.ptr)) {
                        match.ptr = curpos.ptr + i;
                        break;
                    }
                    match.length--;
                }
            } else {
                match = scstrscstr(curpos, d);
            }
            if (match.length > 0) {
                /* is this our last try? */
                if (nmax == 0 || j < nmax) {
                    /* copy the current string to the array */
                    scstr_t item = scstrn(curpos.ptr, match.ptr - curpos.ptr);
                    result[j-1] = sstrdup_a(allocator, item);
                    size_t processed = item.length + d.length;
                    curpos.ptr += processed;
                    curpos.length -= processed;

                    /* allocate memory for the next string */
                    j++;
                    if (j > arrlen) {
                        arrlen *= 2;
                        size_t reallocsz;
                        sstr_t* reallocated = NULL;
                        if(!ucx_szmul(arrlen, sizeof(sstr_t), &reallocsz)) {
                            reallocated = (sstr_t*) alrealloc(
                                    allocator, result, reallocsz);
                        }
                        if (reallocated) {
                            result = reallocated;
                        } else {
                            for (ssize_t i = 0 ; i < j-1 ; i++) {
                                alfree(allocator, result[i].ptr);
                            }
                            alfree(allocator, result);
                            *n = -2;
                            return NULL;
                        }
                    }
                } else {
                    /* nmax reached, copy the _full_ remaining string */
                    result[j-1] = sstrdup_a(allocator, curpos);
                    break;
                }
            } else {
                /* no more matches, copy last string */
                result[j-1] = sstrdup_a(allocator, curpos);
                break;
            }
        }
        *n = j;
    } else {
        *n = -2;
    }

    return result;
}

int scstrcmp(scstr_t s1, scstr_t s2) {
    if (s1.length == s2.length) {
        return memcmp(s1.ptr, s2.ptr, s1.length);
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

int scstrcasecmp(scstr_t s1, scstr_t s2) {
    if (s1.length == s2.length) {
#ifdef _WIN32
        return _strnicmp(s1.ptr, s2.ptr, s1.length);
#else
        return strncasecmp(s1.ptr, s2.ptr, s1.length);
#endif
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

sstr_t scstrdup(scstr_t s) {
    return sstrdup_a(ucx_default_allocator(), s);
}

sstr_t scstrdup_a(UcxAllocator *allocator, scstr_t s) {
    sstr_t newstring;
    newstring.ptr = (char*)almalloc(allocator, s.length + 1);
    if (newstring.ptr) {
        newstring.length = s.length;
        newstring.ptr[newstring.length] = 0;
        
        memcpy(newstring.ptr, s.ptr, s.length);
    } else {
        newstring.length = 0;
    }
    
    return newstring;
}


static size_t ucx_strtrim(const char *s, size_t len, size_t *newlen) {
    const char *newptr = s;
    size_t length = len;
    
    while(length > 0 && isspace(*newptr)) {
        newptr++;
        length--;
    }
    while(length > 0 && isspace(newptr[length-1])) {
        length--;
    }
    
    *newlen = length;
    return newptr - s;
}

sstr_t sstrtrim(sstr_t string) {
    sstr_t newstr;
    newstr.ptr = string.ptr
                 + ucx_strtrim(string.ptr, string.length, &newstr.length);
    return newstr;
}

scstr_t scstrtrim(scstr_t string) {
    scstr_t newstr;
    newstr.ptr = string.ptr
                 + ucx_strtrim(string.ptr, string.length, &newstr.length);
    return newstr;
}

int scstrprefix(scstr_t string, scstr_t prefix) {
    if (string.length == 0) {
        return prefix.length == 0;
    }
    if (prefix.length == 0) {
        return 1;
    }
    
    if (prefix.length > string.length) {
        return 0;
    } else {
        return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
    }
}

int scstrsuffix(scstr_t string, scstr_t suffix) {
    if (string.length == 0) {
        return suffix.length == 0;
    }
    if (suffix.length == 0) {
        return 1;
    }
    
    if (suffix.length > string.length) {
        return 0;
    } else {
        return memcmp(string.ptr+string.length-suffix.length,
            suffix.ptr, suffix.length) == 0;
    }
}

int scstrcaseprefix(scstr_t string, scstr_t prefix) {
    if (string.length == 0) {
        return prefix.length == 0;
    }
    if (prefix.length == 0) {
        return 1;
    }
    
    if (prefix.length > string.length) {
        return 0;
    } else {
        scstr_t subs = scstrsubsl(string, 0, prefix.length);
        return scstrcasecmp(subs, prefix) == 0;
    }
}

int scstrcasesuffix(scstr_t string, scstr_t suffix) {
    if (string.length == 0) {
        return suffix.length == 0;
    }
    if (suffix.length == 0) {
        return 1;
    }
    
    if (suffix.length > string.length) {
        return 0;
    } else {
        scstr_t subs = scstrsubs(string, string.length-suffix.length);
        return scstrcasecmp(subs, suffix) == 0;
    }
}

sstr_t scstrlower(scstr_t string) {
    sstr_t ret = sstrdup(string);
    for (size_t i = 0; i < ret.length ; i++) {
        ret.ptr[i] = tolower(ret.ptr[i]);
    }
    return ret;
}

sstr_t scstrlower_a(UcxAllocator *allocator, scstr_t string) {
    sstr_t ret = sstrdup_a(allocator, string);
    for (size_t i = 0; i < ret.length ; i++) {
        ret.ptr[i] = tolower(ret.ptr[i]);
    }
    return ret;
}

sstr_t scstrupper(scstr_t string) {
    sstr_t ret = sstrdup(string);
    for (size_t i = 0; i < ret.length ; i++) {
        ret.ptr[i] = toupper(ret.ptr[i]);
    }
    return ret;
}

sstr_t scstrupper_a(UcxAllocator *allocator, scstr_t string) {
    sstr_t ret = sstrdup_a(allocator, string);
    for (size_t i = 0; i < ret.length ; i++) {
        ret.ptr[i] = toupper(ret.ptr[i]);
    }
    return ret;
}

#define REPLACE_INDEX_BUFFER_MAX 100

struct scstrreplace_ibuf {
    size_t* buf;
    unsigned int len; /* small indices */
    struct scstrreplace_ibuf* next;
};

static void scstrrepl_free_ibuf(struct scstrreplace_ibuf *buf) {
    while (buf) {
        struct scstrreplace_ibuf *next = buf->next;
        free(buf->buf);
        free(buf);
        buf = next;
    }
}

sstr_t scstrreplacen_a(UcxAllocator *allocator, scstr_t str,
                     scstr_t pattern, scstr_t replacement, size_t replmax) {

    if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
        return sstrdup(str);

    /* Compute expected buffer length */
    size_t ibufmax = str.length / pattern.length;
    size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
    if (ibuflen > REPLACE_INDEX_BUFFER_MAX) {
        ibuflen = REPLACE_INDEX_BUFFER_MAX;
    }

    /* Allocate first index buffer */
    struct scstrreplace_ibuf *firstbuf, *curbuf;
    firstbuf = curbuf = calloc(1, sizeof(struct scstrreplace_ibuf));
    if (!firstbuf) return sstrn(NULL, 0);
    firstbuf->buf = calloc(ibuflen, sizeof(size_t));
    if (!firstbuf->buf) {
        free(firstbuf);
        return sstrn(NULL, 0);
    }

    /* Search occurrences */
    scstr_t searchstr = str;
    size_t found = 0;
    do {
        scstr_t match = scstrscstr(searchstr, pattern);
        if (match.length > 0) {
            /* Allocate next buffer in chain, if required */
            if (curbuf->len == ibuflen) {
                struct scstrreplace_ibuf *nextbuf =
                        calloc(1, sizeof(struct scstrreplace_ibuf));
                if (!nextbuf) {
                    scstrrepl_free_ibuf(firstbuf);
                    return sstrn(NULL, 0);
                }
                nextbuf->buf = calloc(ibuflen, sizeof(size_t));
                if (!nextbuf->buf) {
                    free(nextbuf);
                    scstrrepl_free_ibuf(firstbuf);
                    return sstrn(NULL, 0);
                }
                curbuf->next = nextbuf;
                curbuf = nextbuf;
            }

            /* Record match index */
            found++;
            size_t idx = match.ptr - str.ptr;
            curbuf->buf[curbuf->len++] = idx;
            searchstr.ptr = match.ptr + pattern.length;
            searchstr.length = str.length - idx - pattern.length;
        } else {
            break;
        }
    } while (searchstr.length > 0 && found < replmax);

    /* Allocate result string */
    sstr_t result;
    {
        ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
        size_t rcount = 0;
        curbuf = firstbuf;
        do {
            rcount += curbuf->len;
            curbuf = curbuf->next;
        } while (curbuf);
        result.length = str.length + rcount * adjlen;
        result.ptr = almalloc(allocator, result.length);
        if (!result.ptr) {
            scstrrepl_free_ibuf(firstbuf);
            return sstrn(NULL, 0);
        }
    }

    /* Build result string */
    curbuf = firstbuf;
    size_t srcidx = 0;
    char* destptr = result.ptr;
    do {
        for (size_t i = 0; i < curbuf->len; i++) {
            /* Copy source part up to next match*/
            size_t idx = curbuf->buf[i];
            size_t srclen = idx - srcidx;
            if (srclen > 0) {
                memcpy(destptr, str.ptr+srcidx, srclen);
                destptr += srclen;
                srcidx += srclen;
            }

            /* Copy the replacement and skip the source pattern */
            srcidx += pattern.length;
            memcpy(destptr, replacement.ptr, replacement.length);
            destptr += replacement.length;
        }
        curbuf = curbuf->next;
    } while (curbuf);
    memcpy(destptr, str.ptr+srcidx, str.length-srcidx);

    /* Free index buffer */
    scstrrepl_free_ibuf(firstbuf);

    return result;
}

sstr_t scstrreplacen(scstr_t str, scstr_t pattern,
        scstr_t replacement, size_t replmax) {
    return scstrreplacen_a(ucx_default_allocator(),
            str, pattern, replacement, replmax);
}


// type adjustment functions
scstr_t ucx_sc2sc(scstr_t str) {
    return str;
}
scstr_t ucx_ss2sc(sstr_t str) {
    scstr_t cs;
    cs.ptr = str.ptr;
    cs.length = str.length;
    return cs;
}
scstr_t ucx_ss2c_s(scstr_t c) {
    return c;
}

mercurial