src/ucx/string.c

Sat, 12 Nov 2022 11:52:47 +0100

author
Olaf Wintermann <olaf.wintermann@gmail.com>
date
Sat, 12 Nov 2022 11:52:47 +0100
changeset 424
3df9258cd3cc
parent 415
d938228c382e
child 438
22eca559aded
permissions
-rw-r--r--

allow '-' in tokens, add support for negative integers

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "cx/string.h"
#include "cx/utils.h"

#include <string.h>
#include <stdarg.h>
#include <ctype.h>

#ifndef _WIN32

#include <strings.h> /* for strncasecmp() */

#endif /* _WIN32 */

cxmutstr cx_mutstr(char *cstring) {
    return (cxmutstr) {cstring, strlen(cstring)};
}

cxmutstr cx_mutstrn(
        char *cstring,
        size_t length
) {
    return (cxmutstr) {cstring, length};
}

cxstring cx_str(const char *cstring) {
    return (cxstring) {cstring, strlen(cstring)};
}

cxstring cx_strn(
        const char *cstring,
        size_t length
) {
    return (cxstring) {cstring, length};
}

cxstring cx_strcast(cxmutstr str) {
    return (cxstring) {str.ptr, str.length};
}

void cx_strfree(cxmutstr *str) {
    free(str->ptr);
    str->ptr = NULL;
    str->length = 0;
}

void cx_strfree_a(
        CxAllocator *alloc,
        cxmutstr *str
) {
    cxFree(alloc, str->ptr);
    str->ptr = NULL;
    str->length = 0;
}

size_t cx_strlen(
        size_t count,
        ...
) {
    if (count == 0) return 0;

    va_list ap;
    va_start(ap, count);
    size_t size = 0;
    cx_for_n(i, count) {
        cxstring str = va_arg(ap, cxstring);
        size += str.length;
    }
    va_end(ap);

    return size;
}

cxmutstr cx_strcat_a(
        CxAllocator *alloc,
        size_t count,
        ...
) {
    cxstring *strings = calloc(count, sizeof(cxstring));
    if (!strings) abort();

    va_list ap;
    va_start(ap, count);

    // get all args and overall length
    size_t slen = 0;
    cx_for_n(i, count) {
        cxstring s = va_arg (ap, cxstring);
        strings[i] = s;
        slen += s.length;
    }

    // create new string
    cxmutstr result;
    result.ptr = cxMalloc(alloc, slen + 1);
    result.length = slen;
    if (result.ptr == NULL) abort();

    // concatenate strings
    size_t pos = 0;
    cx_for_n(i, count) {
        cxstring s = strings[i];
        memcpy(result.ptr + pos, s.ptr, s.length);
        pos += s.length;
    }

    // terminate string
    result.ptr[result.length] = '\0';

    // free temporary array
    free(strings);

    return result;
}

cxstring cx_strsubs(
        cxstring string,
        size_t start
) {
    return cx_strsubsl(string, start, string.length - start);
}

cxmutstr cx_strsubs_m(
        cxmutstr string,
        size_t start
) {
    return cx_strsubsl_m(string, start, string.length - start);
}

cxstring cx_strsubsl(
        cxstring string,
        size_t start,
        size_t length
) {
    if (start > string.length) {
        return (cxstring) {NULL, 0};
    }

    size_t rem_len = string.length - start;
    if (length > rem_len) {
        length = rem_len;
    }

    return (cxstring) {string.ptr + start, length};
}

cxmutstr cx_strsubsl_m(
        cxmutstr string,
        size_t start,
        size_t length
) {
    cxstring result = cx_strsubsl(cx_strcast(string), start, length);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

cxstring cx_strchr(
        cxstring string,
        int chr
) {
    chr = 0xFF & chr;
    // TODO: improve by comparing multiple bytes at once
    cx_for_n(i, string.length) {
        if (string.ptr[i] == chr) {
            return cx_strsubs(string, i);
        }
    }
    return (cxstring) {NULL, 0};
}

cxmutstr cx_strchr_m(
        cxmutstr string,
        int chr
) {
    cxstring result = cx_strchr(cx_strcast(string), chr);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

cxstring cx_strrchr(
        cxstring string,
        int chr
) {
    chr = 0xFF & chr;
    size_t i = string.length;
    while (i > 0) {
        i--;
        // TODO: improve by comparing multiple bytes at once
        if (string.ptr[i] == chr) {
            return cx_strsubs(string, i);
        }
    }
    return (cxstring) {NULL, 0};
}

cxmutstr cx_strrchr_m(
        cxmutstr string,
        int chr
) {
    cxstring result = cx_strrchr(cx_strcast(string), chr);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

#define STRSTR_SBO_BUFLEN 512

cxstring cx_strstr(
        cxstring haystack,
        cxstring needle
) {
    if (needle.length == 0) {
        return haystack;
    }

    /* optimize for single-char needles */
    if (needle.length == 1) {
        return cx_strchr(haystack, *needle.ptr);
    }

    /*
     * IMPORTANT:
     * Our prefix table contains the prefix length PLUS ONE
     * this is our decision, because we want to use the full range of size_t.
     * The original algorithm needs a (-1) at one single place,
     * and we want to avoid that.
     */

    /* local prefix table */
    size_t s_prefix_table[STRSTR_SBO_BUFLEN];

    /* check needle length and use appropriate prefix table */
    /* if the pattern exceeds static prefix table, allocate on the heap */
    bool useheap = needle.length >= STRSTR_SBO_BUFLEN;
    register size_t *ptable = useheap ? calloc(needle.length + 1,
                                               sizeof(size_t)) : s_prefix_table;

    /* keep counter in registers */
    register size_t i, j;

    /* fill prefix table */
    i = 0;
    j = 0;
    ptable[i] = j;
    while (i < needle.length) {
        while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
            j = ptable[j - 1];
        }
        i++;
        j++;
        ptable[i] = j;
    }

    /* search */
    cxstring result = {NULL, 0};
    i = 0;
    j = 1;
    while (i < haystack.length) {
        while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
            j = ptable[j - 1];
        }
        i++;
        j++;
        if (j - 1 == needle.length) {
            size_t start = i - needle.length;
            result.ptr = haystack.ptr + start;
            result.length = haystack.length - start;
            break;
        }
    }

    /* if prefix table was allocated on the heap, free it */
    if (ptable != s_prefix_table) {
        free(ptable);
    }

    return result;
}

cxmutstr cx_strstr_m(
        cxmutstr haystack,
        cxstring needle
) {
    cxstring result = cx_strstr(cx_strcast(haystack), needle);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

size_t cx_strsplit(
        cxstring string,
        cxstring delim,
        size_t limit,
        cxstring *output
) {
    /* special case: output limit is zero */
    if (limit == 0) return 0;

    /* special case: delimiter is empty */
    if (delim.length == 0) {
        output[0] = string;
        return 1;
    }

    /* special cases: delimiter is at least as large as the string */
    if (delim.length >= string.length) {
        /* exact match */
        if (cx_strcmp(string, delim) == 0) {
            output[0] = cx_strn(string.ptr, 0);
            output[1] = cx_strn(string.ptr + string.length, 0);
            return 2;
        } else /* no match possible */ {
            output[0] = string;
            return 1;
        }
    }

    size_t n = 0;
    cxstring curpos = string;
    while (1) {
        ++n;
        cxstring match = cx_strstr(curpos, delim);
        if (match.length > 0) {
            /* is the limit reached? */
            if (n < limit) {
                /* copy the current string to the array */
                cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
                output[n - 1] = item;
                size_t processed = item.length + delim.length;
                curpos.ptr += processed;
                curpos.length -= processed;
            } else {
                /* limit reached, copy the _full_ remaining string */
                output[n - 1] = curpos;
                break;
            }
        } else {
            /* no more matches, copy last string */
            output[n - 1] = curpos;
            break;
        }
    }

    return n;
}

size_t cx_strsplit_a(
        CxAllocator *allocator,
        cxstring string,
        cxstring delim,
        size_t limit,
        cxstring **output
) {
    /* find out how many splits we're going to make and allocate memory */
    size_t n = 0;
    cxstring curpos = string;
    while (1) {
        ++n;
        cxstring match = cx_strstr(curpos, delim);
        if (match.length > 0) {
            /* is the limit reached? */
            if (n < limit) {
                size_t processed = match.ptr - curpos.ptr + delim.length;
                curpos.ptr += processed;
                curpos.length -= processed;
            } else {
                /* limit reached */
                break;
            }
        } else {
            /* no more matches */
            break;
        }
    }
    *output = cxCalloc(allocator, n, sizeof(cxstring));
    return cx_strsplit(string, delim, n, *output);
}

size_t cx_strsplit_m(
        cxmutstr string,
        cxstring delim,
        size_t limit,
        cxmutstr *output
) {
    return cx_strsplit(cx_strcast(string),
                       delim, limit, (cxstring *) output);
}

size_t cx_strsplit_ma(
        CxAllocator *allocator,
        cxmutstr string,
        cxstring delim,
        size_t limit,
        cxmutstr **output
) {
    return cx_strsplit_a(allocator, cx_strcast(string),
                         delim, limit, (cxstring **) output);
}

int cx_strcmp(
        cxstring s1,
        cxstring s2
) {
    if (s1.length == s2.length) {
        return memcmp(s1.ptr, s2.ptr, s1.length);
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

int cx_strcasecmp(
        cxstring s1,
        cxstring s2
) {
    if (s1.length == s2.length) {
#ifdef _WIN32
        return _strnicmp(s1.ptr, s2.ptr, s1.length);
#else
        return strncasecmp(s1.ptr, s2.ptr, s1.length);
#endif
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

cxmutstr cx_strdup_a(
        CxAllocator *allocator,
        cxstring string
) {
    cxmutstr result = {
            cxMalloc(allocator, string.length + 1),
            string.length
    };
    if (result.ptr == NULL) {
        result.length = 0;
        return result;
    }
    memcpy(result.ptr, string.ptr, string.length);
    result.ptr[string.length] = '\0';
    return result;
}

cxstring cx_strtrim(cxstring string) {
    cxstring result = string;
    // TODO: optimize by comparing multiple bytes at once
    while (result.length > 0 && isspace(*result.ptr)) {
        result.ptr++;
        result.length--;
    }
    while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
        result.length--;
    }
    return result;
}

cxmutstr cx_strtrim_m(cxmutstr string) {
    cxstring result = cx_strtrim(cx_strcast(string));
    return (cxmutstr) {(char *) result.ptr, result.length};
}

bool cx_strprefix(
        cxstring string,
        cxstring prefix
) {
    if (string.length < prefix.length) return false;
    return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
}

bool cx_strsuffix(
        cxstring string,
        cxstring suffix
) {
    if (string.length < suffix.length) return false;
    return memcmp(string.ptr + string.length - suffix.length,
                  suffix.ptr, suffix.length) == 0;
}

bool cx_strcaseprefix(
        cxstring string,
        cxstring prefix
) {
    if (string.length < prefix.length) return false;
#ifdef _WIN32
    return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
#else
    return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
#endif
}

bool cx_strcasesuffix(
        cxstring string,
        cxstring suffix
) {
    if (string.length < suffix.length) return false;
#ifdef _WIN32
    return _strnicmp(string.ptr+string.length-suffix.length,
                  suffix.ptr, suffix.length) == 0;
#else
    return strncasecmp(string.ptr + string.length - suffix.length,
                       suffix.ptr, suffix.length) == 0;
#endif
}

void cx_strlower(cxmutstr string) {
    cx_for_n(i, string.length) {
        string.ptr[i] = (char) tolower(string.ptr[i]);
    }
}

void cx_strupper(cxmutstr string) {
    cx_for_n(i, string.length) {
        string.ptr[i] = (char) toupper(string.ptr[i]);
    }
}

#define REPLACE_INDEX_BUFFER_MAX 100

struct cx_strreplace_ibuf {
    size_t *buf;
    struct cx_strreplace_ibuf *next;
    unsigned int len;
};

static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
    while (buf) {
        struct cx_strreplace_ibuf *next = buf->next;
        free(buf->buf);
        free(buf);
        buf = next;
    }
}

cxmutstr cx_strreplacen_a(
        CxAllocator *allocator,
        cxstring str,
        cxstring pattern,
        cxstring replacement,
        size_t replmax
) {

    if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
        return cx_strdup_a(allocator, str);

    /* Compute expected buffer length */
    size_t ibufmax = str.length / pattern.length;
    size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
    if (ibuflen > REPLACE_INDEX_BUFFER_MAX) {
        ibuflen = REPLACE_INDEX_BUFFER_MAX;
    }

    /* Allocate first index buffer */
    struct cx_strreplace_ibuf *firstbuf, *curbuf;
    firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
    if (!firstbuf) return cx_mutstrn(NULL, 0);
    firstbuf->buf = calloc(ibuflen, sizeof(size_t));
    if (!firstbuf->buf) {
        free(firstbuf);
        return cx_mutstrn(NULL, 0);
    }

    /* Search occurrences */
    cxstring searchstr = str;
    size_t found = 0;
    do {
        cxstring match = cx_strstr(searchstr, pattern);
        if (match.length > 0) {
            /* Allocate next buffer in chain, if required */
            if (curbuf->len == ibuflen) {
                struct cx_strreplace_ibuf *nextbuf =
                        calloc(1, sizeof(struct cx_strreplace_ibuf));
                if (!nextbuf) {
                    cx_strrepl_free_ibuf(firstbuf);
                    return cx_mutstrn(NULL, 0);
                }
                nextbuf->buf = calloc(ibuflen, sizeof(size_t));
                if (!nextbuf->buf) {
                    free(nextbuf);
                    cx_strrepl_free_ibuf(firstbuf);
                    return cx_mutstrn(NULL, 0);
                }
                curbuf->next = nextbuf;
                curbuf = nextbuf;
            }

            /* Record match index */
            found++;
            size_t idx = match.ptr - str.ptr;
            curbuf->buf[curbuf->len++] = idx;
            searchstr.ptr = match.ptr + pattern.length;
            searchstr.length = str.length - idx - pattern.length;
        } else {
            break;
        }
    } while (searchstr.length > 0 && found < replmax);

    /* Allocate result string */
    cxmutstr result;
    {
        ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
        size_t rcount = 0;
        curbuf = firstbuf;
        do {
            rcount += curbuf->len;
            curbuf = curbuf->next;
        } while (curbuf);
        result.length = str.length + rcount * adjlen;
        result.ptr = cxMalloc(allocator, result.length + 1);
        if (!result.ptr) {
            cx_strrepl_free_ibuf(firstbuf);
            return cx_mutstrn(NULL, 0);
        }
    }

    /* Build result string */
    curbuf = firstbuf;
    size_t srcidx = 0;
    char *destptr = result.ptr;
    do {
        for (size_t i = 0; i < curbuf->len; i++) {
            /* Copy source part up to next match*/
            size_t idx = curbuf->buf[i];
            size_t srclen = idx - srcidx;
            if (srclen > 0) {
                memcpy(destptr, str.ptr + srcidx, srclen);
                destptr += srclen;
                srcidx += srclen;
            }

            /* Copy the replacement and skip the source pattern */
            srcidx += pattern.length;
            memcpy(destptr, replacement.ptr, replacement.length);
            destptr += replacement.length;
        }
        curbuf = curbuf->next;
    } while (curbuf);
    memcpy(destptr, str.ptr + srcidx, str.length - srcidx);

    /* Result is guaranteed to be zero-terminated */
    result.ptr[result.length] = '\0';

    /* Free index buffer */
    cx_strrepl_free_ibuf(firstbuf);

    return result;
}

mercurial