--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ucx/string.c Mon May 22 16:17:26 2023 +0200 @@ -0,0 +1,785 @@ +/* + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. + * + * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cx/string.h" +#include "cx/utils.h" + +#include <string.h> +#include <stdarg.h> +#include <ctype.h> + +#ifndef _WIN32 + +#include <strings.h> // for strncasecmp() + +#endif // _WIN32 + +cxmutstr cx_mutstr(char *cstring) { + return (cxmutstr) {cstring, strlen(cstring)}; +} + +cxmutstr cx_mutstrn( + char *cstring, + size_t length +) { + return (cxmutstr) {cstring, length}; +} + +cxstring cx_str(const char *cstring) { + return (cxstring) {cstring, strlen(cstring)}; +} + +cxstring cx_strn( + const char *cstring, + size_t length +) { + return (cxstring) {cstring, length}; +} + +cxstring cx_strcast(cxmutstr str) { + return (cxstring) {str.ptr, str.length}; +} + +void cx_strfree(cxmutstr *str) { + free(str->ptr); + str->ptr = NULL; + str->length = 0; +} + +void cx_strfree_a( + CxAllocator const *alloc, + cxmutstr *str +) { + cxFree(alloc, str->ptr); + str->ptr = NULL; + str->length = 0; +} + +size_t cx_strlen( + size_t count, + ... +) { + if (count == 0) return 0; + + va_list ap; + va_start(ap, count); + size_t size = 0; + cx_for_n(i, count) { + cxstring str = va_arg(ap, cxstring); + size += str.length; + } + va_end(ap); + + return size; +} + +cxmutstr cx_strcat_ma( + CxAllocator const *alloc, + cxmutstr str, + size_t count, + ... +) { + if (count == 0) return str; + + cxstring *strings = calloc(count, sizeof(cxstring)); + if (!strings) abort(); + + va_list ap; + va_start(ap, count); + + // get all args and overall length + size_t slen = str.length; + cx_for_n(i, count) { + cxstring s = va_arg (ap, cxstring); + strings[i] = s; + slen += s.length; + } + va_end(ap); + + // reallocate or create new string + if (str.ptr == NULL) { + str.ptr = cxMalloc(alloc, slen + 1); + } else { + str.ptr = cxRealloc(alloc, str.ptr, slen + 1); + } + if (str.ptr == NULL) abort(); + + // concatenate strings + size_t pos = str.length; + str.length = slen; + cx_for_n(i, count) { + cxstring s = strings[i]; + memcpy(str.ptr + pos, s.ptr, s.length); + pos += s.length; + } + + // terminate string + str.ptr[str.length] = '\0'; + + // free temporary array + free(strings); + + return str; +} + +cxstring cx_strsubs( + cxstring string, + size_t start +) { + return cx_strsubsl(string, start, string.length - start); +} + +cxmutstr cx_strsubs_m( + cxmutstr string, + size_t start +) { + return cx_strsubsl_m(string, start, string.length - start); +} + +cxstring cx_strsubsl( + cxstring string, + size_t start, + size_t length +) { + if (start > string.length) { + return (cxstring) {NULL, 0}; + } + + size_t rem_len = string.length - start; + if (length > rem_len) { + length = rem_len; + } + + return (cxstring) {string.ptr + start, length}; +} + +cxmutstr cx_strsubsl_m( + cxmutstr string, + size_t start, + size_t length +) { + cxstring result = cx_strsubsl(cx_strcast(string), start, length); + return (cxmutstr) {(char *) result.ptr, result.length}; +} + +cxstring cx_strchr( + cxstring string, + int chr +) { + chr = 0xFF & chr; + // TODO: improve by comparing multiple bytes at once + cx_for_n(i, string.length) { + if (string.ptr[i] == chr) { + return cx_strsubs(string, i); + } + } + return (cxstring) {NULL, 0}; +} + +cxmutstr cx_strchr_m( + cxmutstr string, + int chr +) { + cxstring result = cx_strchr(cx_strcast(string), chr); + return (cxmutstr) {(char *) result.ptr, result.length}; +} + +cxstring cx_strrchr( + cxstring string, + int chr +) { + chr = 0xFF & chr; + size_t i = string.length; + while (i > 0) { + i--; + // TODO: improve by comparing multiple bytes at once + if (string.ptr[i] == chr) { + return cx_strsubs(string, i); + } + } + return (cxstring) {NULL, 0}; +} + +cxmutstr cx_strrchr_m( + cxmutstr string, + int chr +) { + cxstring result = cx_strrchr(cx_strcast(string), chr); + return (cxmutstr) {(char *) result.ptr, result.length}; +} + +#ifndef CX_STRSTR_SBO_SIZE +#define CX_STRSTR_SBO_SIZE 512 +#endif + +cxstring cx_strstr( + cxstring haystack, + cxstring needle +) { + if (needle.length == 0) { + return haystack; + } + + // optimize for single-char needles + if (needle.length == 1) { + return cx_strchr(haystack, *needle.ptr); + } + + /* + * IMPORTANT: + * Our prefix table contains the prefix length PLUS ONE + * this is our decision, because we want to use the full range of size_t. + * The original algorithm needs a (-1) at one single place, + * and we want to avoid that. + */ + + // local prefix table + size_t s_prefix_table[CX_STRSTR_SBO_SIZE]; + + // check needle length and use appropriate prefix table + // if the pattern exceeds static prefix table, allocate on the heap + bool useheap = needle.length >= CX_STRSTR_SBO_SIZE; + register size_t *ptable = useheap ? calloc(needle.length + 1, + sizeof(size_t)) : s_prefix_table; + + // keep counter in registers + register size_t i, j; + + // fill prefix table + i = 0; + j = 0; + ptable[i] = j; + while (i < needle.length) { + while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) { + j = ptable[j - 1]; + } + i++; + j++; + ptable[i] = j; + } + + // search + cxstring result = {NULL, 0}; + i = 0; + j = 1; + while (i < haystack.length) { + while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) { + j = ptable[j - 1]; + } + i++; + j++; + if (j - 1 == needle.length) { + size_t start = i - needle.length; + result.ptr = haystack.ptr + start; + result.length = haystack.length - start; + break; + } + } + + // if prefix table was allocated on the heap, free it + if (ptable != s_prefix_table) { + free(ptable); + } + + return result; +} + +cxmutstr cx_strstr_m( + cxmutstr haystack, + cxstring needle +) { + cxstring result = cx_strstr(cx_strcast(haystack), needle); + return (cxmutstr) {(char *) result.ptr, result.length}; +} + +size_t cx_strsplit( + cxstring string, + cxstring delim, + size_t limit, + cxstring *output +) { + // special case: output limit is zero + if (limit == 0) return 0; + + // special case: delimiter is empty + if (delim.length == 0) { + output[0] = string; + return 1; + } + + // special cases: delimiter is at least as large as the string + if (delim.length >= string.length) { + // exact match + if (cx_strcmp(string, delim) == 0) { + output[0] = cx_strn(string.ptr, 0); + output[1] = cx_strn(string.ptr + string.length, 0); + return 2; + } else { + // no match possible + output[0] = string; + return 1; + } + } + + size_t n = 0; + cxstring curpos = string; + while (1) { + ++n; + cxstring match = cx_strstr(curpos, delim); + if (match.length > 0) { + // is the limit reached? + if (n < limit) { + // copy the current string to the array + cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr); + output[n - 1] = item; + size_t processed = item.length + delim.length; + curpos.ptr += processed; + curpos.length -= processed; + } else { + // limit reached, copy the _full_ remaining string + output[n - 1] = curpos; + break; + } + } else { + // no more matches, copy last string + output[n - 1] = curpos; + break; + } + } + + return n; +} + +size_t cx_strsplit_a( + CxAllocator const *allocator, + cxstring string, + cxstring delim, + size_t limit, + cxstring **output +) { + // find out how many splits we're going to make and allocate memory + size_t n = 0; + cxstring curpos = string; + while (1) { + ++n; + cxstring match = cx_strstr(curpos, delim); + if (match.length > 0) { + // is the limit reached? + if (n < limit) { + size_t processed = match.ptr - curpos.ptr + delim.length; + curpos.ptr += processed; + curpos.length -= processed; + } else { + // limit reached + break; + } + } else { + // no more matches + break; + } + } + *output = cxCalloc(allocator, n, sizeof(cxstring)); + return cx_strsplit(string, delim, n, *output); +} + +size_t cx_strsplit_m( + cxmutstr string, + cxstring delim, + size_t limit, + cxmutstr *output +) { + return cx_strsplit(cx_strcast(string), + delim, limit, (cxstring *) output); +} + +size_t cx_strsplit_ma( + CxAllocator const *allocator, + cxmutstr string, + cxstring delim, + size_t limit, + cxmutstr **output +) { + return cx_strsplit_a(allocator, cx_strcast(string), + delim, limit, (cxstring **) output); +} + +int cx_strcmp( + cxstring s1, + cxstring s2 +) { + if (s1.length == s2.length) { + return memcmp(s1.ptr, s2.ptr, s1.length); + } else if (s1.length > s2.length) { + return 1; + } else { + return -1; + } +} + +int cx_strcasecmp( + cxstring s1, + cxstring s2 +) { + if (s1.length == s2.length) { +#ifdef _WIN32 + return _strnicmp(s1.ptr, s2.ptr, s1.length); +#else + return strncasecmp(s1.ptr, s2.ptr, s1.length); +#endif + } else if (s1.length > s2.length) { + return 1; + } else { + return -1; + } +} + +int cx_strcmp_p( + void const *s1, + void const *s2 +) { + cxstring const *left = s1; + cxstring const *right = s2; + return cx_strcmp(*left, *right); +} + +int cx_strcasecmp_p( + void const *s1, + void const *s2 +) { + cxstring const *left = s1; + cxstring const *right = s2; + return cx_strcasecmp(*left, *right); +} + +cxmutstr cx_strdup_a( + CxAllocator const *allocator, + cxstring string +) { + cxmutstr result = { + cxMalloc(allocator, string.length + 1), + string.length + }; + if (result.ptr == NULL) { + result.length = 0; + return result; + } + memcpy(result.ptr, string.ptr, string.length); + result.ptr[string.length] = '\0'; + return result; +} + +cxstring cx_strtrim(cxstring string) { + cxstring result = string; + // TODO: optimize by comparing multiple bytes at once + while (result.length > 0 && isspace(*result.ptr)) { + result.ptr++; + result.length--; + } + while (result.length > 0 && isspace(result.ptr[result.length - 1])) { + result.length--; + } + return result; +} + +cxmutstr cx_strtrim_m(cxmutstr string) { + cxstring result = cx_strtrim(cx_strcast(string)); + return (cxmutstr) {(char *) result.ptr, result.length}; +} + +bool cx_strprefix( + cxstring string, + cxstring prefix +) { + if (string.length < prefix.length) return false; + return memcmp(string.ptr, prefix.ptr, prefix.length) == 0; +} + +bool cx_strsuffix( + cxstring string, + cxstring suffix +) { + if (string.length < suffix.length) return false; + return memcmp(string.ptr + string.length - suffix.length, + suffix.ptr, suffix.length) == 0; +} + +bool cx_strcaseprefix( + cxstring string, + cxstring prefix +) { + if (string.length < prefix.length) return false; +#ifdef _WIN32 + return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0; +#else + return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0; +#endif +} + +bool cx_strcasesuffix( + cxstring string, + cxstring suffix +) { + if (string.length < suffix.length) return false; +#ifdef _WIN32 + return _strnicmp(string.ptr+string.length-suffix.length, + suffix.ptr, suffix.length) == 0; +#else + return strncasecmp(string.ptr + string.length - suffix.length, + suffix.ptr, suffix.length) == 0; +#endif +} + +void cx_strlower(cxmutstr string) { + cx_for_n(i, string.length) { + string.ptr[i] = (char) tolower(string.ptr[i]); + } +} + +void cx_strupper(cxmutstr string) { + cx_for_n(i, string.length) { + string.ptr[i] = (char) toupper(string.ptr[i]); + } +} + +#ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE +#define CX_STRREPLACE_INDEX_BUFFER_SIZE 64 +#endif + +struct cx_strreplace_ibuf { + size_t *buf; + struct cx_strreplace_ibuf *next; + unsigned int len; +}; + +static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) { + while (buf) { + struct cx_strreplace_ibuf *next = buf->next; + free(buf->buf); + free(buf); + buf = next; + } +} + +cxmutstr cx_strreplacen_a( + CxAllocator const *allocator, + cxstring str, + cxstring pattern, + cxstring replacement, + size_t replmax +) { + + if (pattern.length == 0 || pattern.length > str.length || replmax == 0) + return cx_strdup_a(allocator, str); + + // Compute expected buffer length + size_t ibufmax = str.length / pattern.length; + size_t ibuflen = replmax < ibufmax ? replmax : ibufmax; + if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) { + ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE; + } + + // Allocate first index buffer + struct cx_strreplace_ibuf *firstbuf, *curbuf; + firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf)); + if (!firstbuf) return cx_mutstrn(NULL, 0); + firstbuf->buf = calloc(ibuflen, sizeof(size_t)); + if (!firstbuf->buf) { + free(firstbuf); + return cx_mutstrn(NULL, 0); + } + + // Search occurrences + cxstring searchstr = str; + size_t found = 0; + do { + cxstring match = cx_strstr(searchstr, pattern); + if (match.length > 0) { + // Allocate next buffer in chain, if required + if (curbuf->len == ibuflen) { + struct cx_strreplace_ibuf *nextbuf = + calloc(1, sizeof(struct cx_strreplace_ibuf)); + if (!nextbuf) { + cx_strrepl_free_ibuf(firstbuf); + return cx_mutstrn(NULL, 0); + } + nextbuf->buf = calloc(ibuflen, sizeof(size_t)); + if (!nextbuf->buf) { + free(nextbuf); + cx_strrepl_free_ibuf(firstbuf); + return cx_mutstrn(NULL, 0); + } + curbuf->next = nextbuf; + curbuf = nextbuf; + } + + // Record match index + found++; + size_t idx = match.ptr - str.ptr; + curbuf->buf[curbuf->len++] = idx; + searchstr.ptr = match.ptr + pattern.length; + searchstr.length = str.length - idx - pattern.length; + } else { + break; + } + } while (searchstr.length > 0 && found < replmax); + + // Allocate result string + cxmutstr result; + { + ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length; + size_t rcount = 0; + curbuf = firstbuf; + do { + rcount += curbuf->len; + curbuf = curbuf->next; + } while (curbuf); + result.length = str.length + rcount * adjlen; + result.ptr = cxMalloc(allocator, result.length + 1); + if (!result.ptr) { + cx_strrepl_free_ibuf(firstbuf); + return cx_mutstrn(NULL, 0); + } + } + + // Build result string + curbuf = firstbuf; + size_t srcidx = 0; + char *destptr = result.ptr; + do { + for (size_t i = 0; i < curbuf->len; i++) { + // Copy source part up to next match + size_t idx = curbuf->buf[i]; + size_t srclen = idx - srcidx; + if (srclen > 0) { + memcpy(destptr, str.ptr + srcidx, srclen); + destptr += srclen; + srcidx += srclen; + } + + // Copy the replacement and skip the source pattern + srcidx += pattern.length; + memcpy(destptr, replacement.ptr, replacement.length); + destptr += replacement.length; + } + curbuf = curbuf->next; + } while (curbuf); + memcpy(destptr, str.ptr + srcidx, str.length - srcidx); + + // Result is guaranteed to be zero-terminated + result.ptr[result.length] = '\0'; + + // Free index buffer + cx_strrepl_free_ibuf(firstbuf); + + return result; +} + +CxStrtokCtx cx_strtok( + cxstring str, + cxstring delim, + size_t limit +) { + CxStrtokCtx ctx; + ctx.str = str; + ctx.delim = delim; + ctx.limit = limit; + ctx.pos = 0; + ctx.next_pos = 0; + ctx.delim_pos = 0; + ctx.found = 0; + ctx.delim_more = NULL; + ctx.delim_more_count = 0; + return ctx; +} + +CxStrtokCtx cx_strtok_m( + cxmutstr str, + cxstring delim, + size_t limit +) { + return cx_strtok(cx_strcast(str), delim, limit); +} + +bool cx_strtok_next( + CxStrtokCtx *ctx, + cxstring *token +) { + // abortion criteria + if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) { + return false; + } + + // determine the search start + cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos); + + // search the next delimiter + cxstring delim = cx_strstr(haystack, ctx->delim); + + // if found, make delim capture exactly the delimiter + if (delim.length > 0) { + delim.length = ctx->delim.length; + } + + // if more delimiters are specified, check them now + if (ctx->delim_more_count > 0) { + cx_for_n(i, ctx->delim_more_count) { + cxstring d = cx_strstr(haystack, ctx->delim_more[i]); + if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) { + delim.ptr = d.ptr; + delim.length = ctx->delim_more[i].length; + } + } + } + + // store the token information and adjust the context + ctx->found++; + ctx->pos = ctx->next_pos; + token->ptr = &ctx->str.ptr[ctx->pos]; + ctx->delim_pos = delim.length == 0 ? + ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr); + token->length = ctx->delim_pos - ctx->pos; + ctx->next_pos = ctx->delim_pos + delim.length; + + return true; +} + +bool cx_strtok_next_m( + CxStrtokCtx *ctx, + cxmutstr *token +) { + return cx_strtok_next(ctx, (cxstring *) token); +} + +void cx_strtok_delim( + CxStrtokCtx *ctx, + cxstring const *delim, + size_t count +) { + ctx->delim_more = delim; + ctx->delim_more_count = count; +}