dbutils: comparison ucx/string.c

--1:000000000000
+:1a157da63d7c
+/*
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
+*
+* Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+*   1. Redistributions of source code must retain the above copyright
+*      notice, this list of conditions and the following disclaimer.
+*
+*   2. Redistributions in binary form must reproduce the above copyright
+*      notice, this list of conditions and the following disclaimer in the
+*      documentation and/or other materials provided with the distribution.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "cx/string.h"
+#include "cx/utils.h"
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+#ifndef _WIN32
+#include <strings.h> // for strncasecmp()
+#endif // _WIN32
+cxmutstr cx_mutstr(char *cstring) {
+return (cxmutstr) {cstring, strlen(cstring)};
+}
+cxmutstr cx_mutstrn(
+char *cstring,
+size_t length
+) {
+return (cxmutstr) {cstring, length};
+}
+cxstring cx_str(const char *cstring) {
+return (cxstring) {cstring, strlen(cstring)};
+}
+cxstring cx_strn(
+const char *cstring,
+size_t length
+) {
+return (cxstring) {cstring, length};
+}
+cxstring cx_strcast(cxmutstr str) {
+return (cxstring) {str.ptr, str.length};
+}
+void cx_strfree(cxmutstr *str) {
+free(str->ptr);
+str->ptr = NULL;
+str->length = 0;
+}
+void cx_strfree_a(
+CxAllocator const *alloc,
+cxmutstr *str
+) {
+cxFree(alloc, str->ptr);
+str->ptr = NULL;
+str->length = 0;
+}
+size_t cx_strlen(
+size_t count,
+...
+) {
+if (count == 0) return 0;
+va_list ap;
+va_start(ap, count);
+size_t size = 0;
+cx_for_n(i, count) {
+cxstring str = va_arg(ap, cxstring);
+size += str.length;
+}
+va_end(ap);
+return size;
+}
+cxmutstr cx_strcat_ma(
+CxAllocator const *alloc,
+cxmutstr str,
+size_t count,
+...
+) {
+if (count == 0) return str;
+cxstring *strings = calloc(count, sizeof(cxstring));
+if (!strings) abort();
+va_list ap;
+va_start(ap, count);
+// get all args and overall length
+size_t slen = str.length;
+cx_for_n(i, count) {
+cxstring s = va_arg (ap, cxstring);
+strings[i] = s;
+slen += s.length;
+}
+va_end(ap);
+// reallocate or create new string
+if (str.ptr == NULL) {
+str.ptr = cxMalloc(alloc, slen + 1);
+} else {
+str.ptr = cxRealloc(alloc, str.ptr, slen + 1);
+}
+if (str.ptr == NULL) abort();
+// concatenate strings
+size_t pos = str.length;
+str.length = slen;
+cx_for_n(i, count) {
+cxstring s = strings[i];
+memcpy(str.ptr + pos, s.ptr, s.length);
+pos += s.length;
+}
+// terminate string
+str.ptr[str.length] = '\0';
+// free temporary array
+free(strings);
+return str;
+}
+cxstring cx_strsubs(
+cxstring string,
+size_t start
+) {
+return cx_strsubsl(string, start, string.length - start);
+}
+cxmutstr cx_strsubs_m(
+cxmutstr string,
+size_t start
+) {
+return cx_strsubsl_m(string, start, string.length - start);
+}
+cxstring cx_strsubsl(
+cxstring string,
+size_t start,
+size_t length
+) {
+if (start > string.length) {
+return (cxstring) {NULL, 0};
+}
+size_t rem_len = string.length - start;
+if (length > rem_len) {
+length = rem_len;
+}
+return (cxstring) {string.ptr + start, length};
+}
+cxmutstr cx_strsubsl_m(
+cxmutstr string,
+size_t start,
+size_t length
+) {
+cxstring result = cx_strsubsl(cx_strcast(string), start, length);
+return (cxmutstr) {(char *) result.ptr, result.length};
+}
+cxstring cx_strchr(
+cxstring string,
+int chr
+) {
+chr = 0xFF & chr;
+// TODO: improve by comparing multiple bytes at once
+cx_for_n(i, string.length) {
+if (string.ptr[i] == chr) {
+return cx_strsubs(string, i);
+}
+}
+return (cxstring) {NULL, 0};
+}
+cxmutstr cx_strchr_m(
+cxmutstr string,
+int chr
+) {
+cxstring result = cx_strchr(cx_strcast(string), chr);
+return (cxmutstr) {(char *) result.ptr, result.length};
+}
+cxstring cx_strrchr(
+cxstring string,
+int chr
+) {
+chr = 0xFF & chr;
+size_t i = string.length;
+while (i > 0) {
+i--;
+// TODO: improve by comparing multiple bytes at once
+if (string.ptr[i] == chr) {
+return cx_strsubs(string, i);
+}
+}
+return (cxstring) {NULL, 0};
+}
+cxmutstr cx_strrchr_m(
+cxmutstr string,
+int chr
+) {
+cxstring result = cx_strrchr(cx_strcast(string), chr);
+return (cxmutstr) {(char *) result.ptr, result.length};
+}
+#ifndef CX_STRSTR_SBO_SIZE
+#define CX_STRSTR_SBO_SIZE 512
+#endif
+unsigned const cx_strstr_sbo_size = CX_STRSTR_SBO_SIZE;
+cxstring cx_strstr(
+cxstring haystack,
+cxstring needle
+) {
+if (needle.length == 0) {
+return haystack;
+}
+// optimize for single-char needles
+if (needle.length == 1) {
+return cx_strchr(haystack, *needle.ptr);
+}
+/*
+* IMPORTANT:
+* Our prefix table contains the prefix length PLUS ONE
+* this is our decision, because we want to use the full range of size_t.
+* The original algorithm needs a (-1) at one single place,
+* and we want to avoid that.
+*/
+// local prefix table
+size_t s_prefix_table[CX_STRSTR_SBO_SIZE];
+// check needle length and use appropriate prefix table
+// if the pattern exceeds static prefix table, allocate on the heap
+bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
+register size_t *ptable = useheap ? calloc(needle.length + 1,
+sizeof(size_t)) : s_prefix_table;
+// keep counter in registers
+register size_t i, j;
+// fill prefix table
+i = 0;
+j = 0;
+ptable[i] = j;
+while (i < needle.length) {
+while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
+j = ptable[j - 1];
+}
+i++;
+j++;
+ptable[i] = j;
+}
+// search
+cxstring result = {NULL, 0};
+i = 0;
+j = 1;
+while (i < haystack.length) {
+while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
+j = ptable[j - 1];
+}
+i++;
+j++;
+if (j - 1 == needle.length) {
+size_t start = i - needle.length;
+result.ptr = haystack.ptr + start;
+result.length = haystack.length - start;
+break;
+}
+}
+// if prefix table was allocated on the heap, free it
+if (ptable != s_prefix_table) {
+free(ptable);
+}
+return result;
+}
+cxmutstr cx_strstr_m(
+cxmutstr haystack,
+cxstring needle
+) {
+cxstring result = cx_strstr(cx_strcast(haystack), needle);
+return (cxmutstr) {(char *) result.ptr, result.length};
+}
+size_t cx_strsplit(
+cxstring string,
+cxstring delim,
+size_t limit,
+cxstring *output
+) {
+// special case: output limit is zero
+if (limit == 0) return 0;
+// special case: delimiter is empty
+if (delim.length == 0) {
+output[0] = string;
+return 1;
+}
+// special cases: delimiter is at least as large as the string
+if (delim.length >= string.length) {
+// exact match
+if (cx_strcmp(string, delim) == 0) {
+output[0] = cx_strn(string.ptr, 0);
+output[1] = cx_strn(string.ptr + string.length, 0);
+return 2;
+} else {
+// no match possible
+output[0] = string;
+return 1;
+}
+}
+size_t n = 0;
+cxstring curpos = string;
+while (1) {
+++n;
+cxstring match = cx_strstr(curpos, delim);
+if (match.length > 0) {
+// is the limit reached?
+if (n < limit) {
+// copy the current string to the array
+cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
+output[n - 1] = item;
+size_t processed = item.length + delim.length;
+curpos.ptr += processed;
+curpos.length -= processed;
+} else {
+// limit reached, copy the _full_ remaining string
+output[n - 1] = curpos;
+break;
+}
+} else {
+// no more matches, copy last string
+output[n - 1] = curpos;
+break;
+}
+}
+return n;
+}
+size_t cx_strsplit_a(
+CxAllocator const *allocator,
+cxstring string,
+cxstring delim,
+size_t limit,
+cxstring **output
+) {
+// find out how many splits we're going to make and allocate memory
+size_t n = 0;
+cxstring curpos = string;
+while (1) {
+++n;
+cxstring match = cx_strstr(curpos, delim);
+if (match.length > 0) {
+// is the limit reached?
+if (n < limit) {
+size_t processed = match.ptr - curpos.ptr + delim.length;
+curpos.ptr += processed;
+curpos.length -= processed;
+} else {
+// limit reached
+break;
+}
+} else {
+// no more matches
+break;
+}
+}
+*output = cxCalloc(allocator, n, sizeof(cxstring));
+return cx_strsplit(string, delim, n, *output);
+}
+size_t cx_strsplit_m(
+cxmutstr string,
+cxstring delim,
+size_t limit,
+cxmutstr *output
+) {
+return cx_strsplit(cx_strcast(string),
+delim, limit, (cxstring *) output);
+}
+size_t cx_strsplit_ma(
+CxAllocator const *allocator,
+cxmutstr string,
+cxstring delim,
+size_t limit,
+cxmutstr **output
+) {
+return cx_strsplit_a(allocator, cx_strcast(string),
+delim, limit, (cxstring **) output);
+}
+int cx_strcmp(
+cxstring s1,
+cxstring s2
+) {
+if (s1.length == s2.length) {
+return memcmp(s1.ptr, s2.ptr, s1.length);
+} else if (s1.length > s2.length) {
+return 1;
+} else {
+return -1;
+}
+}
+int cx_strcasecmp(
+cxstring s1,
+cxstring s2
+) {
+if (s1.length == s2.length) {
+#ifdef _WIN32
+return _strnicmp(s1.ptr, s2.ptr, s1.length);
+#else
+return strncasecmp(s1.ptr, s2.ptr, s1.length);
+#endif
+} else if (s1.length > s2.length) {
+return 1;
+} else {
+return -1;
+}
+}
+int cx_strcmp_p(
+void const *s1,
+void const *s2
+) {
+cxstring const *left = s1;
+cxstring const *right = s2;
+return cx_strcmp(*left, *right);
+}
+int cx_strcasecmp_p(
+void const *s1,
+void const *s2
+) {
+cxstring const *left = s1;
+cxstring const *right = s2;
+return cx_strcasecmp(*left, *right);
+}
+cxmutstr cx_strdup_a(
+CxAllocator const *allocator,
+cxstring string
+) {
+cxmutstr result = {
+cxMalloc(allocator, string.length + 1),
+string.length
+};
+if (result.ptr == NULL) {
+result.length = 0;
+return result;
+}
+memcpy(result.ptr, string.ptr, string.length);
+result.ptr[string.length] = '\0';
+return result;
+}
+cxstring cx_strtrim(cxstring string) {
+cxstring result = string;
+// TODO: optimize by comparing multiple bytes at once
+while (result.length > 0 && isspace(*result.ptr)) {
+result.ptr++;
+result.length--;
+}
+while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
+result.length--;
+}
+return result;
+}
+cxmutstr cx_strtrim_m(cxmutstr string) {
+cxstring result = cx_strtrim(cx_strcast(string));
+return (cxmutstr) {(char *) result.ptr, result.length};
+}
+bool cx_strprefix(
+cxstring string,
+cxstring prefix
+) {
+if (string.length < prefix.length) return false;
+return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
+}
+bool cx_strsuffix(
+cxstring string,
+cxstring suffix
+) {
+if (string.length < suffix.length) return false;
+return memcmp(string.ptr + string.length - suffix.length,
+suffix.ptr, suffix.length) == 0;
+}
+bool cx_strcaseprefix(
+cxstring string,
+cxstring prefix
+) {
+if (string.length < prefix.length) return false;
+#ifdef _WIN32
+return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
+#else
+return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
+#endif
+}
+bool cx_strcasesuffix(
+cxstring string,
+cxstring suffix
+) {
+if (string.length < suffix.length) return false;
+#ifdef _WIN32
+return _strnicmp(string.ptr+string.length-suffix.length,
+suffix.ptr, suffix.length) == 0;
+#else
+return strncasecmp(string.ptr + string.length - suffix.length,
+suffix.ptr, suffix.length) == 0;
+#endif
+}
+void cx_strlower(cxmutstr string) {
+cx_for_n(i, string.length) {
+string.ptr[i] = (char) tolower(string.ptr[i]);
+}
+}
+void cx_strupper(cxmutstr string) {
+cx_for_n(i, string.length) {
+string.ptr[i] = (char) toupper(string.ptr[i]);
+}
+}
+#ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
+#define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
+#endif
+struct cx_strreplace_ibuf {
+size_t *buf;
+struct cx_strreplace_ibuf *next;
+unsigned int len;
+};
+static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
+while (buf) {
+struct cx_strreplace_ibuf *next = buf->next;
+free(buf->buf);
+free(buf);
+buf = next;
+}
+}
+cxmutstr cx_strreplacen_a(
+CxAllocator const *allocator,
+cxstring str,
+cxstring pattern,
+cxstring replacement,
+size_t replmax
+) {
+if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
+return cx_strdup_a(allocator, str);
+// Compute expected buffer length
+size_t ibufmax = str.length / pattern.length;
+size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
+if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
+ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
+}
+// Allocate first index buffer
+struct cx_strreplace_ibuf *firstbuf, *curbuf;
+firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
+if (!firstbuf) return cx_mutstrn(NULL, 0);
+firstbuf->buf = calloc(ibuflen, sizeof(size_t));
+if (!firstbuf->buf) {
+free(firstbuf);
+return cx_mutstrn(NULL, 0);
+}
+// Search occurrences
+cxstring searchstr = str;
+size_t found = 0;
+do {
+cxstring match = cx_strstr(searchstr, pattern);
+if (match.length > 0) {
+// Allocate next buffer in chain, if required
+if (curbuf->len == ibuflen) {
+struct cx_strreplace_ibuf *nextbuf =
+calloc(1, sizeof(struct cx_strreplace_ibuf));
+if (!nextbuf) {
+cx_strrepl_free_ibuf(firstbuf);
+return cx_mutstrn(NULL, 0);
+}
+nextbuf->buf = calloc(ibuflen, sizeof(size_t));
+if (!nextbuf->buf) {
+free(nextbuf);
+cx_strrepl_free_ibuf(firstbuf);
+return cx_mutstrn(NULL, 0);
+}
+curbuf->next = nextbuf;
+curbuf = nextbuf;
+}
+// Record match index
+found++;
+size_t idx = match.ptr - str.ptr;
+curbuf->buf[curbuf->len++] = idx;
+searchstr.ptr = match.ptr + pattern.length;
+searchstr.length = str.length - idx - pattern.length;
+} else {
+break;
+}
+} while (searchstr.length > 0 && found < replmax);
+// Allocate result string
+cxmutstr result;
+{
+ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
+size_t rcount = 0;
+curbuf = firstbuf;
+do {
+rcount += curbuf->len;
+curbuf = curbuf->next;
+} while (curbuf);
+result.length = str.length + rcount * adjlen;
+result.ptr = cxMalloc(allocator, result.length + 1);
+if (!result.ptr) {
+cx_strrepl_free_ibuf(firstbuf);
+return cx_mutstrn(NULL, 0);
+}
+}
+// Build result string
+curbuf = firstbuf;
+size_t srcidx = 0;
+char *destptr = result.ptr;
+do {
+for (size_t i = 0; i < curbuf->len; i++) {
+// Copy source part up to next match
+size_t idx = curbuf->buf[i];
+size_t srclen = idx - srcidx;
+if (srclen > 0) {
+memcpy(destptr, str.ptr + srcidx, srclen);
+destptr += srclen;
+srcidx += srclen;
+}
+// Copy the replacement and skip the source pattern
+srcidx += pattern.length;
+memcpy(destptr, replacement.ptr, replacement.length);
+destptr += replacement.length;
+}
+curbuf = curbuf->next;
+} while (curbuf);
+memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
+// Result is guaranteed to be zero-terminated
+result.ptr[result.length] = '\0';
+// Free index buffer
+cx_strrepl_free_ibuf(firstbuf);
+return result;
+}
+CxStrtokCtx cx_strtok(
+cxstring str,
+cxstring delim,
+size_t limit
+) {
+CxStrtokCtx ctx;
+ctx.str = str;
+ctx.delim = delim;
+ctx.limit = limit;
+ctx.pos = 0;
+ctx.next_pos = 0;
+ctx.delim_pos = 0;
+ctx.found = 0;
+ctx.delim_more = NULL;
+ctx.delim_more_count = 0;
+return ctx;
+}
+CxStrtokCtx cx_strtok_m(
+cxmutstr str,
+cxstring delim,
+size_t limit
+) {
+return cx_strtok(cx_strcast(str), delim, limit);
+}
+bool cx_strtok_next(
+CxStrtokCtx *ctx,
+cxstring *token
+) {
+// abortion criteria
+if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
+return false;
+}
+// determine the search start
+cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);
+// search the next delimiter
+cxstring delim = cx_strstr(haystack, ctx->delim);
+// if found, make delim capture exactly the delimiter
+if (delim.length > 0) {
+delim.length = ctx->delim.length;
+}
+// if more delimiters are specified, check them now
+if (ctx->delim_more_count > 0) {
+cx_for_n(i, ctx->delim_more_count) {
+cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
+if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
+delim.ptr = d.ptr;
+delim.length = ctx->delim_more[i].length;
+}
+}
+}
+// store the token information and adjust the context
+ctx->found++;
+ctx->pos = ctx->next_pos;
+token->ptr = &ctx->str.ptr[ctx->pos];
+ctx->delim_pos = delim.length == 0 ?
+ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
+token->length = ctx->delim_pos - ctx->pos;
+ctx->next_pos = ctx->delim_pos + delim.length;
+return true;
+}
+bool cx_strtok_next_m(
+CxStrtokCtx *ctx,
+cxmutstr *token
+) {
+return cx_strtok_next(ctx, (cxstring *) token);
+}
+void cx_strtok_delim(
+CxStrtokCtx *ctx,
+cxstring const *delim,
+size_t count
+) {
+ctx->delim_more = delim;
+ctx->delim_more_count = count;
+}

Mercurial > hg > dbutils / file comparison

comparison: ucx/string.c

ucx/string.c