ucx/string.c

Sun, 17 Dec 2023 15:33:50 +0100

author
Mike Becker <universe@uap-core.de>
date
Sun, 17 Dec 2023 15:33:50 +0100
changeset 800
30d484806c2b
parent 748
49a284f61e8c
child 816
839fefbdedc7
permissions
-rw-r--r--

fix faulty string to int conversion utilities

Probably it was expected that errno is set to EINVAL when illegal characters are encountered. But this is not standard and does not happen on every system, allowing illegal strings to be parsed as valid integers.

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "cx/string.h"
#include "cx/utils.h"

#include <string.h>
#include <stdarg.h>
#include <ctype.h>

#ifndef _WIN32

#include <strings.h> // for strncasecmp()

#endif // _WIN32

cxmutstr cx_mutstr(char *cstring) {
    return (cxmutstr) {cstring, strlen(cstring)};
}

cxmutstr cx_mutstrn(
        char *cstring,
        size_t length
) {
    return (cxmutstr) {cstring, length};
}

cxstring cx_str(const char *cstring) {
    return (cxstring) {cstring, strlen(cstring)};
}

cxstring cx_strn(
        const char *cstring,
        size_t length
) {
    return (cxstring) {cstring, length};
}

cxstring cx_strcast(cxmutstr str) {
    return (cxstring) {str.ptr, str.length};
}

void cx_strfree(cxmutstr *str) {
    free(str->ptr);
    str->ptr = NULL;
    str->length = 0;
}

void cx_strfree_a(
        CxAllocator const *alloc,
        cxmutstr *str
) {
    cxFree(alloc, str->ptr);
    str->ptr = NULL;
    str->length = 0;
}

size_t cx_strlen(
        size_t count,
        ...
) {
    if (count == 0) return 0;

    va_list ap;
    va_start(ap, count);
    size_t size = 0;
    cx_for_n(i, count) {
        cxstring str = va_arg(ap, cxstring);
        size += str.length;
    }
    va_end(ap);

    return size;
}

cxmutstr cx_strcat_ma(
        CxAllocator const *alloc,
        cxmutstr str,
        size_t count,
        ...
) {
    if (count == 0) return str;

    cxstring *strings = calloc(count, sizeof(cxstring));
    if (!strings) abort();

    va_list ap;
    va_start(ap, count);

    // get all args and overall length
    size_t slen = str.length;
    cx_for_n(i, count) {
        cxstring s = va_arg (ap, cxstring);
        strings[i] = s;
        slen += s.length;
    }
    va_end(ap);

    // reallocate or create new string
    if (str.ptr == NULL) {
        str.ptr = cxMalloc(alloc, slen + 1);
    } else {
        str.ptr = cxRealloc(alloc, str.ptr, slen + 1);
    }
    if (str.ptr == NULL) abort();

    // concatenate strings
    size_t pos = str.length;
    str.length = slen;
    cx_for_n(i, count) {
        cxstring s = strings[i];
        memcpy(str.ptr + pos, s.ptr, s.length);
        pos += s.length;
    }

    // terminate string
    str.ptr[str.length] = '\0';

    // free temporary array
    free(strings);

    return str;
}

cxstring cx_strsubs(
        cxstring string,
        size_t start
) {
    return cx_strsubsl(string, start, string.length - start);
}

cxmutstr cx_strsubs_m(
        cxmutstr string,
        size_t start
) {
    return cx_strsubsl_m(string, start, string.length - start);
}

cxstring cx_strsubsl(
        cxstring string,
        size_t start,
        size_t length
) {
    if (start > string.length) {
        return (cxstring) {NULL, 0};
    }

    size_t rem_len = string.length - start;
    if (length > rem_len) {
        length = rem_len;
    }

    return (cxstring) {string.ptr + start, length};
}

cxmutstr cx_strsubsl_m(
        cxmutstr string,
        size_t start,
        size_t length
) {
    cxstring result = cx_strsubsl(cx_strcast(string), start, length);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

cxstring cx_strchr(
        cxstring string,
        int chr
) {
    chr = 0xFF & chr;
    // TODO: improve by comparing multiple bytes at once
    cx_for_n(i, string.length) {
        if (string.ptr[i] == chr) {
            return cx_strsubs(string, i);
        }
    }
    return (cxstring) {NULL, 0};
}

cxmutstr cx_strchr_m(
        cxmutstr string,
        int chr
) {
    cxstring result = cx_strchr(cx_strcast(string), chr);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

cxstring cx_strrchr(
        cxstring string,
        int chr
) {
    chr = 0xFF & chr;
    size_t i = string.length;
    while (i > 0) {
        i--;
        // TODO: improve by comparing multiple bytes at once
        if (string.ptr[i] == chr) {
            return cx_strsubs(string, i);
        }
    }
    return (cxstring) {NULL, 0};
}

cxmutstr cx_strrchr_m(
        cxmutstr string,
        int chr
) {
    cxstring result = cx_strrchr(cx_strcast(string), chr);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

#ifndef CX_STRSTR_SBO_SIZE
#define CX_STRSTR_SBO_SIZE 512
#endif

cxstring cx_strstr(
        cxstring haystack,
        cxstring needle
) {
    if (needle.length == 0) {
        return haystack;
    }

    // optimize for single-char needles
    if (needle.length == 1) {
        return cx_strchr(haystack, *needle.ptr);
    }

    /*
     * IMPORTANT:
     * Our prefix table contains the prefix length PLUS ONE
     * this is our decision, because we want to use the full range of size_t.
     * The original algorithm needs a (-1) at one single place,
     * and we want to avoid that.
     */

    // local prefix table
    size_t s_prefix_table[CX_STRSTR_SBO_SIZE];

    // check needle length and use appropriate prefix table
    // if the pattern exceeds static prefix table, allocate on the heap
    bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
    register size_t *ptable = useheap ? calloc(needle.length + 1,
                                               sizeof(size_t)) : s_prefix_table;

    // keep counter in registers
    register size_t i, j;

    // fill prefix table
    i = 0;
    j = 0;
    ptable[i] = j;
    while (i < needle.length) {
        while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
            j = ptable[j - 1];
        }
        i++;
        j++;
        ptable[i] = j;
    }

    // search
    cxstring result = {NULL, 0};
    i = 0;
    j = 1;
    while (i < haystack.length) {
        while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
            j = ptable[j - 1];
        }
        i++;
        j++;
        if (j - 1 == needle.length) {
            size_t start = i - needle.length;
            result.ptr = haystack.ptr + start;
            result.length = haystack.length - start;
            break;
        }
    }

    // if prefix table was allocated on the heap, free it
    if (ptable != s_prefix_table) {
        free(ptable);
    }

    return result;
}

cxmutstr cx_strstr_m(
        cxmutstr haystack,
        cxstring needle
) {
    cxstring result = cx_strstr(cx_strcast(haystack), needle);
    return (cxmutstr) {(char *) result.ptr, result.length};
}

size_t cx_strsplit(
        cxstring string,
        cxstring delim,
        size_t limit,
        cxstring *output
) {
    // special case: output limit is zero
    if (limit == 0) return 0;

    // special case: delimiter is empty
    if (delim.length == 0) {
        output[0] = string;
        return 1;
    }

    // special cases: delimiter is at least as large as the string
    if (delim.length >= string.length) {
        // exact match
        if (cx_strcmp(string, delim) == 0) {
            output[0] = cx_strn(string.ptr, 0);
            output[1] = cx_strn(string.ptr + string.length, 0);
            return 2;
        } else {
            // no match possible
            output[0] = string;
            return 1;
        }
    }

    size_t n = 0;
    cxstring curpos = string;
    while (1) {
        ++n;
        cxstring match = cx_strstr(curpos, delim);
        if (match.length > 0) {
            // is the limit reached?
            if (n < limit) {
                // copy the current string to the array
                cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
                output[n - 1] = item;
                size_t processed = item.length + delim.length;
                curpos.ptr += processed;
                curpos.length -= processed;
            } else {
                // limit reached, copy the _full_ remaining string
                output[n - 1] = curpos;
                break;
            }
        } else {
            // no more matches, copy last string
            output[n - 1] = curpos;
            break;
        }
    }

    return n;
}

size_t cx_strsplit_a(
        CxAllocator const *allocator,
        cxstring string,
        cxstring delim,
        size_t limit,
        cxstring **output
) {
    // find out how many splits we're going to make and allocate memory
    size_t n = 0;
    cxstring curpos = string;
    while (1) {
        ++n;
        cxstring match = cx_strstr(curpos, delim);
        if (match.length > 0) {
            // is the limit reached?
            if (n < limit) {
                size_t processed = match.ptr - curpos.ptr + delim.length;
                curpos.ptr += processed;
                curpos.length -= processed;
            } else {
                // limit reached
                break;
            }
        } else {
            // no more matches
            break;
        }
    }
    *output = cxCalloc(allocator, n, sizeof(cxstring));
    return cx_strsplit(string, delim, n, *output);
}

size_t cx_strsplit_m(
        cxmutstr string,
        cxstring delim,
        size_t limit,
        cxmutstr *output
) {
    return cx_strsplit(cx_strcast(string),
                       delim, limit, (cxstring *) output);
}

size_t cx_strsplit_ma(
        CxAllocator const *allocator,
        cxmutstr string,
        cxstring delim,
        size_t limit,
        cxmutstr **output
) {
    return cx_strsplit_a(allocator, cx_strcast(string),
                         delim, limit, (cxstring **) output);
}

int cx_strcmp(
        cxstring s1,
        cxstring s2
) {
    if (s1.length == s2.length) {
        return memcmp(s1.ptr, s2.ptr, s1.length);
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

int cx_strcasecmp(
        cxstring s1,
        cxstring s2
) {
    if (s1.length == s2.length) {
#ifdef _WIN32
        return _strnicmp(s1.ptr, s2.ptr, s1.length);
#else
        return strncasecmp(s1.ptr, s2.ptr, s1.length);
#endif
    } else if (s1.length > s2.length) {
        return 1;
    } else {
        return -1;
    }
}

int cx_strcmp_p(
        void const *s1,
        void const *s2
) {
    cxstring const *left = s1;
    cxstring const *right = s2;
    return cx_strcmp(*left, *right);
}

int cx_strcasecmp_p(
        void const *s1,
        void const *s2
) {
    cxstring const *left = s1;
    cxstring const *right = s2;
    return cx_strcasecmp(*left, *right);
}

cxmutstr cx_strdup_a(
        CxAllocator const *allocator,
        cxstring string
) {
    cxmutstr result = {
            cxMalloc(allocator, string.length + 1),
            string.length
    };
    if (result.ptr == NULL) {
        result.length = 0;
        return result;
    }
    memcpy(result.ptr, string.ptr, string.length);
    result.ptr[string.length] = '\0';
    return result;
}

cxstring cx_strtrim(cxstring string) {
    cxstring result = string;
    // TODO: optimize by comparing multiple bytes at once
    while (result.length > 0 && isspace(*result.ptr)) {
        result.ptr++;
        result.length--;
    }
    while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
        result.length--;
    }
    return result;
}

cxmutstr cx_strtrim_m(cxmutstr string) {
    cxstring result = cx_strtrim(cx_strcast(string));
    return (cxmutstr) {(char *) result.ptr, result.length};
}

bool cx_strprefix(
        cxstring string,
        cxstring prefix
) {
    if (string.length < prefix.length) return false;
    return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
}

bool cx_strsuffix(
        cxstring string,
        cxstring suffix
) {
    if (string.length < suffix.length) return false;
    return memcmp(string.ptr + string.length - suffix.length,
                  suffix.ptr, suffix.length) == 0;
}

bool cx_strcaseprefix(
        cxstring string,
        cxstring prefix
) {
    if (string.length < prefix.length) return false;
#ifdef _WIN32
    return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
#else
    return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
#endif
}

bool cx_strcasesuffix(
        cxstring string,
        cxstring suffix
) {
    if (string.length < suffix.length) return false;
#ifdef _WIN32
    return _strnicmp(string.ptr+string.length-suffix.length,
                  suffix.ptr, suffix.length) == 0;
#else
    return strncasecmp(string.ptr + string.length - suffix.length,
                       suffix.ptr, suffix.length) == 0;
#endif
}

void cx_strlower(cxmutstr string) {
    cx_for_n(i, string.length) {
        string.ptr[i] = (char) tolower(string.ptr[i]);
    }
}

void cx_strupper(cxmutstr string) {
    cx_for_n(i, string.length) {
        string.ptr[i] = (char) toupper(string.ptr[i]);
    }
}

#ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
#define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
#endif

struct cx_strreplace_ibuf {
    size_t *buf;
    struct cx_strreplace_ibuf *next;
    unsigned int len;
};

static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
    while (buf) {
        struct cx_strreplace_ibuf *next = buf->next;
        free(buf->buf);
        free(buf);
        buf = next;
    }
}

cxmutstr cx_strreplacen_a(
        CxAllocator const *allocator,
        cxstring str,
        cxstring pattern,
        cxstring replacement,
        size_t replmax
) {

    if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
        return cx_strdup_a(allocator, str);

    // Compute expected buffer length
    size_t ibufmax = str.length / pattern.length;
    size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
    if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
        ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
    }

    // Allocate first index buffer
    struct cx_strreplace_ibuf *firstbuf, *curbuf;
    firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
    if (!firstbuf) return cx_mutstrn(NULL, 0);
    firstbuf->buf = calloc(ibuflen, sizeof(size_t));
    if (!firstbuf->buf) {
        free(firstbuf);
        return cx_mutstrn(NULL, 0);
    }

    // Search occurrences
    cxstring searchstr = str;
    size_t found = 0;
    do {
        cxstring match = cx_strstr(searchstr, pattern);
        if (match.length > 0) {
            // Allocate next buffer in chain, if required
            if (curbuf->len == ibuflen) {
                struct cx_strreplace_ibuf *nextbuf =
                        calloc(1, sizeof(struct cx_strreplace_ibuf));
                if (!nextbuf) {
                    cx_strrepl_free_ibuf(firstbuf);
                    return cx_mutstrn(NULL, 0);
                }
                nextbuf->buf = calloc(ibuflen, sizeof(size_t));
                if (!nextbuf->buf) {
                    free(nextbuf);
                    cx_strrepl_free_ibuf(firstbuf);
                    return cx_mutstrn(NULL, 0);
                }
                curbuf->next = nextbuf;
                curbuf = nextbuf;
            }

            // Record match index
            found++;
            size_t idx = match.ptr - str.ptr;
            curbuf->buf[curbuf->len++] = idx;
            searchstr.ptr = match.ptr + pattern.length;
            searchstr.length = str.length - idx - pattern.length;
        } else {
            break;
        }
    } while (searchstr.length > 0 && found < replmax);

    // Allocate result string
    cxmutstr result;
    {
        ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
        size_t rcount = 0;
        curbuf = firstbuf;
        do {
            rcount += curbuf->len;
            curbuf = curbuf->next;
        } while (curbuf);
        result.length = str.length + rcount * adjlen;
        result.ptr = cxMalloc(allocator, result.length + 1);
        if (!result.ptr) {
            cx_strrepl_free_ibuf(firstbuf);
            return cx_mutstrn(NULL, 0);
        }
    }

    // Build result string
    curbuf = firstbuf;
    size_t srcidx = 0;
    char *destptr = result.ptr;
    do {
        for (size_t i = 0; i < curbuf->len; i++) {
            // Copy source part up to next match
            size_t idx = curbuf->buf[i];
            size_t srclen = idx - srcidx;
            if (srclen > 0) {
                memcpy(destptr, str.ptr + srcidx, srclen);
                destptr += srclen;
                srcidx += srclen;
            }

            // Copy the replacement and skip the source pattern
            srcidx += pattern.length;
            memcpy(destptr, replacement.ptr, replacement.length);
            destptr += replacement.length;
        }
        curbuf = curbuf->next;
    } while (curbuf);
    memcpy(destptr, str.ptr + srcidx, str.length - srcidx);

    // Result is guaranteed to be zero-terminated
    result.ptr[result.length] = '\0';

    // Free index buffer
    cx_strrepl_free_ibuf(firstbuf);

    return result;
}

CxStrtokCtx cx_strtok(
        cxstring str,
        cxstring delim,
        size_t limit
) {
    CxStrtokCtx ctx;
    ctx.str = str;
    ctx.delim = delim;
    ctx.limit = limit;
    ctx.pos = 0;
    ctx.next_pos = 0;
    ctx.delim_pos = 0;
    ctx.found = 0;
    ctx.delim_more = NULL;
    ctx.delim_more_count = 0;
    return ctx;
}

CxStrtokCtx cx_strtok_m(
        cxmutstr str,
        cxstring delim,
        size_t limit
) {
    return cx_strtok(cx_strcast(str), delim, limit);
}

bool cx_strtok_next(
        CxStrtokCtx *ctx,
        cxstring *token
) {
    // abortion criteria
    if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
        return false;
    }

    // determine the search start
    cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);

    // search the next delimiter
    cxstring delim = cx_strstr(haystack, ctx->delim);

    // if found, make delim capture exactly the delimiter
    if (delim.length > 0) {
        delim.length = ctx->delim.length;
    }

    // if more delimiters are specified, check them now
    if (ctx->delim_more_count > 0) {
        cx_for_n(i, ctx->delim_more_count) {
            cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
            if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
                delim.ptr = d.ptr;
                delim.length = ctx->delim_more[i].length;
            }
        }
    }

    // store the token information and adjust the context
    ctx->found++;
    ctx->pos = ctx->next_pos;
    token->ptr = &ctx->str.ptr[ctx->pos];
    ctx->delim_pos = delim.length == 0 ?
                     ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
    token->length = ctx->delim_pos - ctx->pos;
    ctx->next_pos = ctx->delim_pos + delim.length;

    return true;
}

bool cx_strtok_next_m(
        CxStrtokCtx *ctx,
        cxmutstr *token
) {
    return cx_strtok_next(ctx, (cxstring *) token);
}

void cx_strtok_delim(
        CxStrtokCtx *ctx,
        cxstring const *delim,
        size_t count
) {
    ctx->delim_more = delim;
    ctx->delim_more_count = count;
}

mercurial