--- a/src/ucx/string.c Mon Feb 10 17:44:51 2025 +0100
+++ b/src/ucx/string.c Sun Mar 02 18:10:52 2025 +0100
@@ -25,19 +25,21 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
-
 #include "cx/string.h"
-#include "cx/utils.h"
 
 #include <string.h>
 #include <stdarg.h>
-#include <ctype.h>
-
-#ifndef _WIN32
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <float.h>
 
-#include <strings.h> // for strncasecmp()
-
-#endif // _WIN32
+#ifdef _WIN32
+#define cx_strcasecmp_impl _strnicmp
+#else
+#include <strings.h>
+#define cx_strcasecmp_impl strncasecmp
+#endif
 
 cxmutstr cx_mutstr(char *cstring) {
     return (cxmutstr) {cstring, strlen(cstring)};
@@ -61,20 +63,18 @@
     return (cxstring) {cstring, length};
 }
 
-cxstring cx_strcast(cxmutstr str) {
-    return (cxstring) {str.ptr, str.length};
-}
-
 void cx_strfree(cxmutstr *str) {
+    if (str == NULL) return;
     free(str->ptr);
     str->ptr = NULL;
     str->length = 0;
 }
 
 void cx_strfree_a(
-        CxAllocator const *alloc,
+        const CxAllocator *alloc,
         cxmutstr *str
 ) {
+    if (str == NULL) return;
     cxFree(alloc, str->ptr);
     str->ptr = NULL;
     str->length = 0;
@@ -89,8 +89,9 @@
     va_list ap;
     va_start(ap, count);
     size_t size = 0;
-    cx_for_n(i, count) {
+    for (size_t i = 0; i < count; i++) {
         cxstring str = va_arg(ap, cxstring);
+        if (size > SIZE_MAX - str.length) errno = EOVERFLOW; // flag only, the sum below still wraps
         size += str.length;
     }
     va_end(ap);
@@ -99,40 +100,66 @@
 }
 
 cxmutstr cx_strcat_ma(
-        CxAllocator const *alloc,
+        const CxAllocator *alloc,
         cxmutstr str,
         size_t count,
         ...
 ) {
     if (count == 0) return str;
 
-    cxstring *strings = calloc(count, sizeof(cxstring));
-    if (!strings) abort();
+    cxstring strings_stack[8]; // up to 8 arguments are collected without a heap allocation
+    cxstring *strings;
+    if (count > 8) {
+        strings = calloc(count, sizeof(cxstring));
+        if (strings == NULL) {
+            return (cxmutstr) {NULL, 0};
+        }
+    } else {
+        strings = strings_stack;
+    }
 
     va_list ap;
     va_start(ap, count);
 
     // get all args and overall length
+    bool overflow = false;
     size_t slen = str.length;
-    cx_for_n(i, count) {
+    for (size_t i = 0; i < count; i++) {
         cxstring s = va_arg (ap, cxstring);
         strings[i] = s;
+        if (slen > SIZE_MAX - s.length) overflow = true; // test the piece being added, not the base string
         slen += s.length;
     }
     va_end(ap);
 
+    // abort in case of overflow
+    if (overflow) {
+        errno = EOVERFLOW;
+        if (strings != strings_stack) {
+            free(strings);
+        }
+        return (cxmutstr) { NULL, 0 };
+    }
+
     // reallocate or create new string
+    char *newstr; // keeps str.ptr intact until the allocation is known to have succeeded
     if (str.ptr == NULL) {
-        str.ptr = cxMalloc(alloc, slen + 1);
+        newstr = cxMalloc(alloc, slen + 1);
     } else {
-        str.ptr = cxRealloc(alloc, str.ptr, slen + 1);
+        newstr = cxRealloc(alloc, str.ptr, slen + 1);
     }
-    if (str.ptr == NULL) abort();
+    if (newstr == NULL) {
+        if (strings != strings_stack) {
+            free(strings);
+        }
+        return (cxmutstr) {NULL, 0};
+    }
+    str.ptr = newstr;
 
     // concatenate strings
     size_t pos = str.length;
     str.length = slen;
-    cx_for_n(i, count) {
+    for (size_t i = 0; i < count; i++) {
         cxstring s = strings[i];
         memcpy(str.ptr + pos, s.ptr, s.length);
         pos += s.length;
@@ -142,7 +169,9 @@
     str.ptr[str.length] = '\0';
 
     // free temporary array
-    free(strings);
+    if (strings != strings_stack) {
+        free(strings);
+    }
 
     return str;
 }
@@ -191,14 +220,9 @@
         cxstring string,
         int chr
 ) {
-    chr = 0xFF & chr;
-    // TODO: improve by comparing multiple bytes at once
-    cx_for_n(i, string.length) {
-        if (string.ptr[i] == chr) {
-            return cx_strsubs(string, i);
-        }
-    }
-    return (cxstring) {NULL, 0};
+    char *ret = memchr(string.ptr, 0xFF & chr, string.length);
+    if (ret == NULL) return (cxstring) {NULL, 0};
+    return (cxstring) {ret, string.length - (ret - string.ptr)};
 }
 
 cxmutstr cx_strchr_m(
@@ -234,8 +258,9 @@
 }
 
 #ifndef CX_STRSTR_SBO_SIZE
-#define CX_STRSTR_SBO_SIZE 512
+#define CX_STRSTR_SBO_SIZE 128
 #endif
+const unsigned cx_strstr_sbo_size = CX_STRSTR_SBO_SIZE;
 
 cxstring cx_strstr(
         cxstring haystack,
@@ -263,7 +288,7 @@
 
     // check needle length and use appropriate prefix table
     // if the pattern exceeds static prefix table, allocate on the heap
-    bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
+    const bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
     register size_t *ptable = useheap ? calloc(needle.length + 1,
                                                sizeof(size_t)) : s_prefix_table;
 
@@ -302,7 +327,7 @@
     }
 
     // if prefix table was allocated on the heap, free it
-    if (ptable != s_prefix_table) {
+    if (useheap) {
         free(ptable);
     }
 
@@ -376,7 +401,7 @@
 }
 
 size_t cx_strsplit_a(
-        CxAllocator const *allocator,
+        const CxAllocator *allocator,
         cxstring string,
         cxstring delim,
         size_t limit,
@@ -418,7 +443,7 @@
 }
 
 size_t cx_strsplit_ma(
-        CxAllocator const *allocator,
+        const CxAllocator *allocator,
         cxmutstr string,
         cxstring delim,
         size_t limit,
@@ -433,10 +458,14 @@
         cxstring s2
 ) {
     if (s1.length == s2.length) {
-        return memcmp(s1.ptr, s2.ptr, s1.length);
+        return strncmp(s1.ptr, s2.ptr, s1.length);
     } else if (s1.length > s2.length) {
+        int r = strncmp(s1.ptr, s2.ptr, s2.length); // common prefix decides first, length only breaks ties
+        if (r != 0) return r;
         return 1;
     } else {
+        int r = strncmp(s1.ptr, s2.ptr, s1.length);
+        if (r != 0) return r;
         return -1;
     }
 }
@@ -446,38 +475,38 @@
         cxstring s2
 ) {
     if (s1.length == s2.length) {
-#ifdef _WIN32
-        return _strnicmp(s1.ptr, s2.ptr, s1.length);
-#else
-        return strncasecmp(s1.ptr, s2.ptr, s1.length);
-#endif
+        return cx_strcasecmp_impl(s1.ptr, s2.ptr, s1.length);
     } else if (s1.length > s2.length) {
+        int r = cx_strcasecmp_impl(s1.ptr, s2.ptr, s2.length); // common prefix decides first, length only breaks ties
+        if (r != 0) return r;
         return 1;
     } else {
+        int r = cx_strcasecmp_impl(s1.ptr, s2.ptr, s1.length);
+        if (r != 0) return r;
         return -1;
     }
 }
 
 int cx_strcmp_p(
-        void const *s1,
-        void const *s2
+        const void *s1,
+        const void *s2
 ) {
-    cxstring const *left = s1;
-    cxstring const *right = s2;
+    const cxstring *left = s1;
+    const cxstring *right = s2;
     return cx_strcmp(*left, *right);
 }
 
 int cx_strcasecmp_p(
-        void const *s1,
-        void const *s2
+        const void *s1,
+        const void *s2
 ) {
-    cxstring const *left = s1;
-    cxstring const *right = s2;
+    const cxstring *left = s1;
+    const cxstring *right = s2;
     return cx_strcasecmp(*left, *right);
 }
 
-cxmutstr cx_strdup_a(
-        CxAllocator const *allocator,
+cxmutstr cx_strdup_a_(
+        const CxAllocator *allocator,
         cxstring string
 ) {
     cxmutstr result = {
@@ -493,14 +522,19 @@
     return result;
 }
 
+static bool str_isspace(char c) {
+    // TODO: remove once UCX has public API for this
+    return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\v' || c == '\f';
+}
+
 cxstring cx_strtrim(cxstring string) {
     cxstring result = string;
     // TODO: optimize by comparing multiple bytes at once
-    while (result.length > 0 && isspace(*result.ptr)) {
+    while (result.length > 0 && str_isspace(*result.ptr)) {
         result.ptr++;
         result.length--;
     }
-    while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
+    while (result.length > 0 && str_isspace(result.ptr[result.length - 1])) {
         result.length--;
     }
     return result;
@@ -554,18 +588,6 @@
 #endif
 }
 
-void cx_strlower(cxmutstr string) {
-    cx_for_n(i, string.length) {
-        string.ptr[i] = (char) tolower(string.ptr[i]);
-    }
-}
-
-void cx_strupper(cxmutstr string) {
-    cx_for_n(i, string.length) {
-        string.ptr[i] = (char) toupper(string.ptr[i]);
-    }
-}
-
 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
 #endif
@@ -577,6 +599,8 @@
 };
 
 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
+    // remember, the first data is on the stack!
+    buf = buf->next;
     while (buf) {
         struct cx_strreplace_ibuf *next = buf->next;
         free(buf->buf);
@@ -586,51 +610,48 @@
 }
 
 cxmutstr cx_strreplacen_a(
-        CxAllocator const *allocator,
+        const CxAllocator *allocator,
         cxstring str,
-        cxstring pattern,
+        cxstring search,
         cxstring replacement,
         size_t replmax
 ) {
 
-    if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
+    if (search.length == 0 || search.length > str.length || replmax == 0)
         return cx_strdup_a(allocator, str);
 
     // Compute expected buffer length
-    size_t ibufmax = str.length / pattern.length;
+    size_t ibufmax = str.length / search.length;
     size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
     if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
         ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
     }
 
-    // Allocate first index buffer
-    struct cx_strreplace_ibuf *firstbuf, *curbuf;
-    firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
-    if (!firstbuf) return cx_mutstrn(NULL, 0);
-    firstbuf->buf = calloc(ibuflen, sizeof(size_t));
-    if (!firstbuf->buf) {
-        free(firstbuf);
-        return cx_mutstrn(NULL, 0);
-    }
+    // First index buffer can be on the stack
+    struct cx_strreplace_ibuf ibuf, *curbuf = &ibuf;
+    size_t ibuf_sbo[CX_STRREPLACE_INDEX_BUFFER_SIZE];
+    ibuf.buf = ibuf_sbo;
+    ibuf.next = NULL;
+    ibuf.len = 0;
 
     // Search occurrences
     cxstring searchstr = str;
     size_t found = 0;
     do {
-        cxstring match = cx_strstr(searchstr, pattern);
+        cxstring match = cx_strstr(searchstr, search);
         if (match.length > 0) {
             // Allocate next buffer in chain, if required
             if (curbuf->len == ibuflen) {
                 struct cx_strreplace_ibuf *nextbuf =
                         calloc(1, sizeof(struct cx_strreplace_ibuf));
                 if (!nextbuf) {
-                    cx_strrepl_free_ibuf(firstbuf);
+                    cx_strrepl_free_ibuf(&ibuf);
                     return cx_mutstrn(NULL, 0);
                 }
                 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
                 if (!nextbuf->buf) {
                     free(nextbuf);
-                    cx_strrepl_free_ibuf(firstbuf);
+                    cx_strrepl_free_ibuf(&ibuf);
                     return cx_mutstrn(NULL, 0);
                 }
                 curbuf->next = nextbuf;
@@ -641,8 +662,8 @@
             found++;
             size_t idx = match.ptr - str.ptr;
             curbuf->buf[curbuf->len++] = idx;
-            searchstr.ptr = match.ptr + pattern.length;
-            searchstr.length = str.length - idx - pattern.length;
+            searchstr.ptr = match.ptr + search.length;
+            searchstr.length = str.length - idx - search.length;
         } else {
             break;
         }
@@ -651,9 +672,9 @@
     // Allocate result string
     cxmutstr result;
     {
-        ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
+        long long adjlen = (long long) replacement.length - (long long) search.length;
         size_t rcount = 0;
-        curbuf = firstbuf;
+        curbuf = &ibuf;
         do {
             rcount += curbuf->len;
             curbuf = curbuf->next;
@@ -661,13 +682,13 @@
         result.length = str.length + rcount * adjlen;
         result.ptr = cxMalloc(allocator, result.length + 1);
         if (!result.ptr) {
-            cx_strrepl_free_ibuf(firstbuf);
+            cx_strrepl_free_ibuf(&ibuf);
             return cx_mutstrn(NULL, 0);
         }
     }
 
     // Build result string
-    curbuf = firstbuf;
+    curbuf = &ibuf;
     size_t srcidx = 0;
     char *destptr = result.ptr;
     do {
@@ -682,7 +703,7 @@
             }
 
             // Copy the replacement and skip the source pattern
-            srcidx += pattern.length;
+            srcidx += search.length;
             memcpy(destptr, replacement.ptr, replacement.length);
             destptr += replacement.length;
         }
@@ -694,12 +715,12 @@
     result.ptr[result.length] = '\0';
 
     // Free index buffer
-    cx_strrepl_free_ibuf(firstbuf);
+    cx_strrepl_free_ibuf(&ibuf);
 
     return result;
 }
 
-CxStrtokCtx cx_strtok(
+CxStrtokCtx cx_strtok_(
         cxstring str,
         cxstring delim,
         size_t limit
@@ -717,14 +738,6 @@
     return ctx;
 }
 
-CxStrtokCtx cx_strtok_m(
-        cxmutstr str,
-        cxstring delim,
-        size_t limit
-) {
-    return cx_strtok(cx_strcast(str), delim, limit);
-}
-
 bool cx_strtok_next(
         CxStrtokCtx *ctx,
         cxstring *token
@@ -747,7 +760,7 @@
 
     // if more delimiters are specified, check them now
     if (ctx->delim_more_count > 0) {
-        cx_for_n(i, ctx->delim_more_count) {
+        for (size_t i = 0; i < ctx->delim_more_count; i++) {
             cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
             if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
                 delim.ptr = d.ptr;
@@ -777,9
+790,394 @@
 
 void cx_strtok_delim(
         CxStrtokCtx *ctx,
-        cxstring const *delim,
+        const cxstring *delim,
         size_t count
 ) {
     ctx->delim_more = delim;
     ctx->delim_more_count = count;
 }
+
+// Shared body of the narrow signed parsers: parse as long long, then
+// reject values outside [rmin, rmax] with ERANGE before narrowing.
+#define cx_strtoX_signed_impl(rtype, rmin, rmax) \
+    long long result; \
+    if (cx_strtoll_lc(str, &result, base, groupsep)) { \
+        return -1; \
+    } \
+    if (result < rmin || result > rmax) { \
+        errno = ERANGE; \
+        return -1; \
+    } \
+    *output = (rtype) result; \
+    return 0
+
+int cx_strtos_lc_(cxstring str, short *output, int base, const char *groupsep) {
+    cx_strtoX_signed_impl(short, SHRT_MIN, SHRT_MAX);
+}
+
+int cx_strtoi_lc_(cxstring str, int *output, int base, const char *groupsep) {
+    cx_strtoX_signed_impl(int, INT_MIN, INT_MAX);
+}
+
+int cx_strtol_lc_(cxstring str, long *output, int base, const char *groupsep) {
+    cx_strtoX_signed_impl(long, LONG_MIN, LONG_MAX);
+}
+
+// Parses a signed integer: an optional '-' followed by whatever
+// cx_strtoull_lc accepts. Returns 0 on success; -1 with errno set to
+// EINVAL (malformed input) or ERANGE (value does not fit) otherwise.
+int cx_strtoll_lc_(cxstring str, long long *output, int base, const char *groupsep) {
+    // strategy: parse as unsigned, check range, negate if required
+    bool neg = false;
+    size_t start_unsigned = 0;
+
+    // emptiness check
+    if (str.length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    // test if we have a negative sign character
+    if (str.ptr[start_unsigned] == '-') {
+        neg = true;
+        start_unsigned++;
+        // must not be followed by positive sign character
+        if (str.length == 1 || str.ptr[start_unsigned] == '+') {
+            errno = EINVAL;
+            return -1;
+        }
+    }
+
+    // now parse the number with strtoull
+    unsigned long long v;
+    cxstring ustr = start_unsigned == 0 ? str
+        : cx_strn(str.ptr + start_unsigned, str.length - start_unsigned);
+    int ret = cx_strtoull_lc(ustr, &v, base, groupsep);
+    if (ret != 0) return ret;
+    if (neg) {
+        if (v > (unsigned long long) LLONG_MAX + 1) { // magnitude of LLONG_MIN; no wrap for v == 0
+            errno = ERANGE;
+            return -1;
+        }
+        *output = v == 0 ? 0 : -(long long) (v - 1) - 1; // never negates a value above LLONG_MAX
+        return 0;
+    } else {
+        if (v > LLONG_MAX) {
+            errno = ERANGE;
+            return -1;
+        }
+        *output = (long long) v;
+        return 0;
+    }
+}
+
+int cx_strtoi8_lc_(cxstring str, int8_t *output, int base, const char *groupsep) {
+    cx_strtoX_signed_impl(int8_t, INT8_MIN, INT8_MAX);
+}
+
+int cx_strtoi16_lc_(cxstring str, int16_t *output, int base, const char *groupsep) {
+    cx_strtoX_signed_impl(int16_t, INT16_MIN, INT16_MAX);
+}
+
+int cx_strtoi32_lc_(cxstring str, int32_t *output, int base, const char *groupsep) {
+    cx_strtoX_signed_impl(int32_t, INT32_MIN, INT32_MAX);
+}
+
+int cx_strtoi64_lc_(cxstring str, int64_t *output, int base, const char *groupsep) {
+    assert(sizeof(long long) == sizeof(int64_t)); // should be true on all platforms
+    // parse into a real long long; int64_t may be plain long, and writing
+    // through a casted pointer would break the strict aliasing rule
+    long long result;
+    int ret = cx_strtoll_lc(str, &result, base, groupsep);
+    if (ret == 0) *output = (int64_t) result;
+    return ret;
+}
+
+// Shared body of the narrow unsigned parsers: parse as uint64_t, then
+// reject values above rmax with ERANGE before narrowing.
+#define cx_strtoX_unsigned_impl(rtype, rmax) \
+    uint64_t result; \
+    if (cx_strtou64_lc(str, &result, base, groupsep)) { \
+        return -1; \
+    } \
+    if (result > rmax) { \
+        errno = ERANGE; \
+        return -1; \
+    } \
+    *output = (rtype) result; \
+    return 0
+
+int cx_strtous_lc_(cxstring str, unsigned short *output, int base, const char *groupsep) {
+    cx_strtoX_unsigned_impl(unsigned short, USHRT_MAX);
+}
+
+int cx_strtou_lc_(cxstring str, unsigned int *output, int base, const char *groupsep) {
+    cx_strtoX_unsigned_impl(unsigned int, UINT_MAX);
+}
+
+int cx_strtoul_lc_(cxstring str, unsigned long *output, int base, const char *groupsep) {
+    cx_strtoX_unsigned_impl(unsigned long, ULONG_MAX);
+}
+
+// Parses an unsigned integer in base 2, 8, 10 or 16. Accepts a leading
+// '+', the prefixes b/0b (base 2) and x/0x/# (base 16), and skips any
+// character contained in groupsep. Returns 0 on success; -1 with errno
+// set to EINVAL (malformed input) or ERANGE (overflow) otherwise.
+int cx_strtoull_lc_(cxstring str, unsigned long long *output, int base, const char *groupsep) {
+    // some sanity checks
+    if (str.length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (!(base == 2 || base == 8 || base == 10 || base == 16)) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (groupsep == NULL) groupsep = "";
+
+    // find the actual start of the number
+    if (str.ptr[0] == '+') {
+        str.ptr++;
+        str.length--;
+        if (str.length == 0) {
+            errno = EINVAL;
+            return -1;
+        }
+    }
+    size_t start = 0;
+
+    // if base is 2 or 16, some leading stuff may appear
+    if (base == 2) {
+        if ((str.ptr[0] | 32) == 'b') {
+            start = 1;
+        } else if (str.ptr[0] == '0' && str.length > 1) {
+            if ((str.ptr[1] | 32) == 'b') {
+                start = 2;
+            }
+        }
+    } else if (base == 16) {
+        if ((str.ptr[0] | 32) == 'x' || str.ptr[0] == '#') {
+            start = 1;
+        } else if (str.ptr[0] == '0' && str.length > 1) {
+            if ((str.ptr[1] | 32) == 'x') {
+                start = 2;
+            }
+        }
+    }
+
+    // check if there are digits left
+    if (start >= str.length) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    // now parse the number
+    unsigned long long result = 0;
+    for (size_t i = start; i < str.length; i++) {
+        // ignore group separators
+        if (strchr(groupsep, str.ptr[i])) continue;
+
+        // determine the digit value of the character
+        // ('9' is bounded because ':'..'?' would otherwise pass as hex 10..15)
+        unsigned char c = str.ptr[i];
+        if (c >= 'a') c = 10 + (c - 'a');
+        else if (c >= 'A') c = 10 + (c - 'A');
+        else if (c >= '0' && c <= '9') c = c - '0';
+        else c = 255;
+        if (c >= base) {
+            errno = EINVAL;
+            return -1;
+        }
+
+        // now combine the digit with what we already have
+        unsigned long right = (result & 0xff) * base + c;
+        unsigned long long left = (result >> 8) * base + (right >> 8);
+        if (left > (ULLONG_MAX >> 8)) {
+            errno = ERANGE;
+            return -1;
+        }
+        result = (left << 8) + (right & 0xff);
+    }
+
+    *output = result;
+    return 0;
+}
+
+int cx_strtou8_lc_(cxstring str, uint8_t *output, int base, const char *groupsep) {
+    cx_strtoX_unsigned_impl(uint8_t, UINT8_MAX);
+}
+
+int cx_strtou16_lc_(cxstring str, uint16_t *output, int base, const char *groupsep) {
+    cx_strtoX_unsigned_impl(uint16_t, UINT16_MAX);
+}
+
+int cx_strtou32_lc_(cxstring str, uint32_t *output, int base, const char *groupsep) {
+    cx_strtoX_unsigned_impl(uint32_t, UINT32_MAX);
+}
+
+int cx_strtou64_lc_(cxstring str, uint64_t *output, int base, const char *groupsep) {
+    assert(sizeof(unsigned long long) == sizeof(uint64_t)); // should be true on all platforms
+    // parse into a real unsigned long long instead of aliasing *output
+    unsigned long long result;
+    int ret = cx_strtoull_lc(str, &result, base, groupsep);
+    if (ret == 0) *output = (uint64_t) result;
+    return ret;
+}
+
+int cx_strtoz_lc_(cxstring str, size_t *output, int base, const char *groupsep) {
+    // the range-checked 64 bit path covers size_t without casting the
+    // output pointer to a different integer type
+    cx_strtoX_unsigned_impl(size_t, SIZE_MAX);
+}
+
+// Parses a float via cx_strtod_lc_ and rejects magnitudes outside the
+// normalized float range (zero excepted) with ERANGE.
+int cx_strtof_lc_(cxstring str, float *output, char decsep, const char *groupsep) {
+    // use string to double and add a range check
+    double d;
+    int ret = cx_strtod_lc_(str, &d, decsep, groupsep);
+    if (ret != 0) return ret;
+    // note: FLT_MIN is the smallest POSITIVE number that can be represented
+    // zero lies below FLT_MIN but is exactly representable, so let it pass
+    double test = d < 0 ? -d : d;
+    if ((test != 0. && test < FLT_MIN) || test > FLT_MAX) {
+        errno = ERANGE;
+        return -1;
+    }
+    *output = (float) d;
+    return 0;
+}
+
+static bool str_isdigit(char c) {
+    // TODO: remove once UCX has public API for this
+    return c >= '0' && c <= '9';
+}
+
+// Parses a decimal floating point number: [sign] digits [decsep digits]
+// [e|E [sign] digits], skipping characters contained in groupsep.
+// Returns 0 on success, -1 with errno set to EINVAL otherwise.
+int cx_strtod_lc_(cxstring str, double *output, char decsep, const char *groupsep) {
+    // TODO: overflow check
+    // TODO: increase precision
+
+    // no separator list means no separators (same default as cx_strtoull_lc_)
+    if (groupsep == NULL) groupsep = "";
+
+    // emptiness check
+    if (str.length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    double result = 0.;
+    int sign = 1;
+
+    // check if there is a sign
+    if (str.ptr[0] == '-') {
+        sign = -1;
+        str.ptr++;
+        str.length--;
+    } else if (str.ptr[0] == '+') {
+        str.ptr++;
+        str.length--;
+    }
+
+    // there must be at least one char to parse
+    if (str.length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    // parse all digits until we find the decsep
+    size_t pos = 0;
+    do {
+        if (str_isdigit(str.ptr[pos])) {
+            result = result * 10 + (str.ptr[pos] - '0');
+        } else if (strchr(groupsep, str.ptr[pos]) == NULL) {
+            break;
+        }
+    } while (++pos < str.length);
+
+    // already done?
+    if (pos == str.length) {
+        *output = result * sign;
+        return 0;
+    }
+
+    // is the next char the decsep?
+    if (str.ptr[pos] == decsep) {
+        pos++;
+        // it may end with the decsep, if it did not start with it
+        if (pos == str.length) {
+            if (str.length == 1) {
+                errno = EINVAL;
+                return -1;
+            } else {
+                *output = result * sign;
+                return 0;
+            }
+        }
+        // parse everything until exponent or end
+        double factor = 1.;
+        do {
+            if (str_isdigit(str.ptr[pos])) {
+                factor *= 0.1;
+                result = result + factor * (str.ptr[pos] - '0');
+            } else if (strchr(groupsep, str.ptr[pos]) == NULL) {
+                break;
+            }
+        } while (++pos < str.length);
+    }
+
+    // no exponent?
+    if (pos == str.length) {
+        *output = result * sign;
+        return 0;
+    }
+
+    // now the next separator MUST be the exponent separator
+    // and at least one char must follow
+    if ((str.ptr[pos] | 32) != 'e' || str.length <= pos + 1) {
+        errno = EINVAL;
+        return -1;
+    }
+    pos++;
+
+    // check if we have a sign for the exponent
+    double factor = 10.;
+    if (str.ptr[pos] == '-') {
+        factor = .1;
+        pos++;
+    } else if (str.ptr[pos] == '+') {
+        pos++;
+    }
+
+    // at least one digit must follow
+    if (pos == str.length) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    // parse the exponent
+    unsigned int exp = 0;
+    do {
+        if (str_isdigit(str.ptr[pos])) {
+            exp = 10 * exp + (str.ptr[pos] - '0');
+        } else if (strchr(groupsep, str.ptr[pos]) == NULL) {
+            errno = EINVAL;
+            return -1;
+        }
+    } while (++pos < str.length);
+
+    // apply the exponent by fast exponentiation
+    do {
+        if (exp & 1) {
+            result *= factor;
+        }
+        factor *= factor;
+    } while ((exp >>= 1) > 0);
+
+    // store the result and exit
+    *output = result * sign;
+    return 0;
+}