diff -r 118e2386d5b4 -r 063a9f29098c ucx/string.c
--- a/ucx/string.c	Sat Feb 22 18:10:36 2025 +0100
+++ b/ucx/string.c	Sun Feb 23 14:28:47 2025 +0100
@@ -25,12 +25,10 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
-#define CX_STR_IMPLEMENTATION
 #include "cx/string.h"
 
 #include <string.h>
 #include <stdarg.h>
-#include <ctype.h>
 #include <assert.h>
 #include <errno.h>
 #include <limits.h>
@@ -222,14 +220,9 @@
         cxstring string,
         int chr
 ) {
-    chr = 0xFF & chr;
-    // TODO: improve by comparing multiple bytes at once
-    for (size_t i = 0; i < string.length; i++) {
-        if (string.ptr[i] == chr) {
-            return cx_strsubs(string, i);
-        }
-    }
-    return (cxstring) {NULL, 0};
+    char *ret = memchr(string.ptr, 0xFF & chr, string.length);
+    if (ret == NULL) return (cxstring) {NULL, 0};
+    return (cxstring) {ret, string.length - (ret - string.ptr)};
 }
 
 cxmutstr cx_strchr_m(
@@ -265,7 +258,7 @@
 }
 
 #ifndef CX_STRSTR_SBO_SIZE
-#define CX_STRSTR_SBO_SIZE 512
+#define CX_STRSTR_SBO_SIZE 128
 #endif
 const unsigned cx_strstr_sbo_size = CX_STRSTR_SBO_SIZE;
 
@@ -295,7 +288,7 @@
 
     // check needle length and use appropriate prefix table
     // if the pattern exceeds static prefix table, allocate on the heap
-    bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
+    const bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
     register size_t *ptable = useheap ? calloc(needle.length + 1,
                                                sizeof(size_t)) : s_prefix_table;
 
@@ -334,7 +327,7 @@
     }
 
     // if prefix table was allocated on the heap, free it
-    if (ptable != s_prefix_table) {
+    if (useheap) {
         free(ptable);
     }
 
@@ -512,7 +505,7 @@
     return cx_strcasecmp(*left, *right);
 }
 
-cxmutstr cx_strdup_a(
+cxmutstr cx_strdup_a_(
         const CxAllocator *allocator,
         cxstring string
 ) {
@@ -529,14 +522,19 @@
     return result;
 }
 
+static bool str_isspace(char c) {
+    // TODO: remove once UCX has public API for this
+    return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\v' || c == '\f';
+}
+
 cxstring cx_strtrim(cxstring string) {
     cxstring result = string;
     // TODO: optimize by comparing multiple bytes at once
-    while (result.length > 0 && isspace(*result.ptr)) {
+    while (result.length > 0 && str_isspace(*result.ptr)) {
         result.ptr++;
         result.length--;
     }
-    while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
+    while (result.length > 0 && str_isspace(result.ptr[result.length - 1])) {
         result.length--;
     }
     return result;
@@ -590,18 +588,6 @@
 #endif
 }
 
-void cx_strlower(cxmutstr string) {
-    for (size_t i = 0; i < string.length; i++) {
-        string.ptr[i] = (char) tolower(string.ptr[i]);
-    }
-}
-
-void cx_strupper(cxmutstr string) {
-    for (size_t i = 0; i < string.length; i++) {
-        string.ptr[i] = (char) toupper(string.ptr[i]);
-    }
-}
-
 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
 #endif
@@ -613,6 +599,8 @@
 };
 
 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
+    // remember, the first data is on the stack!
+    buf = buf->next;
     while (buf) {
         struct cx_strreplace_ibuf *next = buf->next;
         free(buf->buf);
@@ -624,49 +612,46 @@
 cxmutstr cx_strreplacen_a(
         const CxAllocator *allocator,
         cxstring str,
-        cxstring pattern,
+        cxstring search,
         cxstring replacement,
         size_t replmax
 ) {
 
-    if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
+    if (search.length == 0 || search.length > str.length || replmax == 0)
         return cx_strdup_a(allocator, str);
 
     // Compute expected buffer length
-    size_t ibufmax = str.length / pattern.length;
+    size_t ibufmax = str.length / search.length;
     size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
     if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
         ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
     }
 
-    // Allocate first index buffer
-    struct cx_strreplace_ibuf *firstbuf, *curbuf;
-    firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
-    if (!firstbuf) return cx_mutstrn(NULL, 0);
-    firstbuf->buf = calloc(ibuflen, sizeof(size_t));
-    if (!firstbuf->buf) {
-        free(firstbuf);
-        return cx_mutstrn(NULL, 0);
-    }
+    // First index buffer can be on the stack
+    struct cx_strreplace_ibuf ibuf, *curbuf = &ibuf;
+    size_t ibuf_sbo[CX_STRREPLACE_INDEX_BUFFER_SIZE];
+    ibuf.buf = ibuf_sbo;
+    ibuf.next = NULL;
+    ibuf.len = 0;
 
     // Search occurrences
     cxstring searchstr = str;
     size_t found = 0;
     do {
-        cxstring match = cx_strstr(searchstr, pattern);
+        cxstring match = cx_strstr(searchstr, search);
         if (match.length > 0) {
             // Allocate next buffer in chain, if required
             if (curbuf->len == ibuflen) {
                 struct cx_strreplace_ibuf *nextbuf =
                         calloc(1, sizeof(struct cx_strreplace_ibuf));
                 if (!nextbuf) {
-                    cx_strrepl_free_ibuf(firstbuf);
+                    cx_strrepl_free_ibuf(&ibuf);
                     return cx_mutstrn(NULL, 0);
                 }
                 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
                 if (!nextbuf->buf) {
                     free(nextbuf);
-                    cx_strrepl_free_ibuf(firstbuf);
+                    cx_strrepl_free_ibuf(&ibuf);
                     return cx_mutstrn(NULL, 0);
                 }
                 curbuf->next = nextbuf;
@@ -677,8 +662,8 @@
             found++;
             size_t idx = match.ptr - str.ptr;
             curbuf->buf[curbuf->len++] = idx;
-            searchstr.ptr = match.ptr + pattern.length;
-            searchstr.length = str.length - idx - pattern.length;
+            searchstr.ptr = match.ptr + search.length;
+            searchstr.length = str.length - idx - search.length;
         } else {
             break;
         }
@@ -687,9 +672,9 @@
     // Allocate result string
     cxmutstr result;
     {
-        ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
+        long long adjlen = (long long) replacement.length - (long long) search.length;
         size_t rcount = 0;
-        curbuf = firstbuf;
+        curbuf = &ibuf;
         do {
             rcount += curbuf->len;
             curbuf = curbuf->next;
@@ -697,13 +682,13 @@
         result.length = str.length + rcount * adjlen;
         result.ptr = cxMalloc(allocator, result.length + 1);
         if (!result.ptr) {
-            cx_strrepl_free_ibuf(firstbuf);
+            cx_strrepl_free_ibuf(&ibuf);
             return cx_mutstrn(NULL, 0);
         }
     }
 
     // Build result string
-    curbuf = firstbuf;
+    curbuf = &ibuf;
     size_t srcidx = 0;
     char *destptr = result.ptr;
     do {
@@ -718,7 +703,7 @@
             }
 
             // Copy the replacement and skip the source pattern
-            srcidx += pattern.length;
+            srcidx += search.length;
             memcpy(destptr, replacement.ptr, replacement.length);
             destptr += replacement.length;
         }
@@ -730,12 +715,12 @@
     result.ptr[result.length] = '\0';
 
     // Free index buffer
-    cx_strrepl_free_ibuf(firstbuf);
+    cx_strrepl_free_ibuf(&ibuf);
 
     return result;
 }
 
-CxStrtokCtx cx_strtok(
+CxStrtokCtx cx_strtok_(
         cxstring str,
         cxstring delim,
         size_t limit
@@ -753,14 +738,6 @@
     return ctx;
 }
 
-CxStrtokCtx cx_strtok_m(
-        cxmutstr str,
-        cxstring delim,
-        size_t limit
-) {
-    return cx_strtok(cx_strcast(str), delim, limit);
-}
-
 bool cx_strtok_next(
         CxStrtokCtx *ctx,
         cxstring *token
@@ -832,25 +809,24 @@
     *output = (rtype) result; \
     return 0
 
-int cx_strtos_lc(cxstring str, short *output, int base, const char *groupsep) {
+int cx_strtos_lc_(cxstring str, short *output, int base, const char *groupsep) {
     cx_strtoX_signed_impl(short, SHRT_MIN, SHRT_MAX);
 }
 
-int cx_strtoi_lc(cxstring str, int *output, int base, const char *groupsep) {
+int cx_strtoi_lc_(cxstring str, int *output, int base, const char *groupsep) {
     cx_strtoX_signed_impl(int, INT_MIN, INT_MAX);
 }
 
-int cx_strtol_lc(cxstring str, long *output, int base, const char *groupsep) {
+int cx_strtol_lc_(cxstring str, long *output, int base, const char *groupsep) {
     cx_strtoX_signed_impl(long, LONG_MIN, LONG_MAX);
 }
 
-int cx_strtoll_lc(cxstring str, long long *output, int base, const char *groupsep) {
+int cx_strtoll_lc_(cxstring str, long long *output, int base, const char *groupsep) {
     // strategy: parse as unsigned, check range, negate if required
     bool neg = false;
     size_t start_unsigned = 0;
 
-    // trim already, to search for a sign character
-    str = cx_strtrim(str);
+    // emptiness check
     if (str.length == 0) {
         errno = EINVAL;
         return -1;
@@ -890,33 +866,23 @@
     }
 }
 
-int cx_strtoi8_lc(cxstring str, int8_t *output, int base, const char *groupsep) {
+int cx_strtoi8_lc_(cxstring str, int8_t *output, int base, const char *groupsep) {
     cx_strtoX_signed_impl(int8_t, INT8_MIN, INT8_MAX);
 }
 
-int cx_strtoi16_lc(cxstring str, int16_t *output, int base, const char *groupsep) {
+int cx_strtoi16_lc_(cxstring str, int16_t *output, int base, const char *groupsep) {
     cx_strtoX_signed_impl(int16_t, INT16_MIN, INT16_MAX);
 }
 
-int cx_strtoi32_lc(cxstring str, int32_t *output, int base, const char *groupsep) {
+int cx_strtoi32_lc_(cxstring str, int32_t *output, int base, const char *groupsep) {
     cx_strtoX_signed_impl(int32_t, INT32_MIN, INT32_MAX);
 }
 
-int cx_strtoi64_lc(cxstring str, int64_t *output, int base, const char *groupsep) {
+int cx_strtoi64_lc_(cxstring str, int64_t *output, int base, const char *groupsep) {
     assert(sizeof(long long) == sizeof(int64_t)); // should be true on all platforms
     return cx_strtoll_lc(str, (long long*) output, base, groupsep);
 }
 
-int cx_strtoz_lc(cxstring str, ssize_t *output, int base, const char *groupsep) {
-#if SSIZE_MAX == INT32_MAX
-    return cx_strtoi32_lc(str, (int32_t*) output, base, groupsep);
-#elif SSIZE_MAX == INT64_MAX
-    return cx_strtoll_lc(str, (long long*) output, base, groupsep);
-#else
-#error "unsupported ssize_t size"
-#endif
-}
-
 #define cx_strtoX_unsigned_impl(rtype, rmax) \
     uint64_t result; \
     if (cx_strtou64_lc(str, &result, base, groupsep)) { \
@@ -929,21 +895,20 @@
     *output = (rtype) result; \
     return 0
 
-int cx_strtous_lc(cxstring str, unsigned short *output, int base, const char *groupsep) {
+int cx_strtous_lc_(cxstring str, unsigned short *output, int base, const char *groupsep) {
     cx_strtoX_unsigned_impl(unsigned short, USHRT_MAX);
 }
 
-int cx_strtou_lc(cxstring str, unsigned int *output, int base, const char *groupsep) {
+int cx_strtou_lc_(cxstring str, unsigned int *output, int base, const char *groupsep) {
     cx_strtoX_unsigned_impl(unsigned int, UINT_MAX);
 }
 
-int cx_strtoul_lc(cxstring str, unsigned long *output, int base, const char *groupsep) {
+int cx_strtoul_lc_(cxstring str, unsigned long *output, int base, const char *groupsep) {
     cx_strtoX_unsigned_impl(unsigned long, ULONG_MAX);
 }
 
-int cx_strtoull_lc(cxstring str, unsigned long long *output, int base, const char *groupsep) {
+int cx_strtoull_lc_(cxstring str, unsigned long long *output, int base, const char *groupsep) {
     // some sanity checks
-    str = cx_strtrim(str);
     if (str.length == 0) {
         errno = EINVAL;
         return -1;
@@ -1021,37 +986,37 @@
     return 0;
 }
 
-int cx_strtou8_lc(cxstring str, uint8_t *output, int base, const char *groupsep) {
+int cx_strtou8_lc_(cxstring str, uint8_t *output, int base, const char *groupsep) {
     cx_strtoX_unsigned_impl(uint8_t, UINT8_MAX);
 }
 
-int cx_strtou16_lc(cxstring str, uint16_t *output, int base, const char *groupsep) {
+int cx_strtou16_lc_(cxstring str, uint16_t *output, int base, const char *groupsep) {
     cx_strtoX_unsigned_impl(uint16_t, UINT16_MAX);
 }
 
-int cx_strtou32_lc(cxstring str, uint32_t *output, int base, const char *groupsep) {
+int cx_strtou32_lc_(cxstring str, uint32_t *output, int base, const char *groupsep) {
     cx_strtoX_unsigned_impl(uint32_t, UINT32_MAX);
 }
 
-int cx_strtou64_lc(cxstring str, uint64_t *output, int base, const char *groupsep) {
+int cx_strtou64_lc_(cxstring str, uint64_t *output, int base, const char *groupsep) {
     assert(sizeof(unsigned long long) == sizeof(uint64_t)); // should be true on all platforms
     return cx_strtoull_lc(str, (unsigned long long*) output, base, groupsep);
 }
 
-int cx_strtouz_lc(cxstring str, size_t *output, int base, const char *groupsep) {
+int cx_strtoz_lc_(cxstring str, size_t *output, int base, const char *groupsep) {
 #if SIZE_MAX == UINT32_MAX
-    return cx_strtou32_lc(str, (uint32_t*) output, base, groupsep);
+    return cx_strtou32_lc_(str, (uint32_t*) output, base, groupsep);
 #elif SIZE_MAX == UINT64_MAX
-    return cx_strtoull_lc(str, (unsigned long long *) output, base, groupsep);
+    return cx_strtoull_lc_(str, (unsigned long long *) output, base, groupsep);
 #else
 #error "unsupported size_t size"
 #endif
 }
 
-int cx_strtof_lc(cxstring str, float *output, char decsep, const char *groupsep) {
+int cx_strtof_lc_(cxstring str, float *output, char decsep, const char *groupsep) {
     // use string to double and add a range check
     double d;
-    int ret = cx_strtod_lc(str, &d, decsep, groupsep);
+    int ret = cx_strtod_lc_(str, &d, decsep, groupsep);
     if (ret != 0) return ret;
     // note: FLT_MIN is the smallest POSITIVE number that can be represented
     double test = d < 0 ? -d : d;
@@ -1063,12 +1028,16 @@
     return 0;
 }
 
-int cx_strtod_lc(cxstring str, double *output, char decsep, const char *groupsep) {
+static bool str_isdigit(char c) {
+    // TODO: remove once UCX has public API for this
+    return c >= '0' && c <= '9';
+}
+
+int cx_strtod_lc_(cxstring str, double *output, char decsep, const char *groupsep) {
     // TODO: overflow check
     // TODO: increase precision
 
-    // trim and check
-    str = cx_strtrim(str);
+    // emptiness check
     if (str.length == 0) {
         errno = EINVAL;
         return -1;
@@ -1096,7 +1065,7 @@
     // parse all digits until we find the decsep
     size_t pos = 0;
     do {
-        if (isdigit(str.ptr[pos])) {
+        if (str_isdigit(str.ptr[pos])) {
             result = result * 10 + (str.ptr[pos] - '0');
         } else if (strchr(groupsep, str.ptr[pos]) == NULL) {
             break;
@@ -1125,7 +1094,7 @@
         // parse everything until exponent or end
         double factor = 1.;
         do {
-            if (isdigit(str.ptr[pos])) {
+            if (str_isdigit(str.ptr[pos])) {
                 factor *= 0.1;
                 result = result + factor * (str.ptr[pos] - '0');
             } else if (strchr(groupsep, str.ptr[pos]) == NULL) {
@@ -1166,7 +1135,7 @@
     // parse the exponent
     unsigned int exp = 0;
     do {
-        if (isdigit(str.ptr[pos])) {
+        if (str_isdigit(str.ptr[pos])) {
             exp = 10 * exp + (str.ptr[pos] - '0');
         } else if (strchr(groupsep, str.ptr[pos]) == NULL) {
             errno = EINVAL;