string.c - UNIXwork Code

1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 #include "cx/string.h" 30 #include "cx/utils.h" 31 32 #include <string.h> 33 #include <stdarg.h> 34 #include <ctype.h> 35 36 #ifndef _WIN32 37 38 #include <strings.h> // for strncasecmp() 39 40 #endif // _WIN32 41 42 cxmutstr cx_mutstr(char *cstring) { 43 return (cxmutstr) {cstring, strlen(cstring)}; 44 } 45 46 cxmutstr cx_mutstrn( 47 char *cstring, 48 size_t length 49 ) { 50 return (cxmutstr) {cstring, length}; 51 } 52 53 cxstring cx_str(const char *cstring) { 54 return (cxstring) {cstring, strlen(cstring)}; 55 } 56 57 cxstring cx_strn( 58 const char *cstring, 59 size_t length 60 ) { 61 return (cxstring) {cstring, length}; 62 } 63 64 cxstring cx_strcast(cxmutstr str) { 65 return (cxstring) {str.ptr, str.length}; 66 } 67 68 void cx_strfree(cxmutstr *str) { 69 free(str->ptr); 70 str->ptr = NULL; 71 str->length = 0; 72 } 73 74 void cx_strfree_a( 75 const CxAllocator *alloc, 76 cxmutstr *str 77 ) { 78 cxFree(alloc, str->ptr); 79 str->ptr = NULL; 80 str->length = 0; 81 } 82 83 size_t cx_strlen( 84 size_t count, 85 ... 86 ) { 87 if (count == 0) return 0; 88 89 va_list ap; 90 va_start(ap, count); 91 size_t size = 0; 92 cx_for_n(i, count) { 93 cxstring str = va_arg(ap, cxstring); 94 size += str.length; 95 } 96 va_end(ap); 97 98 return size; 99 } 100 101 cxmutstr cx_strcat_ma( 102 const CxAllocator *alloc, 103 cxmutstr str, 104 size_t count, 105 ... 106 ) { 107 if (count == 0) return str; 108 109 cxstring *strings = calloc(count, sizeof(cxstring)); 110 if (!strings) abort(); 111 112 va_list ap; 113 va_start(ap, count); 114 115 // get all args and overall length 116 size_t slen = str.length; 117 cx_for_n(i, count) { 118 cxstring s = va_arg (ap, cxstring); 119 strings[i] = s; 120 slen += s.length; 121 } 122 va_end(ap); 123 124 // reallocate or create new string 125 if (str.ptr == NULL) { 126 str.ptr = cxMalloc(alloc, slen + 1); 127 } else { 128 str.ptr = cxRealloc(alloc, str.ptr, slen + 1); 129 } 130 if (str.ptr == NULL) abort(); 131 132 // concatenate strings 133 size_t pos = str.length; 134 str.length = slen; 135 cx_for_n(i, count) { 136 cxstring s = strings[i]; 137 memcpy(str.ptr + pos, s.ptr, s.length); 138 pos += s.length; 139 } 140 141 // terminate string 142 str.ptr[str.length] = '\0'; 143 144 // free temporary array 145 free(strings); 146 147 return str; 148 } 149 150 cxstring cx_strsubs( 151 cxstring string, 152 size_t start 153 ) { 154 return cx_strsubsl(string, start, string.length - start); 155 } 156 157 cxmutstr cx_strsubs_m( 158 cxmutstr string, 159 size_t start 160 ) { 161 return cx_strsubsl_m(string, start, string.length - start); 162 } 163 164 cxstring cx_strsubsl( 165 cxstring string, 166 size_t start, 167 size_t length 168 ) { 169 if (start > string.length) { 170 return (cxstring) {NULL, 0}; 171 } 172 173 size_t rem_len = string.length - start; 174 if (length > rem_len) { 175 length = rem_len; 176 } 177 178 return (cxstring) {string.ptr + start, length}; 179 } 180 181 cxmutstr cx_strsubsl_m( 182 cxmutstr string, 183 size_t start, 184 size_t length 185 ) { 186 cxstring result = cx_strsubsl(cx_strcast(string), start, length); 187 return (cxmutstr) {(char *) result.ptr, result.length}; 188 } 189 190 cxstring cx_strchr( 191 cxstring string, 192 int chr 193 ) { 194 chr = 0xFF & chr; 195 // TODO: improve by comparing multiple bytes at once 196 cx_for_n(i, string.length) { 197 if (string.ptr[i] == chr) { 198 return cx_strsubs(string, i); 199 } 200 } 201 return (cxstring) {NULL, 0}; 202 } 203 204 cxmutstr cx_strchr_m( 205 cxmutstr string, 206 int chr 207 ) { 208 cxstring result = cx_strchr(cx_strcast(string), chr); 209 return (cxmutstr) {(char *) result.ptr, result.length}; 210 } 211 212 cxstring cx_strrchr( 213 cxstring string, 214 int chr 215 ) { 216 chr = 0xFF & chr; 217 size_t i = string.length; 218 while (i > 0) { 219 i--; 220 // TODO: improve by comparing multiple bytes at once 221 if (string.ptr[i] == chr) { 222 return cx_strsubs(string, i); 223 } 224 } 225 return (cxstring) {NULL, 0}; 226 } 227 228 cxmutstr cx_strrchr_m( 229 cxmutstr string, 230 int chr 231 ) { 232 cxstring result = cx_strrchr(cx_strcast(string), chr); 233 return (cxmutstr) {(char *) result.ptr, result.length}; 234 } 235 236 #ifndef CX_STRSTR_SBO_SIZE 237 #define CX_STRSTR_SBO_SIZE 512 238 #endif 239 unsigned const cx_strstr_sbo_size = CX_STRSTR_SBO_SIZE; 240 241 cxstring cx_strstr( 242 cxstring haystack, 243 cxstring needle 244 ) { 245 if (needle.length == 0) { 246 return haystack; 247 } 248 249 // optimize for single-char needles 250 if (needle.length == 1) { 251 return cx_strchr(haystack, *needle.ptr); 252 } 253 254 /* 255 * IMPORTANT: 256 * Our prefix table contains the prefix length PLUS ONE 257 * this is our decision, because we want to use the full range of size_t. 258 * The original algorithm needs a (-1) at one single place, 259 * and we want to avoid that. 260 */ 261 262 // local prefix table 263 size_t s_prefix_table[CX_STRSTR_SBO_SIZE]; 264 265 // check needle length and use appropriate prefix table 266 // if the pattern exceeds static prefix table, allocate on the heap 267 bool useheap = needle.length >= CX_STRSTR_SBO_SIZE; 268 register size_t *ptable = useheap ? calloc(needle.length + 1, 269 sizeof(size_t)) : s_prefix_table; 270 271 // keep counter in registers 272 register size_t i, j; 273 274 // fill prefix table 275 i = 0; 276 j = 0; 277 ptable[i] = j; 278 while (i < needle.length) { 279 while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) { 280 j = ptable[j - 1]; 281 } 282 i++; 283 j++; 284 ptable[i] = j; 285 } 286 287 // search 288 cxstring result = {NULL, 0}; 289 i = 0; 290 j = 1; 291 while (i < haystack.length) { 292 while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) { 293 j = ptable[j - 1]; 294 } 295 i++; 296 j++; 297 if (j - 1 == needle.length) { 298 size_t start = i - needle.length; 299 result.ptr = haystack.ptr + start; 300 result.length = haystack.length - start; 301 break; 302 } 303 } 304 305 // if prefix table was allocated on the heap, free it 306 if (ptable != s_prefix_table) { 307 free(ptable); 308 } 309 310 return result; 311 } 312 313 cxmutstr cx_strstr_m( 314 cxmutstr haystack, 315 cxstring needle 316 ) { 317 cxstring result = cx_strstr(cx_strcast(haystack), needle); 318 return (cxmutstr) {(char *) result.ptr, result.length}; 319 } 320 321 size_t cx_strsplit( 322 cxstring string, 323 cxstring delim, 324 size_t limit, 325 cxstring *output 326 ) { 327 // special case: output limit is zero 328 if (limit == 0) return 0; 329 330 // special case: delimiter is empty 331 if (delim.length == 0) { 332 output[0] = string; 333 return 1; 334 } 335 336 // special cases: delimiter is at least as large as the string 337 if (delim.length >= string.length) { 338 // exact match 339 if (cx_strcmp(string, delim) == 0) { 340 output[0] = cx_strn(string.ptr, 0); 341 output[1] = cx_strn(string.ptr + string.length, 0); 342 return 2; 343 } else { 344 // no match possible 345 output[0] = string; 346 return 1; 347 } 348 } 349 350 size_t n = 0; 351 cxstring curpos = string; 352 while (1) { 353 ++n; 354 cxstring match = cx_strstr(curpos, delim); 355 if (match.length > 0) { 356 // is the limit reached? 357 if (n < limit) { 358 // copy the current string to the array 359 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr); 360 output[n - 1] = item; 361 size_t processed = item.length + delim.length; 362 curpos.ptr += processed; 363 curpos.length -= processed; 364 } else { 365 // limit reached, copy the _full_ remaining string 366 output[n - 1] = curpos; 367 break; 368 } 369 } else { 370 // no more matches, copy last string 371 output[n - 1] = curpos; 372 break; 373 } 374 } 375 376 return n; 377 } 378 379 size_t cx_strsplit_a( 380 const CxAllocator *allocator, 381 cxstring string, 382 cxstring delim, 383 size_t limit, 384 cxstring **output 385 ) { 386 // find out how many splits we're going to make and allocate memory 387 size_t n = 0; 388 cxstring curpos = string; 389 while (1) { 390 ++n; 391 cxstring match = cx_strstr(curpos, delim); 392 if (match.length > 0) { 393 // is the limit reached? 394 if (n < limit) { 395 size_t processed = match.ptr - curpos.ptr + delim.length; 396 curpos.ptr += processed; 397 curpos.length -= processed; 398 } else { 399 // limit reached 400 break; 401 } 402 } else { 403 // no more matches 404 break; 405 } 406 } 407 *output = cxCalloc(allocator, n, sizeof(cxstring)); 408 return cx_strsplit(string, delim, n, *output); 409 } 410 411 size_t cx_strsplit_m( 412 cxmutstr string, 413 cxstring delim, 414 size_t limit, 415 cxmutstr *output 416 ) { 417 return cx_strsplit(cx_strcast(string), 418 delim, limit, (cxstring *) output); 419 } 420 421 size_t cx_strsplit_ma( 422 const CxAllocator *allocator, 423 cxmutstr string, 424 cxstring delim, 425 size_t limit, 426 cxmutstr **output 427 ) { 428 return cx_strsplit_a(allocator, cx_strcast(string), 429 delim, limit, (cxstring **) output); 430 } 431 432 int cx_strcmp( 433 cxstring s1, 434 cxstring s2 435 ) { 436 if (s1.length == s2.length) { 437 return memcmp(s1.ptr, s2.ptr, s1.length); 438 } else if (s1.length > s2.length) { 439 return 1; 440 } else { 441 return -1; 442 } 443 } 444 445 int cx_strcasecmp( 446 cxstring s1, 447 cxstring s2 448 ) { 449 if (s1.length == s2.length) { 450 #ifdef _WIN32 451 return _strnicmp(s1.ptr, s2.ptr, s1.length); 452 #else 453 return strncasecmp(s1.ptr, s2.ptr, s1.length); 454 #endif 455 } else if (s1.length > s2.length) { 456 return 1; 457 } else { 458 return -1; 459 } 460 } 461 462 int cx_strcmp_p( 463 const void *s1, 464 const void *s2 465 ) { 466 const cxstring *left = s1; 467 const cxstring *right = s2; 468 return cx_strcmp(*left, *right); 469 } 470 471 int cx_strcasecmp_p( 472 const void *s1, 473 const void *s2 474 ) { 475 const cxstring *left = s1; 476 const cxstring *right = s2; 477 return cx_strcasecmp(*left, *right); 478 } 479 480 cxmutstr cx_strdup_a( 481 const CxAllocator *allocator, 482 cxstring string 483 ) { 484 cxmutstr result = { 485 cxMalloc(allocator, string.length + 1), 486 string.length 487 }; 488 if (result.ptr == NULL) { 489 result.length = 0; 490 return result; 491 } 492 memcpy(result.ptr, string.ptr, string.length); 493 result.ptr[string.length] = '\0'; 494 return result; 495 } 496 497 cxstring cx_strtrim(cxstring string) { 498 cxstring result = string; 499 // TODO: optimize by comparing multiple bytes at once 500 while (result.length > 0 && isspace(*result.ptr)) { 501 result.ptr++; 502 result.length--; 503 } 504 while (result.length > 0 && isspace(result.ptr[result.length - 1])) { 505 result.length--; 506 } 507 return result; 508 } 509 510 cxmutstr cx_strtrim_m(cxmutstr string) { 511 cxstring result = cx_strtrim(cx_strcast(string)); 512 return (cxmutstr) {(char *) result.ptr, result.length}; 513 } 514 515 bool cx_strprefix( 516 cxstring string, 517 cxstring prefix 518 ) { 519 if (string.length < prefix.length) return false; 520 return memcmp(string.ptr, prefix.ptr, prefix.length) == 0; 521 } 522 523 bool cx_strsuffix( 524 cxstring string, 525 cxstring suffix 526 ) { 527 if (string.length < suffix.length) return false; 528 return memcmp(string.ptr + string.length - suffix.length, 529 suffix.ptr, suffix.length) == 0; 530 } 531 532 bool cx_strcaseprefix( 533 cxstring string, 534 cxstring prefix 535 ) { 536 if (string.length < prefix.length) return false; 537 #ifdef _WIN32 538 return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0; 539 #else 540 return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0; 541 #endif 542 } 543 544 bool cx_strcasesuffix( 545 cxstring string, 546 cxstring suffix 547 ) { 548 if (string.length < suffix.length) return false; 549 #ifdef _WIN32 550 return _strnicmp(string.ptr+string.length-suffix.length, 551 suffix.ptr, suffix.length) == 0; 552 #else 553 return strncasecmp(string.ptr + string.length - suffix.length, 554 suffix.ptr, suffix.length) == 0; 555 #endif 556 } 557 558 void cx_strlower(cxmutstr string) { 559 cx_for_n(i, string.length) { 560 string.ptr[i] = (char) tolower(string.ptr[i]); 561 } 562 } 563 564 void cx_strupper(cxmutstr string) { 565 cx_for_n(i, string.length) { 566 string.ptr[i] = (char) toupper(string.ptr[i]); 567 } 568 } 569 570 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE 571 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64 572 #endif 573 574 struct cx_strreplace_ibuf { 575 size_t *buf; 576 struct cx_strreplace_ibuf *next; 577 unsigned int len; 578 }; 579 580 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) { 581 while (buf) { 582 struct cx_strreplace_ibuf *next = buf->next; 583 free(buf->buf); 584 free(buf); 585 buf = next; 586 } 587 } 588 589 cxmutstr cx_strreplacen_a( 590 const CxAllocator *allocator, 591 cxstring str, 592 cxstring pattern, 593 cxstring replacement, 594 size_t replmax 595 ) { 596 597 if (pattern.length == 0 || pattern.length > str.length || replmax == 0) 598 return cx_strdup_a(allocator, str); 599 600 // Compute expected buffer length 601 size_t ibufmax = str.length / pattern.length; 602 size_t ibuflen = replmax < ibufmax ? replmax : ibufmax; 603 if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) { 604 ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE; 605 } 606 607 // Allocate first index buffer 608 struct cx_strreplace_ibuf *firstbuf, *curbuf; 609 firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf)); 610 if (!firstbuf) return cx_mutstrn(NULL, 0); 611 firstbuf->buf = calloc(ibuflen, sizeof(size_t)); 612 if (!firstbuf->buf) { 613 free(firstbuf); 614 return cx_mutstrn(NULL, 0); 615 } 616 617 // Search occurrences 618 cxstring searchstr = str; 619 size_t found = 0; 620 do { 621 cxstring match = cx_strstr(searchstr, pattern); 622 if (match.length > 0) { 623 // Allocate next buffer in chain, if required 624 if (curbuf->len == ibuflen) { 625 struct cx_strreplace_ibuf *nextbuf = 626 calloc(1, sizeof(struct cx_strreplace_ibuf)); 627 if (!nextbuf) { 628 cx_strrepl_free_ibuf(firstbuf); 629 return cx_mutstrn(NULL, 0); 630 } 631 nextbuf->buf = calloc(ibuflen, sizeof(size_t)); 632 if (!nextbuf->buf) { 633 free(nextbuf); 634 cx_strrepl_free_ibuf(firstbuf); 635 return cx_mutstrn(NULL, 0); 636 } 637 curbuf->next = nextbuf; 638 curbuf = nextbuf; 639 } 640 641 // Record match index 642 found++; 643 size_t idx = match.ptr - str.ptr; 644 curbuf->buf[curbuf->len++] = idx; 645 searchstr.ptr = match.ptr + pattern.length; 646 searchstr.length = str.length - idx - pattern.length; 647 } else { 648 break; 649 } 650 } while (searchstr.length > 0 && found < replmax); 651 652 // Allocate result string 653 cxmutstr result; 654 { 655 ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length; 656 size_t rcount = 0; 657 curbuf = firstbuf; 658 do { 659 rcount += curbuf->len; 660 curbuf = curbuf->next; 661 } while (curbuf); 662 result.length = str.length + rcount * adjlen; 663 result.ptr = cxMalloc(allocator, result.length + 1); 664 if (!result.ptr) { 665 cx_strrepl_free_ibuf(firstbuf); 666 return cx_mutstrn(NULL, 0); 667 } 668 } 669 670 // Build result string 671 curbuf = firstbuf; 672 size_t srcidx = 0; 673 char *destptr = result.ptr; 674 do { 675 for (size_t i = 0; i < curbuf->len; i++) { 676 // Copy source part up to next match 677 size_t idx = curbuf->buf[i]; 678 size_t srclen = idx - srcidx; 679 if (srclen > 0) { 680 memcpy(destptr, str.ptr + srcidx, srclen); 681 destptr += srclen; 682 srcidx += srclen; 683 } 684 685 // Copy the replacement and skip the source pattern 686 srcidx += pattern.length; 687 memcpy(destptr, replacement.ptr, replacement.length); 688 destptr += replacement.length; 689 } 690 curbuf = curbuf->next; 691 } while (curbuf); 692 memcpy(destptr, str.ptr + srcidx, str.length - srcidx); 693 694 // Result is guaranteed to be zero-terminated 695 result.ptr[result.length] = '\0'; 696 697 // Free index buffer 698 cx_strrepl_free_ibuf(firstbuf); 699 700 return result; 701 } 702 703 CxStrtokCtx cx_strtok( 704 cxstring str, 705 cxstring delim, 706 size_t limit 707 ) { 708 CxStrtokCtx ctx; 709 ctx.str = str; 710 ctx.delim = delim; 711 ctx.limit = limit; 712 ctx.pos = 0; 713 ctx.next_pos = 0; 714 ctx.delim_pos = 0; 715 ctx.found = 0; 716 ctx.delim_more = NULL; 717 ctx.delim_more_count = 0; 718 return ctx; 719 } 720 721 CxStrtokCtx cx_strtok_m( 722 cxmutstr str, 723 cxstring delim, 724 size_t limit 725 ) { 726 return cx_strtok(cx_strcast(str), delim, limit); 727 } 728 729 bool cx_strtok_next( 730 CxStrtokCtx *ctx, 731 cxstring *token 732 ) { 733 // abortion criteria 734 if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) { 735 return false; 736 } 737 738 // determine the search start 739 cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos); 740 741 // search the next delimiter 742 cxstring delim = cx_strstr(haystack, ctx->delim); 743 744 // if found, make delim capture exactly the delimiter 745 if (delim.length > 0) { 746 delim.length = ctx->delim.length; 747 } 748 749 // if more delimiters are specified, check them now 750 if (ctx->delim_more_count > 0) { 751 cx_for_n(i, ctx->delim_more_count) { 752 cxstring d = cx_strstr(haystack, ctx->delim_more[i]); 753 if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) { 754 delim.ptr = d.ptr; 755 delim.length = ctx->delim_more[i].length; 756 } 757 } 758 } 759 760 // store the token information and adjust the context 761 ctx->found++; 762 ctx->pos = ctx->next_pos; 763 token->ptr = &ctx->str.ptr[ctx->pos]; 764 ctx->delim_pos = delim.length == 0 ? 765 ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr); 766 token->length = ctx->delim_pos - ctx->pos; 767 ctx->next_pos = ctx->delim_pos + delim.length; 768 769 return true; 770 } 771 772 bool cx_strtok_next_m( 773 CxStrtokCtx *ctx, 774 cxmutstr *token 775 ) { 776 return cx_strtok_next(ctx, (cxstring *) token); 777 } 778 779 void cx_strtok_delim( 780 CxStrtokCtx *ctx, 781 const cxstring *delim, 782 size_t count 783 ) { 784 ctx->delim_more = delim; 785 ctx->delim_more_count = count; 786 } 787

UNIXworkcode

UNIXwork`code`