string.c - UNIXwork Code

1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 #include "cx/string.h" 30 #include "cx/utils.h" 31 32 #include <string.h> 33 #include <stdarg.h> 34 #include <ctype.h> 35 36 #ifndef _WIN32 37 38 #include <strings.h> // for strncasecmp() 39 40 #endif // _WIN32 41 42 cxmutstr cx_mutstr(char *cstring) { 43 return (cxmutstr) {cstring, strlen(cstring)}; 44 } 45 46 cxmutstr cx_mutstrn( 47 char *cstring, 48 size_t length 49 ) { 50 return (cxmutstr) {cstring, length}; 51 } 52 53 cxstring cx_str(const char *cstring) { 54 return (cxstring) {cstring, strlen(cstring)}; 55 } 56 57 cxstring cx_strn( 58 const char *cstring, 59 size_t length 60 ) { 61 return (cxstring) {cstring, length}; 62 } 63 64 cxstring cx_strcast(cxmutstr str) { 65 return (cxstring) {str.ptr, str.length}; 66 } 67 68 void cx_strfree(cxmutstr *str) { 69 free(str->ptr); 70 str->ptr = NULL; 71 str->length = 0; 72 } 73 74 void cx_strfree_a( 75 CxAllocator const *alloc, 76 cxmutstr *str 77 ) { 78 cxFree(alloc, str->ptr); 79 str->ptr = NULL; 80 str->length = 0; 81 } 82 83 size_t cx_strlen( 84 size_t count, 85 ... 86 ) { 87 if (count == 0) return 0; 88 89 va_list ap; 90 va_start(ap, count); 91 size_t size = 0; 92 cx_for_n(i, count) { 93 cxstring str = va_arg(ap, cxstring); 94 size += str.length; 95 } 96 va_end(ap); 97 98 return size; 99 } 100 101 cxmutstr cx_strcat_ma( 102 CxAllocator const *alloc, 103 cxmutstr str, 104 size_t count, 105 ... 106 ) { 107 if (count == 0) return str; 108 109 cxstring *strings = calloc(count, sizeof(cxstring)); 110 if (!strings) abort(); 111 112 va_list ap; 113 va_start(ap, count); 114 115 // get all args and overall length 116 size_t slen = str.length; 117 cx_for_n(i, count) { 118 cxstring s = va_arg (ap, cxstring); 119 strings[i] = s; 120 slen += s.length; 121 } 122 va_end(ap); 123 124 // reallocate or create new string 125 if (str.ptr == NULL) { 126 str.ptr = cxMalloc(alloc, slen + 1); 127 } else { 128 str.ptr = cxRealloc(alloc, str.ptr, slen + 1); 129 } 130 if (str.ptr == NULL) abort(); 131 132 // concatenate strings 133 size_t pos = str.length; 134 str.length = slen; 135 cx_for_n(i, count) { 136 cxstring s = strings[i]; 137 memcpy(str.ptr + pos, s.ptr, s.length); 138 pos += s.length; 139 } 140 141 // terminate string 142 str.ptr[str.length] = '\0'; 143 144 // free temporary array 145 free(strings); 146 147 return str; 148 } 149 150 cxstring cx_strsubs( 151 cxstring string, 152 size_t start 153 ) { 154 return cx_strsubsl(string, start, string.length - start); 155 } 156 157 cxmutstr cx_strsubs_m( 158 cxmutstr string, 159 size_t start 160 ) { 161 return cx_strsubsl_m(string, start, string.length - start); 162 } 163 164 cxstring cx_strsubsl( 165 cxstring string, 166 size_t start, 167 size_t length 168 ) { 169 if (start > string.length) { 170 return (cxstring) {NULL, 0}; 171 } 172 173 size_t rem_len = string.length - start; 174 if (length > rem_len) { 175 length = rem_len; 176 } 177 178 return (cxstring) {string.ptr + start, length}; 179 } 180 181 cxmutstr cx_strsubsl_m( 182 cxmutstr string, 183 size_t start, 184 size_t length 185 ) { 186 cxstring result = cx_strsubsl(cx_strcast(string), start, length); 187 return (cxmutstr) {(char *) result.ptr, result.length}; 188 } 189 190 cxstring cx_strchr( 191 cxstring string, 192 int chr 193 ) { 194 chr = 0xFF & chr; 195 // TODO: improve by comparing multiple bytes at once 196 cx_for_n(i, string.length) { 197 if (string.ptr[i] == chr) { 198 return cx_strsubs(string, i); 199 } 200 } 201 return (cxstring) {NULL, 0}; 202 } 203 204 cxmutstr cx_strchr_m( 205 cxmutstr string, 206 int chr 207 ) { 208 cxstring result = cx_strchr(cx_strcast(string), chr); 209 return (cxmutstr) {(char *) result.ptr, result.length}; 210 } 211 212 cxstring cx_strrchr( 213 cxstring string, 214 int chr 215 ) { 216 chr = 0xFF & chr; 217 size_t i = string.length; 218 while (i > 0) { 219 i--; 220 // TODO: improve by comparing multiple bytes at once 221 if (string.ptr[i] == chr) { 222 return cx_strsubs(string, i); 223 } 224 } 225 return (cxstring) {NULL, 0}; 226 } 227 228 cxmutstr cx_strrchr_m( 229 cxmutstr string, 230 int chr 231 ) { 232 cxstring result = cx_strrchr(cx_strcast(string), chr); 233 return (cxmutstr) {(char *) result.ptr, result.length}; 234 } 235 236 #ifndef CX_STRSTR_SBO_SIZE 237 #define CX_STRSTR_SBO_SIZE 512 238 #endif 239 240 cxstring cx_strstr( 241 cxstring haystack, 242 cxstring needle 243 ) { 244 if (needle.length == 0) { 245 return haystack; 246 } 247 248 // optimize for single-char needles 249 if (needle.length == 1) { 250 return cx_strchr(haystack, *needle.ptr); 251 } 252 253 /* 254 * IMPORTANT: 255 * Our prefix table contains the prefix length PLUS ONE 256 * this is our decision, because we want to use the full range of size_t. 257 * The original algorithm needs a (-1) at one single place, 258 * and we want to avoid that. 259 */ 260 261 // local prefix table 262 size_t s_prefix_table[CX_STRSTR_SBO_SIZE]; 263 264 // check needle length and use appropriate prefix table 265 // if the pattern exceeds static prefix table, allocate on the heap 266 bool useheap = needle.length >= CX_STRSTR_SBO_SIZE; 267 register size_t *ptable = useheap ? calloc(needle.length + 1, 268 sizeof(size_t)) : s_prefix_table; 269 270 // keep counter in registers 271 register size_t i, j; 272 273 // fill prefix table 274 i = 0; 275 j = 0; 276 ptable[i] = j; 277 while (i < needle.length) { 278 while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) { 279 j = ptable[j - 1]; 280 } 281 i++; 282 j++; 283 ptable[i] = j; 284 } 285 286 // search 287 cxstring result = {NULL, 0}; 288 i = 0; 289 j = 1; 290 while (i < haystack.length) { 291 while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) { 292 j = ptable[j - 1]; 293 } 294 i++; 295 j++; 296 if (j - 1 == needle.length) { 297 size_t start = i - needle.length; 298 result.ptr = haystack.ptr + start; 299 result.length = haystack.length - start; 300 break; 301 } 302 } 303 304 // if prefix table was allocated on the heap, free it 305 if (ptable != s_prefix_table) { 306 free(ptable); 307 } 308 309 return result; 310 } 311 312 cxmutstr cx_strstr_m( 313 cxmutstr haystack, 314 cxstring needle 315 ) { 316 cxstring result = cx_strstr(cx_strcast(haystack), needle); 317 return (cxmutstr) {(char *) result.ptr, result.length}; 318 } 319 320 size_t cx_strsplit( 321 cxstring string, 322 cxstring delim, 323 size_t limit, 324 cxstring *output 325 ) { 326 // special case: output limit is zero 327 if (limit == 0) return 0; 328 329 // special case: delimiter is empty 330 if (delim.length == 0) { 331 output[0] = string; 332 return 1; 333 } 334 335 // special cases: delimiter is at least as large as the string 336 if (delim.length >= string.length) { 337 // exact match 338 if (cx_strcmp(string, delim) == 0) { 339 output[0] = cx_strn(string.ptr, 0); 340 output[1] = cx_strn(string.ptr + string.length, 0); 341 return 2; 342 } else { 343 // no match possible 344 output[0] = string; 345 return 1; 346 } 347 } 348 349 size_t n = 0; 350 cxstring curpos = string; 351 while (1) { 352 ++n; 353 cxstring match = cx_strstr(curpos, delim); 354 if (match.length > 0) { 355 // is the limit reached? 356 if (n < limit) { 357 // copy the current string to the array 358 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr); 359 output[n - 1] = item; 360 size_t processed = item.length + delim.length; 361 curpos.ptr += processed; 362 curpos.length -= processed; 363 } else { 364 // limit reached, copy the _full_ remaining string 365 output[n - 1] = curpos; 366 break; 367 } 368 } else { 369 // no more matches, copy last string 370 output[n - 1] = curpos; 371 break; 372 } 373 } 374 375 return n; 376 } 377 378 size_t cx_strsplit_a( 379 CxAllocator const *allocator, 380 cxstring string, 381 cxstring delim, 382 size_t limit, 383 cxstring **output 384 ) { 385 // find out how many splits we're going to make and allocate memory 386 size_t n = 0; 387 cxstring curpos = string; 388 while (1) { 389 ++n; 390 cxstring match = cx_strstr(curpos, delim); 391 if (match.length > 0) { 392 // is the limit reached? 393 if (n < limit) { 394 size_t processed = match.ptr - curpos.ptr + delim.length; 395 curpos.ptr += processed; 396 curpos.length -= processed; 397 } else { 398 // limit reached 399 break; 400 } 401 } else { 402 // no more matches 403 break; 404 } 405 } 406 *output = cxCalloc(allocator, n, sizeof(cxstring)); 407 return cx_strsplit(string, delim, n, *output); 408 } 409 410 size_t cx_strsplit_m( 411 cxmutstr string, 412 cxstring delim, 413 size_t limit, 414 cxmutstr *output 415 ) { 416 return cx_strsplit(cx_strcast(string), 417 delim, limit, (cxstring *) output); 418 } 419 420 size_t cx_strsplit_ma( 421 CxAllocator const *allocator, 422 cxmutstr string, 423 cxstring delim, 424 size_t limit, 425 cxmutstr **output 426 ) { 427 return cx_strsplit_a(allocator, cx_strcast(string), 428 delim, limit, (cxstring **) output); 429 } 430 431 int cx_strcmp( 432 cxstring s1, 433 cxstring s2 434 ) { 435 if (s1.length == s2.length) { 436 return memcmp(s1.ptr, s2.ptr, s1.length); 437 } else if (s1.length > s2.length) { 438 return 1; 439 } else { 440 return -1; 441 } 442 } 443 444 int cx_strcasecmp( 445 cxstring s1, 446 cxstring s2 447 ) { 448 if (s1.length == s2.length) { 449 #ifdef _WIN32 450 return _strnicmp(s1.ptr, s2.ptr, s1.length); 451 #else 452 return strncasecmp(s1.ptr, s2.ptr, s1.length); 453 #endif 454 } else if (s1.length > s2.length) { 455 return 1; 456 } else { 457 return -1; 458 } 459 } 460 461 int cx_strcmp_p( 462 void const *s1, 463 void const *s2 464 ) { 465 cxstring const *left = s1; 466 cxstring const *right = s2; 467 return cx_strcmp(*left, *right); 468 } 469 470 int cx_strcasecmp_p( 471 void const *s1, 472 void const *s2 473 ) { 474 cxstring const *left = s1; 475 cxstring const *right = s2; 476 return cx_strcasecmp(*left, *right); 477 } 478 479 cxmutstr cx_strdup_a( 480 CxAllocator const *allocator, 481 cxstring string 482 ) { 483 cxmutstr result = { 484 cxMalloc(allocator, string.length + 1), 485 string.length 486 }; 487 if (result.ptr == NULL) { 488 result.length = 0; 489 return result; 490 } 491 memcpy(result.ptr, string.ptr, string.length); 492 result.ptr[string.length] = '\0'; 493 return result; 494 } 495 496 cxstring cx_strtrim(cxstring string) { 497 cxstring result = string; 498 // TODO: optimize by comparing multiple bytes at once 499 while (result.length > 0 && isspace(*result.ptr)) { 500 result.ptr++; 501 result.length--; 502 } 503 while (result.length > 0 && isspace(result.ptr[result.length - 1])) { 504 result.length--; 505 } 506 return result; 507 } 508 509 cxmutstr cx_strtrim_m(cxmutstr string) { 510 cxstring result = cx_strtrim(cx_strcast(string)); 511 return (cxmutstr) {(char *) result.ptr, result.length}; 512 } 513 514 bool cx_strprefix( 515 cxstring string, 516 cxstring prefix 517 ) { 518 if (string.length < prefix.length) return false; 519 return memcmp(string.ptr, prefix.ptr, prefix.length) == 0; 520 } 521 522 bool cx_strsuffix( 523 cxstring string, 524 cxstring suffix 525 ) { 526 if (string.length < suffix.length) return false; 527 return memcmp(string.ptr + string.length - suffix.length, 528 suffix.ptr, suffix.length) == 0; 529 } 530 531 bool cx_strcaseprefix( 532 cxstring string, 533 cxstring prefix 534 ) { 535 if (string.length < prefix.length) return false; 536 #ifdef _WIN32 537 return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0; 538 #else 539 return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0; 540 #endif 541 } 542 543 bool cx_strcasesuffix( 544 cxstring string, 545 cxstring suffix 546 ) { 547 if (string.length < suffix.length) return false; 548 #ifdef _WIN32 549 return _strnicmp(string.ptr+string.length-suffix.length, 550 suffix.ptr, suffix.length) == 0; 551 #else 552 return strncasecmp(string.ptr + string.length - suffix.length, 553 suffix.ptr, suffix.length) == 0; 554 #endif 555 } 556 557 void cx_strlower(cxmutstr string) { 558 cx_for_n(i, string.length) { 559 string.ptr[i] = (char) tolower(string.ptr[i]); 560 } 561 } 562 563 void cx_strupper(cxmutstr string) { 564 cx_for_n(i, string.length) { 565 string.ptr[i] = (char) toupper(string.ptr[i]); 566 } 567 } 568 569 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE 570 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64 571 #endif 572 573 struct cx_strreplace_ibuf { 574 size_t *buf; 575 struct cx_strreplace_ibuf *next; 576 unsigned int len; 577 }; 578 579 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) { 580 while (buf) { 581 struct cx_strreplace_ibuf *next = buf->next; 582 free(buf->buf); 583 free(buf); 584 buf = next; 585 } 586 } 587 588 cxmutstr cx_strreplacen_a( 589 CxAllocator const *allocator, 590 cxstring str, 591 cxstring pattern, 592 cxstring replacement, 593 size_t replmax 594 ) { 595 596 if (pattern.length == 0 || pattern.length > str.length || replmax == 0) 597 return cx_strdup_a(allocator, str); 598 599 // Compute expected buffer length 600 size_t ibufmax = str.length / pattern.length; 601 size_t ibuflen = replmax < ibufmax ? replmax : ibufmax; 602 if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) { 603 ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE; 604 } 605 606 // Allocate first index buffer 607 struct cx_strreplace_ibuf *firstbuf, *curbuf; 608 firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf)); 609 if (!firstbuf) return cx_mutstrn(NULL, 0); 610 firstbuf->buf = calloc(ibuflen, sizeof(size_t)); 611 if (!firstbuf->buf) { 612 free(firstbuf); 613 return cx_mutstrn(NULL, 0); 614 } 615 616 // Search occurrences 617 cxstring searchstr = str; 618 size_t found = 0; 619 do { 620 cxstring match = cx_strstr(searchstr, pattern); 621 if (match.length > 0) { 622 // Allocate next buffer in chain, if required 623 if (curbuf->len == ibuflen) { 624 struct cx_strreplace_ibuf *nextbuf = 625 calloc(1, sizeof(struct cx_strreplace_ibuf)); 626 if (!nextbuf) { 627 cx_strrepl_free_ibuf(firstbuf); 628 return cx_mutstrn(NULL, 0); 629 } 630 nextbuf->buf = calloc(ibuflen, sizeof(size_t)); 631 if (!nextbuf->buf) { 632 free(nextbuf); 633 cx_strrepl_free_ibuf(firstbuf); 634 return cx_mutstrn(NULL, 0); 635 } 636 curbuf->next = nextbuf; 637 curbuf = nextbuf; 638 } 639 640 // Record match index 641 found++; 642 size_t idx = match.ptr - str.ptr; 643 curbuf->buf[curbuf->len++] = idx; 644 searchstr.ptr = match.ptr + pattern.length; 645 searchstr.length = str.length - idx - pattern.length; 646 } else { 647 break; 648 } 649 } while (searchstr.length > 0 && found < replmax); 650 651 // Allocate result string 652 cxmutstr result; 653 { 654 ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length; 655 size_t rcount = 0; 656 curbuf = firstbuf; 657 do { 658 rcount += curbuf->len; 659 curbuf = curbuf->next; 660 } while (curbuf); 661 result.length = str.length + rcount * adjlen; 662 result.ptr = cxMalloc(allocator, result.length + 1); 663 if (!result.ptr) { 664 cx_strrepl_free_ibuf(firstbuf); 665 return cx_mutstrn(NULL, 0); 666 } 667 } 668 669 // Build result string 670 curbuf = firstbuf; 671 size_t srcidx = 0; 672 char *destptr = result.ptr; 673 do { 674 for (size_t i = 0; i < curbuf->len; i++) { 675 // Copy source part up to next match 676 size_t idx = curbuf->buf[i]; 677 size_t srclen = idx - srcidx; 678 if (srclen > 0) { 679 memcpy(destptr, str.ptr + srcidx, srclen); 680 destptr += srclen; 681 srcidx += srclen; 682 } 683 684 // Copy the replacement and skip the source pattern 685 srcidx += pattern.length; 686 memcpy(destptr, replacement.ptr, replacement.length); 687 destptr += replacement.length; 688 } 689 curbuf = curbuf->next; 690 } while (curbuf); 691 memcpy(destptr, str.ptr + srcidx, str.length - srcidx); 692 693 // Result is guaranteed to be zero-terminated 694 result.ptr[result.length] = '\0'; 695 696 // Free index buffer 697 cx_strrepl_free_ibuf(firstbuf); 698 699 return result; 700 } 701 702 CxStrtokCtx cx_strtok( 703 cxstring str, 704 cxstring delim, 705 size_t limit 706 ) { 707 CxStrtokCtx ctx; 708 ctx.str = str; 709 ctx.delim = delim; 710 ctx.limit = limit; 711 ctx.pos = 0; 712 ctx.next_pos = 0; 713 ctx.delim_pos = 0; 714 ctx.found = 0; 715 ctx.delim_more = NULL; 716 ctx.delim_more_count = 0; 717 return ctx; 718 } 719 720 CxStrtokCtx cx_strtok_m( 721 cxmutstr str, 722 cxstring delim, 723 size_t limit 724 ) { 725 return cx_strtok(cx_strcast(str), delim, limit); 726 } 727 728 bool cx_strtok_next( 729 CxStrtokCtx *ctx, 730 cxstring *token 731 ) { 732 // abortion criteria 733 if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) { 734 return false; 735 } 736 737 // determine the search start 738 cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos); 739 740 // search the next delimiter 741 cxstring delim = cx_strstr(haystack, ctx->delim); 742 743 // if found, make delim capture exactly the delimiter 744 if (delim.length > 0) { 745 delim.length = ctx->delim.length; 746 } 747 748 // if more delimiters are specified, check them now 749 if (ctx->delim_more_count > 0) { 750 cx_for_n(i, ctx->delim_more_count) { 751 cxstring d = cx_strstr(haystack, ctx->delim_more[i]); 752 if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) { 753 delim.ptr = d.ptr; 754 delim.length = ctx->delim_more[i].length; 755 } 756 } 757 } 758 759 // store the token information and adjust the context 760 ctx->found++; 761 ctx->pos = ctx->next_pos; 762 token->ptr = &ctx->str.ptr[ctx->pos]; 763 ctx->delim_pos = delim.length == 0 ? 764 ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr); 765 token->length = ctx->delim_pos - ctx->pos; 766 ctx->next_pos = ctx->delim_pos + delim.length; 767 768 return true; 769 } 770 771 bool cx_strtok_next_m( 772 CxStrtokCtx *ctx, 773 cxmutstr *token 774 ) { 775 return cx_strtok_next(ctx, (cxstring *) token); 776 } 777 778 void cx_strtok_delim( 779 CxStrtokCtx *ctx, 780 cxstring const *delim, 781 size_t count 782 ) { 783 ctx->delim_more = delim; 784 ctx->delim_more_count = count; 785 } 786

UNIXworkcode

UNIXwork`code`