ucx/string.c

branch
newapi
changeset 178
7c3ff86ee9d4
parent 174
0358f1d9c506
child 253
087cc9216f28
equal deleted inserted replaced
177:e79a60b3a7cb 178:7c3ff86ee9d4
1 /*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3 *
4 * Copyright 2021 Mike Becker, Olaf Wintermann All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "cx/string.h"
30 #include "cx/utils.h"
31
32 #include <string.h>
33 #include <stdarg.h>
34 #include <ctype.h>
35
36 #ifndef _WIN32
37
38 #include <strings.h> // for strncasecmp()
39
40 #endif // _WIN32
41
42 cxmutstr cx_mutstr(char *cstring) {
43 return (cxmutstr) {cstring, strlen(cstring)};
44 }
45
46 cxmutstr cx_mutstrn(
47 char *cstring,
48 size_t length
49 ) {
50 return (cxmutstr) {cstring, length};
51 }
52
53 cxstring cx_str(const char *cstring) {
54 return (cxstring) {cstring, strlen(cstring)};
55 }
56
57 cxstring cx_strn(
58 const char *cstring,
59 size_t length
60 ) {
61 return (cxstring) {cstring, length};
62 }
63
64 cxstring cx_strcast(cxmutstr str) {
65 return (cxstring) {str.ptr, str.length};
66 }
67
68 void cx_strfree(cxmutstr *str) {
69 free(str->ptr);
70 str->ptr = NULL;
71 str->length = 0;
72 }
73
74 void cx_strfree_a(
75 CxAllocator const *alloc,
76 cxmutstr *str
77 ) {
78 cxFree(alloc, str->ptr);
79 str->ptr = NULL;
80 str->length = 0;
81 }
82
83 size_t cx_strlen(
84 size_t count,
85 ...
86 ) {
87 if (count == 0) return 0;
88
89 va_list ap;
90 va_start(ap, count);
91 size_t size = 0;
92 cx_for_n(i, count) {
93 cxstring str = va_arg(ap, cxstring);
94 size += str.length;
95 }
96 va_end(ap);
97
98 return size;
99 }
100
101 cxmutstr cx_strcat_ma(
102 CxAllocator const *alloc,
103 cxmutstr str,
104 size_t count,
105 ...
106 ) {
107 if (count == 0) return str;
108
109 cxstring *strings = calloc(count, sizeof(cxstring));
110 if (!strings) abort();
111
112 va_list ap;
113 va_start(ap, count);
114
115 // get all args and overall length
116 size_t slen = str.length;
117 cx_for_n(i, count) {
118 cxstring s = va_arg (ap, cxstring);
119 strings[i] = s;
120 slen += s.length;
121 }
122 va_end(ap);
123
124 // reallocate or create new string
125 if (str.ptr == NULL) {
126 str.ptr = cxMalloc(alloc, slen + 1);
127 } else {
128 str.ptr = cxRealloc(alloc, str.ptr, slen + 1);
129 }
130 if (str.ptr == NULL) abort();
131
132 // concatenate strings
133 size_t pos = str.length;
134 str.length = slen;
135 cx_for_n(i, count) {
136 cxstring s = strings[i];
137 memcpy(str.ptr + pos, s.ptr, s.length);
138 pos += s.length;
139 }
140
141 // terminate string
142 str.ptr[str.length] = '\0';
143
144 // free temporary array
145 free(strings);
146
147 return str;
148 }
149
150 cxstring cx_strsubs(
151 cxstring string,
152 size_t start
153 ) {
154 return cx_strsubsl(string, start, string.length - start);
155 }
156
157 cxmutstr cx_strsubs_m(
158 cxmutstr string,
159 size_t start
160 ) {
161 return cx_strsubsl_m(string, start, string.length - start);
162 }
163
164 cxstring cx_strsubsl(
165 cxstring string,
166 size_t start,
167 size_t length
168 ) {
169 if (start > string.length) {
170 return (cxstring) {NULL, 0};
171 }
172
173 size_t rem_len = string.length - start;
174 if (length > rem_len) {
175 length = rem_len;
176 }
177
178 return (cxstring) {string.ptr + start, length};
179 }
180
181 cxmutstr cx_strsubsl_m(
182 cxmutstr string,
183 size_t start,
184 size_t length
185 ) {
186 cxstring result = cx_strsubsl(cx_strcast(string), start, length);
187 return (cxmutstr) {(char *) result.ptr, result.length};
188 }
189
190 cxstring cx_strchr(
191 cxstring string,
192 int chr
193 ) {
194 chr = 0xFF & chr;
195 // TODO: improve by comparing multiple bytes at once
196 cx_for_n(i, string.length) {
197 if (string.ptr[i] == chr) {
198 return cx_strsubs(string, i);
199 }
200 }
201 return (cxstring) {NULL, 0};
202 }
203
204 cxmutstr cx_strchr_m(
205 cxmutstr string,
206 int chr
207 ) {
208 cxstring result = cx_strchr(cx_strcast(string), chr);
209 return (cxmutstr) {(char *) result.ptr, result.length};
210 }
211
212 cxstring cx_strrchr(
213 cxstring string,
214 int chr
215 ) {
216 chr = 0xFF & chr;
217 size_t i = string.length;
218 while (i > 0) {
219 i--;
220 // TODO: improve by comparing multiple bytes at once
221 if (string.ptr[i] == chr) {
222 return cx_strsubs(string, i);
223 }
224 }
225 return (cxstring) {NULL, 0};
226 }
227
228 cxmutstr cx_strrchr_m(
229 cxmutstr string,
230 int chr
231 ) {
232 cxstring result = cx_strrchr(cx_strcast(string), chr);
233 return (cxmutstr) {(char *) result.ptr, result.length};
234 }
235
236 #ifndef CX_STRSTR_SBO_SIZE
237 #define CX_STRSTR_SBO_SIZE 512
238 #endif
239
240 cxstring cx_strstr(
241 cxstring haystack,
242 cxstring needle
243 ) {
244 if (needle.length == 0) {
245 return haystack;
246 }
247
248 // optimize for single-char needles
249 if (needle.length == 1) {
250 return cx_strchr(haystack, *needle.ptr);
251 }
252
253 /*
254 * IMPORTANT:
255 * Our prefix table contains the prefix length PLUS ONE
256 * this is our decision, because we want to use the full range of size_t.
257 * The original algorithm needs a (-1) at one single place,
258 * and we want to avoid that.
259 */
260
261 // local prefix table
262 size_t s_prefix_table[CX_STRSTR_SBO_SIZE];
263
264 // check needle length and use appropriate prefix table
265 // if the pattern exceeds static prefix table, allocate on the heap
266 bool useheap = needle.length >= CX_STRSTR_SBO_SIZE;
267 register size_t *ptable = useheap ? calloc(needle.length + 1,
268 sizeof(size_t)) : s_prefix_table;
269
270 // keep counter in registers
271 register size_t i, j;
272
273 // fill prefix table
274 i = 0;
275 j = 0;
276 ptable[i] = j;
277 while (i < needle.length) {
278 while (j >= 1 && needle.ptr[j - 1] != needle.ptr[i]) {
279 j = ptable[j - 1];
280 }
281 i++;
282 j++;
283 ptable[i] = j;
284 }
285
286 // search
287 cxstring result = {NULL, 0};
288 i = 0;
289 j = 1;
290 while (i < haystack.length) {
291 while (j >= 1 && haystack.ptr[i] != needle.ptr[j - 1]) {
292 j = ptable[j - 1];
293 }
294 i++;
295 j++;
296 if (j - 1 == needle.length) {
297 size_t start = i - needle.length;
298 result.ptr = haystack.ptr + start;
299 result.length = haystack.length - start;
300 break;
301 }
302 }
303
304 // if prefix table was allocated on the heap, free it
305 if (ptable != s_prefix_table) {
306 free(ptable);
307 }
308
309 return result;
310 }
311
312 cxmutstr cx_strstr_m(
313 cxmutstr haystack,
314 cxstring needle
315 ) {
316 cxstring result = cx_strstr(cx_strcast(haystack), needle);
317 return (cxmutstr) {(char *) result.ptr, result.length};
318 }
319
320 size_t cx_strsplit(
321 cxstring string,
322 cxstring delim,
323 size_t limit,
324 cxstring *output
325 ) {
326 // special case: output limit is zero
327 if (limit == 0) return 0;
328
329 // special case: delimiter is empty
330 if (delim.length == 0) {
331 output[0] = string;
332 return 1;
333 }
334
335 // special cases: delimiter is at least as large as the string
336 if (delim.length >= string.length) {
337 // exact match
338 if (cx_strcmp(string, delim) == 0) {
339 output[0] = cx_strn(string.ptr, 0);
340 output[1] = cx_strn(string.ptr + string.length, 0);
341 return 2;
342 } else {
343 // no match possible
344 output[0] = string;
345 return 1;
346 }
347 }
348
349 size_t n = 0;
350 cxstring curpos = string;
351 while (1) {
352 ++n;
353 cxstring match = cx_strstr(curpos, delim);
354 if (match.length > 0) {
355 // is the limit reached?
356 if (n < limit) {
357 // copy the current string to the array
358 cxstring item = cx_strn(curpos.ptr, match.ptr - curpos.ptr);
359 output[n - 1] = item;
360 size_t processed = item.length + delim.length;
361 curpos.ptr += processed;
362 curpos.length -= processed;
363 } else {
364 // limit reached, copy the _full_ remaining string
365 output[n - 1] = curpos;
366 break;
367 }
368 } else {
369 // no more matches, copy last string
370 output[n - 1] = curpos;
371 break;
372 }
373 }
374
375 return n;
376 }
377
378 size_t cx_strsplit_a(
379 CxAllocator const *allocator,
380 cxstring string,
381 cxstring delim,
382 size_t limit,
383 cxstring **output
384 ) {
385 // find out how many splits we're going to make and allocate memory
386 size_t n = 0;
387 cxstring curpos = string;
388 while (1) {
389 ++n;
390 cxstring match = cx_strstr(curpos, delim);
391 if (match.length > 0) {
392 // is the limit reached?
393 if (n < limit) {
394 size_t processed = match.ptr - curpos.ptr + delim.length;
395 curpos.ptr += processed;
396 curpos.length -= processed;
397 } else {
398 // limit reached
399 break;
400 }
401 } else {
402 // no more matches
403 break;
404 }
405 }
406 *output = cxCalloc(allocator, n, sizeof(cxstring));
407 return cx_strsplit(string, delim, n, *output);
408 }
409
410 size_t cx_strsplit_m(
411 cxmutstr string,
412 cxstring delim,
413 size_t limit,
414 cxmutstr *output
415 ) {
416 return cx_strsplit(cx_strcast(string),
417 delim, limit, (cxstring *) output);
418 }
419
420 size_t cx_strsplit_ma(
421 CxAllocator const *allocator,
422 cxmutstr string,
423 cxstring delim,
424 size_t limit,
425 cxmutstr **output
426 ) {
427 return cx_strsplit_a(allocator, cx_strcast(string),
428 delim, limit, (cxstring **) output);
429 }
430
431 int cx_strcmp(
432 cxstring s1,
433 cxstring s2
434 ) {
435 if (s1.length == s2.length) {
436 return memcmp(s1.ptr, s2.ptr, s1.length);
437 } else if (s1.length > s2.length) {
438 return 1;
439 } else {
440 return -1;
441 }
442 }
443
444 int cx_strcasecmp(
445 cxstring s1,
446 cxstring s2
447 ) {
448 if (s1.length == s2.length) {
449 #ifdef _WIN32
450 return _strnicmp(s1.ptr, s2.ptr, s1.length);
451 #else
452 return strncasecmp(s1.ptr, s2.ptr, s1.length);
453 #endif
454 } else if (s1.length > s2.length) {
455 return 1;
456 } else {
457 return -1;
458 }
459 }
460
461 int cx_strcmp_p(
462 void const *s1,
463 void const *s2
464 ) {
465 cxstring const *left = s1;
466 cxstring const *right = s2;
467 return cx_strcmp(*left, *right);
468 }
469
470 int cx_strcasecmp_p(
471 void const *s1,
472 void const *s2
473 ) {
474 cxstring const *left = s1;
475 cxstring const *right = s2;
476 return cx_strcasecmp(*left, *right);
477 }
478
479 cxmutstr cx_strdup_a(
480 CxAllocator const *allocator,
481 cxstring string
482 ) {
483 cxmutstr result = {
484 cxMalloc(allocator, string.length + 1),
485 string.length
486 };
487 if (result.ptr == NULL) {
488 result.length = 0;
489 return result;
490 }
491 memcpy(result.ptr, string.ptr, string.length);
492 result.ptr[string.length] = '\0';
493 return result;
494 }
495
496 cxstring cx_strtrim(cxstring string) {
497 cxstring result = string;
498 // TODO: optimize by comparing multiple bytes at once
499 while (result.length > 0 && isspace(*result.ptr)) {
500 result.ptr++;
501 result.length--;
502 }
503 while (result.length > 0 && isspace(result.ptr[result.length - 1])) {
504 result.length--;
505 }
506 return result;
507 }
508
509 cxmutstr cx_strtrim_m(cxmutstr string) {
510 cxstring result = cx_strtrim(cx_strcast(string));
511 return (cxmutstr) {(char *) result.ptr, result.length};
512 }
513
514 bool cx_strprefix(
515 cxstring string,
516 cxstring prefix
517 ) {
518 if (string.length < prefix.length) return false;
519 return memcmp(string.ptr, prefix.ptr, prefix.length) == 0;
520 }
521
522 bool cx_strsuffix(
523 cxstring string,
524 cxstring suffix
525 ) {
526 if (string.length < suffix.length) return false;
527 return memcmp(string.ptr + string.length - suffix.length,
528 suffix.ptr, suffix.length) == 0;
529 }
530
531 bool cx_strcaseprefix(
532 cxstring string,
533 cxstring prefix
534 ) {
535 if (string.length < prefix.length) return false;
536 #ifdef _WIN32
537 return _strnicmp(string.ptr, prefix.ptr, prefix.length) == 0;
538 #else
539 return strncasecmp(string.ptr, prefix.ptr, prefix.length) == 0;
540 #endif
541 }
542
543 bool cx_strcasesuffix(
544 cxstring string,
545 cxstring suffix
546 ) {
547 if (string.length < suffix.length) return false;
548 #ifdef _WIN32
549 return _strnicmp(string.ptr+string.length-suffix.length,
550 suffix.ptr, suffix.length) == 0;
551 #else
552 return strncasecmp(string.ptr + string.length - suffix.length,
553 suffix.ptr, suffix.length) == 0;
554 #endif
555 }
556
557 void cx_strlower(cxmutstr string) {
558 cx_for_n(i, string.length) {
559 string.ptr[i] = (char) tolower(string.ptr[i]);
560 }
561 }
562
563 void cx_strupper(cxmutstr string) {
564 cx_for_n(i, string.length) {
565 string.ptr[i] = (char) toupper(string.ptr[i]);
566 }
567 }
568
569 #ifndef CX_STRREPLACE_INDEX_BUFFER_SIZE
570 #define CX_STRREPLACE_INDEX_BUFFER_SIZE 64
571 #endif
572
573 struct cx_strreplace_ibuf {
574 size_t *buf;
575 struct cx_strreplace_ibuf *next;
576 unsigned int len;
577 };
578
579 static void cx_strrepl_free_ibuf(struct cx_strreplace_ibuf *buf) {
580 while (buf) {
581 struct cx_strreplace_ibuf *next = buf->next;
582 free(buf->buf);
583 free(buf);
584 buf = next;
585 }
586 }
587
588 cxmutstr cx_strreplacen_a(
589 CxAllocator const *allocator,
590 cxstring str,
591 cxstring pattern,
592 cxstring replacement,
593 size_t replmax
594 ) {
595
596 if (pattern.length == 0 || pattern.length > str.length || replmax == 0)
597 return cx_strdup_a(allocator, str);
598
599 // Compute expected buffer length
600 size_t ibufmax = str.length / pattern.length;
601 size_t ibuflen = replmax < ibufmax ? replmax : ibufmax;
602 if (ibuflen > CX_STRREPLACE_INDEX_BUFFER_SIZE) {
603 ibuflen = CX_STRREPLACE_INDEX_BUFFER_SIZE;
604 }
605
606 // Allocate first index buffer
607 struct cx_strreplace_ibuf *firstbuf, *curbuf;
608 firstbuf = curbuf = calloc(1, sizeof(struct cx_strreplace_ibuf));
609 if (!firstbuf) return cx_mutstrn(NULL, 0);
610 firstbuf->buf = calloc(ibuflen, sizeof(size_t));
611 if (!firstbuf->buf) {
612 free(firstbuf);
613 return cx_mutstrn(NULL, 0);
614 }
615
616 // Search occurrences
617 cxstring searchstr = str;
618 size_t found = 0;
619 do {
620 cxstring match = cx_strstr(searchstr, pattern);
621 if (match.length > 0) {
622 // Allocate next buffer in chain, if required
623 if (curbuf->len == ibuflen) {
624 struct cx_strreplace_ibuf *nextbuf =
625 calloc(1, sizeof(struct cx_strreplace_ibuf));
626 if (!nextbuf) {
627 cx_strrepl_free_ibuf(firstbuf);
628 return cx_mutstrn(NULL, 0);
629 }
630 nextbuf->buf = calloc(ibuflen, sizeof(size_t));
631 if (!nextbuf->buf) {
632 free(nextbuf);
633 cx_strrepl_free_ibuf(firstbuf);
634 return cx_mutstrn(NULL, 0);
635 }
636 curbuf->next = nextbuf;
637 curbuf = nextbuf;
638 }
639
640 // Record match index
641 found++;
642 size_t idx = match.ptr - str.ptr;
643 curbuf->buf[curbuf->len++] = idx;
644 searchstr.ptr = match.ptr + pattern.length;
645 searchstr.length = str.length - idx - pattern.length;
646 } else {
647 break;
648 }
649 } while (searchstr.length > 0 && found < replmax);
650
651 // Allocate result string
652 cxmutstr result;
653 {
654 ssize_t adjlen = (ssize_t) replacement.length - (ssize_t) pattern.length;
655 size_t rcount = 0;
656 curbuf = firstbuf;
657 do {
658 rcount += curbuf->len;
659 curbuf = curbuf->next;
660 } while (curbuf);
661 result.length = str.length + rcount * adjlen;
662 result.ptr = cxMalloc(allocator, result.length + 1);
663 if (!result.ptr) {
664 cx_strrepl_free_ibuf(firstbuf);
665 return cx_mutstrn(NULL, 0);
666 }
667 }
668
669 // Build result string
670 curbuf = firstbuf;
671 size_t srcidx = 0;
672 char *destptr = result.ptr;
673 do {
674 for (size_t i = 0; i < curbuf->len; i++) {
675 // Copy source part up to next match
676 size_t idx = curbuf->buf[i];
677 size_t srclen = idx - srcidx;
678 if (srclen > 0) {
679 memcpy(destptr, str.ptr + srcidx, srclen);
680 destptr += srclen;
681 srcidx += srclen;
682 }
683
684 // Copy the replacement and skip the source pattern
685 srcidx += pattern.length;
686 memcpy(destptr, replacement.ptr, replacement.length);
687 destptr += replacement.length;
688 }
689 curbuf = curbuf->next;
690 } while (curbuf);
691 memcpy(destptr, str.ptr + srcidx, str.length - srcidx);
692
693 // Result is guaranteed to be zero-terminated
694 result.ptr[result.length] = '\0';
695
696 // Free index buffer
697 cx_strrepl_free_ibuf(firstbuf);
698
699 return result;
700 }
701
702 CxStrtokCtx cx_strtok(
703 cxstring str,
704 cxstring delim,
705 size_t limit
706 ) {
707 CxStrtokCtx ctx;
708 ctx.str = str;
709 ctx.delim = delim;
710 ctx.limit = limit;
711 ctx.pos = 0;
712 ctx.next_pos = 0;
713 ctx.delim_pos = 0;
714 ctx.found = 0;
715 ctx.delim_more = NULL;
716 ctx.delim_more_count = 0;
717 return ctx;
718 }
719
720 CxStrtokCtx cx_strtok_m(
721 cxmutstr str,
722 cxstring delim,
723 size_t limit
724 ) {
725 return cx_strtok(cx_strcast(str), delim, limit);
726 }
727
728 bool cx_strtok_next(
729 CxStrtokCtx *ctx,
730 cxstring *token
731 ) {
732 // abortion criteria
733 if (ctx->found >= ctx->limit || ctx->delim_pos >= ctx->str.length) {
734 return false;
735 }
736
737 // determine the search start
738 cxstring haystack = cx_strsubs(ctx->str, ctx->next_pos);
739
740 // search the next delimiter
741 cxstring delim = cx_strstr(haystack, ctx->delim);
742
743 // if found, make delim capture exactly the delimiter
744 if (delim.length > 0) {
745 delim.length = ctx->delim.length;
746 }
747
748 // if more delimiters are specified, check them now
749 if (ctx->delim_more_count > 0) {
750 cx_for_n(i, ctx->delim_more_count) {
751 cxstring d = cx_strstr(haystack, ctx->delim_more[i]);
752 if (d.length > 0 && (delim.length == 0 || d.ptr < delim.ptr)) {
753 delim.ptr = d.ptr;
754 delim.length = ctx->delim_more[i].length;
755 }
756 }
757 }
758
759 // store the token information and adjust the context
760 ctx->found++;
761 ctx->pos = ctx->next_pos;
762 token->ptr = &ctx->str.ptr[ctx->pos];
763 ctx->delim_pos = delim.length == 0 ?
764 ctx->str.length : (size_t) (delim.ptr - ctx->str.ptr);
765 token->length = ctx->delim_pos - ctx->pos;
766 ctx->next_pos = ctx->delim_pos + delim.length;
767
768 return true;
769 }
770
771 bool cx_strtok_next_m(
772 CxStrtokCtx *ctx,
773 cxmutstr *token
774 ) {
775 return cx_strtok_next(ctx, (cxstring *) token);
776 }
777
778 void cx_strtok_delim(
779 CxStrtokCtx *ctx,
780 cxstring const *delim,
781 size_t count
782 ) {
783 ctx->delim_more = delim;
784 ctx->delim_more_count = count;
785 }

mercurial