UNIXworkcode

1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright 2024 Mike Becker - All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 #include "ec_glob.h" 30 31 #include <stdlib.h> 32 #include <string.h> 33 #include <errno.h> 34 35 #include <regex.h> 36 37 struct numpair_s { 38 long min; 39 long max; 40 }; 41 42 struct ec_glob_re { 43 char *str; 44 unsigned len; 45 unsigned capacity; 46 }; 47 48 #ifndef EC_GLOB_STACK_CAPACITY 49 #define EC_GLOB_STACK_CAPACITY 64 50 #endif 51 52 static void ec_glob_pattern_increase_capacity(struct ec_glob_re *re) { 53 unsigned newcap = re->capacity * 2; 54 char *newmem; 55 if (re->capacity == EC_GLOB_STACK_CAPACITY) { 56 newmem = malloc(newcap); 57 if (newmem == NULL) abort(); 58 memcpy(newmem, re->str, re->len); 59 } else { 60 newmem = realloc(re->str, newcap); 61 if (newmem == NULL) abort(); 62 } 63 re->capacity = newcap; 64 re->str = newmem; 65 } 66 67 #define ec_glob_pattern_ensure_capacity(re, n) \ 68 if (re.len + n > re.capacity) ec_glob_pattern_increase_capacity(&re) 69 70 #define ec_glob_cats(re, s) \ 71 ec_glob_pattern_ensure_capacity(re, sizeof(s) - 1); \ 72 memcpy((re).str + (re).len, s, sizeof(s) - 1); (re).len += sizeof(s)-1 73 74 #define ec_glob_catc(re, c) \ 75 ec_glob_pattern_ensure_capacity(re, 1); (re).str[(re).len++] = (c) 76 77 78 int ec_glob(const char *pattern, const char *string) { 79 char stack[EC_GLOB_STACK_CAPACITY]; 80 struct ec_glob_re re_pattern = { 81 stack, 1, EC_GLOB_STACK_CAPACITY 82 }; 83 re_pattern.str[0] = '^'; 84 85 unsigned scanidx = 0; 86 unsigned inputlen = strlen(pattern); 87 char c; 88 89 // maintain information about braces 90 const unsigned brace_stack_size = 32; 91 char brace_stack[brace_stack_size]; 92 int depth_brace = 0; 93 _Bool braces_valid = 1; 94 95 // first, check if braces are syntactically valid 96 for (unsigned i = 0 ; i < inputlen ; i++) { 97 // skip potentially escaped braces 98 if (pattern[i] == '\\') { 99 i++; 100 } else if (pattern[i] == '{') { 101 depth_brace++; 102 } else if (pattern[i] == '}') { 103 if (depth_brace > 0) { 104 depth_brace--; 105 } else { 106 braces_valid = 0; 107 break; 108 } 109 } 110 } 111 if (depth_brace > 0) { 112 braces_valid = 0; 113 depth_brace = 0; 114 } 115 116 // prepare what we think is more than enough memory for matches 117 const unsigned numrange_max = 32; 118 unsigned numrange_grp_count = 0; 119 unsigned numrange_grp_idx[numrange_max]; 120 struct numpair_s numrange_pairs[numrange_max]; 121 regmatch_t numrange_matches[numrange_max]; 122 123 // initialize first group number with zero 124 // and increment whenever we create a new group 125 numrange_grp_idx[0] = 0; 126 127 // now translate the editorconfig pattern to a POSIX regular expression 128 while (scanidx < inputlen) { 129 c = pattern[scanidx++]; 130 131 // escape 132 if (c == '\\') { 133 if (strchr("?{}[]*\\-,", pattern[scanidx]) != NULL) { 134 // also escape in regex when required 135 if (strchr("?{}[]*\\", pattern[scanidx]) != NULL) { 136 ec_glob_catc(re_pattern, '\\'); 137 } 138 c = pattern[scanidx++]; 139 ec_glob_catc(re_pattern, c); 140 } 141 // otherwise, it's just a backslash 142 else { 143 ec_glob_cats(re_pattern, "\\\\"); 144 } 145 } 146 // wildcard 147 else if (c == '*') { 148 // is double-wildcard? 149 if (pattern[scanidx] == '*') { 150 scanidx++; 151 // check for collapsible slashes 152 if (pattern[scanidx] == '/' && scanidx >= 3 153 && pattern[scanidx - 3] == '/') { 154 // the collapsible slash is simply discarded 155 scanidx++; 156 } 157 ec_glob_cats(re_pattern, ".*"); 158 } else { 159 ec_glob_cats(re_pattern, "[^/]*"); 160 } 161 } 162 // arbitrary character 163 else if (c == '?') { 164 ec_glob_catc(re_pattern, '.'); 165 } 166 // start of alternatives 167 else if (c == '{') { 168 // if braces are not syntactically valid, treat them as literals 169 if (!braces_valid) { 170 ec_glob_cats(re_pattern, "\\{"); 171 continue; 172 } 173 174 // check brace stack 175 depth_brace++; 176 if (depth_brace > brace_stack_size) { 177 // treat brace literally when stacked too many of them 178 ec_glob_cats(re_pattern, "\\{"); 179 continue; 180 } 181 182 // check if {single} or {num1..num2} 183 _Bool single = 1; 184 _Bool dotdot = strchr("+-0123456789", pattern[scanidx]) != NULL; 185 _Bool dotdot_seen = 0; 186 for (unsigned fw = scanidx; fw < inputlen; fw++) { 187 if (pattern[fw] == ',') { 188 single = 0; 189 dotdot = 0; 190 break; 191 } 192 else if (pattern[fw] == '}') { 193 // check if this is a {num1..num2} pattern 194 if (dotdot) { 195 _Bool ok = 1; 196 unsigned ngc = numrange_grp_count; 197 char *chk; 198 errno = 0; 199 numrange_pairs[ngc].min = strtol( 200 &pattern[scanidx], &chk, 10); 201 ok &= *chk == '.' && 0 == errno; 202 numrange_pairs[ngc].max = strtol( 203 strrchr(&pattern[scanidx], '.')+1, &chk, 10); 204 ok &= *chk == '}' && 0 == errno; 205 if (ok) { 206 // a dotdot is not a single 207 single = 0; 208 // skip this subpattern later on 209 scanidx = fw+1; 210 } else { 211 // not ok, we could not parse the numbers 212 dotdot = 0; 213 } 214 } 215 break; 216 } else if (dotdot) { 217 // check for dotdot separator 218 if (pattern[fw] == '.') { 219 if (!dotdot_seen && 220 fw + 2 < inputlen && pattern[fw + 1] == '.' && 221 strchr("+-0123456789", pattern[fw + 2]) != NULL) { 222 fw += 2; 223 dotdot_seen = 1; 224 } else { 225 dotdot = 0; 226 } 227 } 228 // everything must be a digit, otherwise 229 else if (!strchr("0123456789", pattern[fw])) { 230 dotdot = 0; 231 } 232 } 233 } 234 235 if (single) { 236 // push literal brace 237 ec_glob_cats(re_pattern, "\\{"); 238 brace_stack[depth_brace-1] = '}'; 239 } else { 240 // open choice and push parenthesis 241 ec_glob_catc(re_pattern, '('); 242 243 // increase the current group number 244 numrange_grp_idx[numrange_grp_count]++; 245 246 if (dotdot) { 247 // add the number matching pattern 248 ec_glob_cats(re_pattern, "[-+]?[0-9]+)"); 249 // increase group counter and initialize 250 // next index with current group number 251 numrange_grp_count++; 252 if (numrange_grp_count < numrange_max) { 253 numrange_grp_idx[numrange_grp_count] = 254 numrange_grp_idx[numrange_grp_count - 1]; 255 } 256 // we already took care of the closing brace 257 depth_brace--; 258 } else { 259 // remember that we need to close the choice eventually 260 brace_stack[depth_brace - 1] = ')'; 261 } 262 } 263 } 264 // end of alternatives 265 else if (depth_brace > 0 && c == '}') { 266 depth_brace--; 267 if (depth_brace < brace_stack_size 268 && brace_stack[depth_brace] == ')') { 269 ec_glob_catc(re_pattern, ')'); 270 } else { 271 ec_glob_cats(re_pattern, "\\}"); 272 } 273 } 274 // separator of alternatives 275 else if (depth_brace > 0 && c == ',') { 276 ec_glob_catc(re_pattern, '|'); 277 } 278 // brackets 279 else if (c == '[') { 280 // check if we have a corresponding closing bracket 281 _Bool valid = 0; 282 _Bool closing_bracket_literal = 0; 283 unsigned newidx; 284 for (unsigned fw = scanidx ; fw < inputlen ; fw++) { 285 if (pattern[fw] == ']') { 286 // only terminating if it's not the first char 287 if (fw == scanidx) { 288 // otherwise, auto-escaped 289 closing_bracket_literal = 1; 290 } else { 291 valid = 1; 292 newidx = fw+1; 293 break; 294 } 295 } else if (pattern[fw] == '/') { 296 // special (undocumented) case: slash breaks 297 // https://github.com/editorconfig/editorconfig/issues/499 298 break; 299 } else if (pattern[fw] == '\\') { 300 // skip escaped characters as usual 301 fw++; 302 closing_bracket_literal |= pattern[fw] == ']'; 303 } 304 } 305 306 307 if (valid) { 308 // first of all, check, if the sequence is negated 309 if (pattern[scanidx] == '!') { 310 scanidx++; 311 ec_glob_cats(re_pattern, "[^"); 312 } else { 313 ec_glob_catc(re_pattern, '['); 314 } 315 316 // if we have a closing bracket as literal, it must appear first 317 if (closing_bracket_literal) { 318 ec_glob_catc(re_pattern, ']'); 319 // but if the minus operator wanted to be there 320 // we need to move it to the end 321 if (pattern[scanidx] == '-') { 322 scanidx++; 323 } 324 } 325 326 // everything within brackets is treated as a literal character 327 // we have to parse them one by one, though, because we might 328 // need to escape regex-relevant stuff 329 for (unsigned fw = scanidx ; ; fw++) { 330 if (pattern[fw] == '\\') { 331 // skip to next char 332 continue; 333 } 334 // check for terminating bracket 335 else if (pattern[fw] == ']') { 336 if (fw > scanidx && pattern[fw-1] != '\\') { 337 break; 338 } 339 } 340 // include literal character 341 else { 342 if (strchr(".(){}[]", pattern[fw]) != NULL) { 343 ec_glob_catc(re_pattern, '\\'); 344 } 345 ec_glob_catc(re_pattern, pattern[fw]); 346 } 347 } 348 349 // did we promise the minus a seat in the last row? 350 if (pattern[scanidx-1] == '-') { 351 ec_glob_cats(re_pattern, "-]"); 352 } else { 353 ec_glob_catc(re_pattern, ']'); 354 } 355 scanidx = newidx; 356 } else { 357 // literal bracket 358 ec_glob_cats(re_pattern, "\\["); 359 } 360 } 361 // escape special chars 362 else if (strchr(".(){}[]", c) != NULL) { 363 ec_glob_catc(re_pattern, '\\'); 364 ec_glob_catc(re_pattern, c); 365 } 366 // literal (includes path separators) 367 else { 368 ec_glob_catc(re_pattern, c); 369 } 370 } 371 372 // terminate the regular expression 373 ec_glob_catc(re_pattern, '$'); 374 ec_glob_catc(re_pattern, '\0'); 375 376 377 // compile pattern and execute matching 378 regex_t re; 379 int status; 380 int flags = REG_EXTENDED; 381 382 // when we don't have a num-pattern, don't capture anything 383 if (numrange_grp_count == 0) { 384 flags |= REG_NOSUB; 385 } 386 387 if ((status = regcomp(&re, re_pattern.str, flags)) == 0) { 388 status = regexec(&re, string, numrange_max, numrange_matches, 0); 389 390 // check num ranges 391 for (unsigned i = 0 ; status == 0 && i < numrange_grp_count ; i++) { 392 regmatch_t nm = numrange_matches[numrange_grp_idx[i]]; 393 int nmlen = nm.rm_eo-nm.rm_so; 394 char *nmatch = malloc(nmlen+1); 395 memcpy(nmatch, string+nm.rm_so, nmlen); 396 nmatch[nmlen] = '\0'; 397 errno = 0; 398 char *chk; 399 long num = strtol(nmatch, &chk, 10); 400 if (*chk == '\0' && 0 == errno) { 401 // check if the matched number is within the range 402 status |= !(numrange_pairs[i].min <= num 403 && num <= numrange_pairs[i].max); 404 } else { 405 // number not processable, return error 406 status = 1; 407 } 408 free(nmatch); 409 } 410 regfree(&re); 411 } 412 413 if (re_pattern.capacity > EC_GLOB_STACK_CAPACITY) { 414 free(re_pattern.str); 415 } 416 417 return status; 418 } 419