1 /* 2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 tre-parse.c - Regexp parser 31 */ 32 33 /* 34 This parser is just a simple recursive descent parser for POSIX.2 35 regexps. The parser supports both the obsolete default syntax and 36 the "extended" syntax, and some nonstandard extensions. 37 */ 38 39 #include <assert.h> 40 #include <errno.h> 41 #include <limits.h> 42 #include <stddef.h> 43 #include <string.h> 44 45 #include "malloc.h" 46 #include "tre-mem.h" 47 #include "tre-ast.h" 48 #include "tre-stack.h" 49 #include "tre-parse.h" 50 51 #include "../locale/collate.h" 52 53 /* BSD compatibility: 54 Before looking up a collating symbol, check if the name matches in 55 the character names (cnames) array; if so, use the corresponding 56 character. 57 58 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve 59 the implementation choice that for ERE, a non-numeric character following 60 a left brace that would normally be a bound, causes the left brace to be 61 literal. */ 62 #define BSD_COMPATIBILITY 63 #ifdef BSD_COMPATIBILITY 64 #include "cname.h" 65 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 66 #endif /* BSD_COMPATIBILITY */ 67 68 /* Characters with special meanings in regexp syntax. */ 69 #define CHAR_PIPE L'|' 70 #define CHAR_LPAREN L'(' 71 #define CHAR_RPAREN L')' 72 #define CHAR_LBRACE L'{' 73 #define CHAR_RBRACE L'}' 74 #define CHAR_LBRACKET L'[' 75 #define CHAR_RBRACKET L']' 76 #define CHAR_MINUS L'-' 77 #define CHAR_STAR L'*' 78 #define CHAR_QUESTIONMARK L'?' 79 #define CHAR_PLUS L'+' 80 #define CHAR_PERIOD L'.' 81 #define CHAR_COLON L':' 82 #define CHAR_EQUAL L'=' 83 #define CHAR_COMMA L',' 84 #define CHAR_CARET L'^' 85 #define CHAR_DOLLAR L'$' 86 #define CHAR_BACKSLASH L'\\' 87 #define CHAR_HASH L'#' 88 #define CHAR_TILDE L'~' 89 90 /* Some macros for expanding \w, \s, etc. */ 91 static const struct tre_macro_struct { 92 const char c; 93 const char *expansion; 94 } tre_macros[] = 95 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, 96 {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, 97 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, 98 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, 99 { 0, NULL } 100 }; 101 102 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which 103 must have at least `len' items. Sets buf[0] to zero if the there 104 is no match in `tre_macros'. */ 105 static void 106 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, 107 tre_char_t *buf, size_t buf_len) 108 { 109 int i; 110 111 buf[0] = 0; 112 if (regex >= regex_end) 113 return; 114 115 for (i = 0; tre_macros[i].expansion; i++) 116 { 117 if (tre_macros[i].c == *regex) 118 { 119 unsigned int j; 120 DPRINT(("Expanding macro '%c' => '%s'\n", 121 tre_macros[i].c, tre_macros[i].expansion)); 122 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++) 123 buf[j] = tre_macros[i].expansion[j]; 124 buf[j] = 0; 125 break; 126 } 127 } 128 } 129 130 static reg_errcode_t 131 tre_new_item(tre_mem_t mem, int type, int val, int *max_i, 132 tre_bracket_match_list_t **items) 133 { 134 reg_errcode_t status = REG_OK; 135 tre_bracket_match_list_t *array = *items; 136 int i = array->num_bracket_matches; 137 /* Allocate more space if necessary. */ 138 if (i >= *max_i) 139 { 140 tre_bracket_match_list_t *new_items; 141 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i)); 142 /* If the array is already 1024 items large, give up -- there's 143 probably an error in the regexp (e.g. not a '\0' terminated 144 string and missing ']') */ 145 if (*max_i >= 1024) 146 return REG_ESPACE; 147 *max_i *= 2; 148 new_items = realloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i)); 149 if (new_items == NULL) 150 return REG_ESPACE; 151 *items = array = new_items; 152 } 153 array->bracket_matches[i].type = type; 154 array->bracket_matches[i].value = val; 155 array->num_bracket_matches++; 156 return status; 157 } 158 159 #define REST(re) (int)(ctx->re_end - (re)), (re) 160 161 #define START_COLLATING_SYMBOLS 16 162 #define MAX_COLLATING_SYMBOL_LEN 4 163 164 typedef struct { 165 const tre_char_t *start; 166 int len; 167 } tre_collating_symbol; 168 169 #ifdef BSD_COMPATIBILITY 170 static wchar_t 171 tre_search_cnames(const wchar_t *name, size_t len) 172 { 173 size_t low = 0; 174 size_t high = NCNAMES - 1; 175 size_t cur; 176 int cmp; 177 178 while(low <= high) 179 { 180 cur = (low + high) / 2; 181 cmp = wcsncmp(name, cnames[cur].name, len); 182 if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code; 183 if (cmp > 0) low = cur + 1; 184 else high = cur - 1; 185 } 186 return (wchar_t)-1; 187 } 188 #endif /* BSD_COMPATIBILITY */ 189 190 /* Scan the contents of a bracket expression, and create a 191 * tre_bracket_match_list_t encoding the bracket expression. If during 192 * the scan, multi-character collating symbols are detected, switch 193 * into a mode to collect those MCCSs into a tre_collating_symbol 194 * list and pass them back. tre_parse_bracket will use that to 195 * create a new string composed of a union of the bracket expression 196 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and 197 * call tre_parse (recursive) to parse that new string (which will 198 * call tre_parse_bracket and tre_parse_bracket_items again. */ 199 static reg_errcode_t 200 tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items, 201 int *items_size, tre_collating_symbol **result) 202 { 203 const tre_char_t *re = ctx->re; 204 const tre_char_t *re_end = ctx->re_end; 205 tre_collating_symbol *col_syms = NULL; 206 tre_collating_symbol *cp = NULL; 207 int n_col_syms = 0; 208 reg_errcode_t status; 209 int max_i = *items_size; 210 int other = 0; /* contains content other than multi-character collating 211 * symbols */ 212 int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */ 213 tre_cint_t min, c; 214 int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE); 215 int collect_MCCS = 0; 216 const tre_char_t *start; 217 218 for ( ;re < re_end; re++) 219 { 220 switch (*re) 221 { 222 case CHAR_MINUS: 223 /* A first hyphen */ 224 if (re == ctx->re) 225 { 226 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 227 min = CHAR_MINUS; 228 other++; 229 range = 0; 230 break; 231 } 232 /* The hyphen is the end range */ 233 if (range > 0) 234 { 235 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 236 c = CHAR_MINUS; 237 goto process_end_range; 238 } 239 if (re + 1 >= re_end) 240 { 241 status = REG_EBRACK; 242 goto error; 243 } 244 /* The hyphen is at the end */ 245 if (re[1] == CHAR_RBRACKET) 246 { 247 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 248 c = CHAR_MINUS; 249 goto process_begin_range; 250 } 251 /* Two ranges are not allowed to share an endpoint, or begin 252 * range is illegal. */ 253 if (range < 0) 254 { 255 status = REG_ERANGE; 256 goto error; 257 } 258 range = 1; /* Expect end range */ 259 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); 260 break; 261 262 case CHAR_LBRACKET: 263 if (re + 1 >= re_end) 264 { 265 status = REG_EBRACK; 266 goto error; 267 } 268 switch (re[1]) 269 { 270 case CHAR_PERIOD: 271 { 272 re += 2; 273 start = re; 274 for (;; re++) 275 { 276 if (re >= re_end) 277 { 278 status = REG_ECOLLATE; 279 goto error; 280 } 281 if (*re == CHAR_PERIOD) 282 { 283 if (re + 1 >= re_end) 284 { 285 status = REG_ECOLLATE; 286 goto error; 287 } 288 /* Found end */ 289 if (re[1] == CHAR_RBRACKET) 290 { 291 DPRINT(("tre_parse_bracket: collating " 292 "symbol: '%.*" STRF "'\n", 293 REST(start - 2))); 294 /* Empty name */ 295 if (re == start) 296 { 297 status = REG_ECOLLATE; 298 goto error; 299 } 300 #ifdef BSD_COMPATIBILITY 301 /* Check if the name is in cnames; if so, use 302 the corresponding code */ 303 c = tre_search_cnames(start, re - start); 304 if (c != (wchar_t)-1) 305 { 306 re++; 307 goto process_single_character; 308 } 309 #endif /* BSD_COMPATIBILITY */ 310 /* Verify this is a known sequence */ 311 if (__collate_equiv_value(ctx->loc, start, 312 re - start) <= 0) 313 { 314 status = REG_ECOLLATE; 315 goto error; 316 } 317 /* Process single character collating symbols */ 318 if (re - start == 1) 319 { 320 c = *start; 321 re++; 322 goto process_single_character; 323 } 324 /* Inverted MCCSs are undefined */ 325 if (invert) 326 { 327 status = REG_ECOLLATE; 328 goto error; 329 } 330 /* Can't have MCCSs as an endpoint to a range */ 331 if (range > 0) 332 { 333 status = REG_ERANGE; 334 goto error; 335 } 336 range = -1; 337 /* Switch into MCCS collection mode (if not 338 * already there */ 339 #if TRE_DEBUG 340 if (!collect_MCCS) 341 { 342 collect_MCCS = 1; 343 DPRINT(("tre_parse_bracket: Detected MCCS\n")); 344 } 345 #else /* !TRE_DEBUG */ 346 collect_MCCS = 1; 347 #endif /* !TRE_DEBUG */ 348 /* Allocate a memory block the first time */ 349 if (!cp) 350 { 351 if ((col_syms = malloc(sizeof(*col_syms) * 352 (START_COLLATING_SYMBOLS + 2))) 353 == NULL) 354 return REG_ESPACE; 355 cp = col_syms + 1; 356 n_col_syms = START_COLLATING_SYMBOLS; 357 } 358 /* Enlarge the memory block is more is needed */ 359 if ((cp - col_syms) - 1 >= n_col_syms) 360 { 361 int i = n_col_syms; 362 tre_collating_symbol *tmp = 363 realloc(col_syms, sizeof(*col_syms) * 364 ((n_col_syms *= 2) + 2)); 365 if (tmp == NULL) 366 { 367 free(col_syms); 368 return REG_ESPACE; 369 } 370 DPRINT(("tre_list_collating_symbols: " 371 "Enlarging col_syms to %d\n", 372 n_col_syms)); 373 col_syms = tmp; 374 cp = col_syms + i + 1; 375 } 376 cp->start = start; 377 cp->len = re - start; 378 cp++; 379 re++; 380 break; 381 } 382 } 383 } 384 break; 385 } 386 387 case CHAR_EQUAL: 388 case CHAR_COLON: 389 { 390 /* Process equivalence and character classes */ 391 tre_char_t kind = re[1]; 392 393 /* Can't have a class as an endpoint to a range */ 394 if (range > 0) 395 { 396 status = REG_ERANGE; 397 goto error; 398 } 399 if (!collect_MCCS && range == 0) 400 { 401 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, 402 min, &max_i, items); 403 if (status != REG_OK) 404 goto error; 405 } 406 range = -1; 407 re += 2; 408 start = re; 409 for (;; re++) 410 { 411 if (re >= re_end) 412 { 413 status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE; 414 goto error; 415 } 416 if (*re == kind) 417 { 418 if (re + 1 >= re_end) 419 { 420 status = kind == CHAR_EQUAL ? REG_ECOLLATE : 421 REG_ECTYPE; 422 goto error; 423 } 424 /* Found end */ 425 if (re[1] == CHAR_RBRACKET) 426 { 427 if (re == start) 428 { 429 /* Empty class name */ 430 status = kind == CHAR_EQUAL ? REG_ECOLLATE : 431 REG_ECTYPE; 432 goto error; 433 } 434 /* Process equivalence class */ 435 if (kind == CHAR_EQUAL) 436 { 437 int equiv; 438 439 DPRINT(("tre_parse_bracket: equivalence: '%.*" 440 STRF "'\n", REST(start - 2))); 441 442 /* While we find the collation value even for 443 multi-character collating elements , we 444 don't (yet) match any collation values 445 against multi-character sequences. We'd have 446 to enumerate those multi-character sequences 447 and like multi-character collating symbols, 448 create a union of those sequences with the 449 rest of the bracket expression. While 450 doable, a bracket expression matching 451 multiple characters, that doesn't explicitly 452 contain multi-character sequences, might 453 be unexpected, so we punt for now. */ 454 if ((equiv = __collate_equiv_value(ctx->loc, 455 start, re - start)) <= 0) 456 { 457 /* The standard says that if no collating 458 element if found, we use the collating 459 symbol itself. But __collate_equiv_value 460 doesn't make a distinction between 461 an element that is in a equvalence 462 class with others, or is the only member, 463 so we already know there is no collating 464 symbol. (Note that in the case of a 465 collating element whose collation value 466 is unique, matching against the 467 collating element itself, or against 468 its collation value, is equivalent.) */ 469 #ifdef BSD_COMPATIBILITY 470 /* Check if the name is in cnames; if so, 471 use the corresponding code */ 472 c = tre_search_cnames(start, re - start); 473 if (c != (wchar_t)-1) 474 { 475 re++; 476 goto process_single_character; 477 } 478 #endif /* BSD_COMPATIBILITY */ 479 status = REG_ECOLLATE; 480 goto error; 481 } 482 if (!collect_MCCS) 483 { 484 status = tre_new_item(ctx->mem, 485 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, 486 equiv, &max_i, items); 487 if (status != REG_OK) 488 goto error; 489 } 490 } 491 else 492 { 493 /* Process character class */ 494 DPRINT(("tre_parse_bracket: class: '%.*" STRF 495 "'\n", REST(start - 2))); 496 if (!collect_MCCS) 497 { 498 char tmp_str[64]; 499 tre_ctype_t class; 500 int len = MIN(re - start, 63); 501 { 502 tre_char_t tmp_wcs[64]; 503 wcsncpy(tmp_wcs, start, (size_t)len); 504 tmp_wcs[len] = L'\0'; 505 { 506 mbstate_t state; 507 const tre_char_t *src = tmp_wcs; 508 memset(&state, '\0', sizeof(state)); 509 len = wcsrtombs_l(tmp_str, &src, 510 sizeof(tmp_str), &state, 511 ctx->loc); 512 } 513 } 514 tmp_str[len] = '\0'; 515 DPRINT((" class name: %s\n", tmp_str)); 516 class = tre_ctype_l(tmp_str, ctx->loc); 517 if (!class) 518 { 519 status = REG_ECTYPE; 520 goto error; 521 } 522 status = tre_new_item(ctx->mem, 523 TRE_BRACKET_MATCH_TYPE_CLASS, 524 class, &max_i, items); 525 if (status != REG_OK) 526 goto error; 527 } 528 } 529 re++; 530 break; 531 } 532 } 533 } 534 other++; 535 break; 536 } 537 538 default: 539 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 540 c = CHAR_LBRACKET; 541 goto process_single_character; 542 break; 543 } 544 break; 545 546 case CHAR_RBRACKET: 547 /* A first right bracket */ 548 if (re == ctx->re) 549 { 550 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 551 min = CHAR_RBRACKET; 552 range = 0; 553 other++; 554 break; 555 } 556 /* Done */ 557 if (collect_MCCS) 558 { 559 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", 560 REST(re))); 561 if (col_syms) 562 { 563 /* Mark the character following the right bracket. Set len 564 * to whether there are other things besides the 565 * multi-character collating symbols */ 566 col_syms->start = re + 1; 567 col_syms->len = other; 568 /* Mark the end of the list */ 569 cp->start = NULL; 570 } 571 *result = col_syms; 572 return REG_OK; 573 } 574 /* range > 0 is not possible, since we did a lookahead after the 575 * hyphen */ 576 if (range == 0) 577 { 578 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, 579 min, &max_i, items); 580 if (status != REG_OK) 581 goto error; 582 } 583 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); 584 *items_size = max_i; 585 ctx->re = re + 1; 586 return REG_OK; 587 588 default: 589 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); 590 c = *re; 591 process_single_character: 592 /* Process single character */ 593 if (range > 0) 594 { 595 int mine, maxe; 596 597 process_end_range: 598 /* Get collation equivalence values */ 599 mine = __collate_equiv_value(ctx->loc, &min, 1); 600 maxe = __collate_equiv_value(ctx->loc, &c, 1); 601 if (maxe < mine) 602 { 603 status = REG_ERANGE; 604 goto error; 605 } 606 if (!collect_MCCS) 607 { 608 status = tre_new_item(ctx->mem, 609 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, 610 mine, &max_i, items); 611 if (status != REG_OK) 612 goto error; 613 status = tre_new_item(ctx->mem, 614 TRE_BRACKET_MATCH_TYPE_RANGE_END, 615 maxe, &max_i, items); 616 if (status != REG_OK) 617 goto error; 618 } 619 range = -1; 620 } 621 else 622 { 623 process_begin_range: 624 if (!collect_MCCS) 625 { 626 if (range == 0) 627 { 628 status = tre_new_item(ctx->mem, 629 TRE_BRACKET_MATCH_TYPE_CHAR, 630 min, &max_i, items); 631 if (status != REG_OK) 632 goto error; 633 } 634 min = c; 635 } 636 range = 0; 637 } 638 other++; 639 break; 640 } 641 } 642 status = REG_EBRACK; 643 error: 644 DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n", 645 REST(re), status)); 646 if (col_syms) 647 free(col_syms); 648 return status; 649 } 650 651 #ifdef TRE_DEBUG 652 static const char *bracket_match_type_str[] = { 653 "unused", 654 "char", 655 "range begin", 656 "range end", 657 "class", 658 "equivalence value", 659 }; 660 #endif /* TRE_DEBUG */ 661 662 static reg_errcode_t 663 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 664 { 665 tre_ast_node_t *node; 666 reg_errcode_t status = REG_OK; 667 tre_bracket_match_list_t *items; 668 int max_i = 32; 669 tre_collating_symbol *col_syms = NULL; 670 671 /* Handle special cases [[:<:]] and [[:>:]] */ 672 if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET 673 && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>') 674 && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET 675 && ctx->re[5] == CHAR_RBRACKET) 676 { 677 *result = tre_ast_new_literal(ctx->mem, ASSERTION, 678 (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW, 679 -1); 680 DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ? 681 "[[:<:]]" : "[[:>:]]")); 682 ctx->re += 6; 683 return *result ? REG_OK : REG_ESPACE; 684 } 685 686 /* Start off with an array of `max_i' elements. */ 687 items = calloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i)); 688 if (items == NULL) 689 return REG_ESPACE; 690 691 if (*ctx->re == CHAR_CARET) 692 { 693 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); 694 items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE; 695 ctx->re++; 696 } 697 698 status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms); 699 700 if (status != REG_OK) 701 goto parse_bracket_done; 702 703 /* If there are collating symbols, split off the multi-character ones 704 * into a union of the bracket expression (without the collating symbols) 705 * and the multiple-character sequences. We create an equivalent input 706 * string and run tre_parse() recursively */ 707 if (col_syms) 708 { 709 tre_char_t *str, *sp; 710 tre_collating_symbol *cp; 711 tre_parse_ctx_t subctx; 712 713 /* Allocate a new string. We start with the size of the original 714 * bracket expression (minus 1) and add 2 (for a leading "[" and 715 * a trailing nil; don't need a "^", since it is illegal to have 716 * inverted MCCSs). Since a multi-character collating symbols 717 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't 718 * need to worry about the new string getting too long. */ 719 free(items); 720 str = malloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2)); 721 if (str == NULL) 722 { 723 free(col_syms); 724 return REG_ESPACE; 725 } 726 sp = str; 727 if (col_syms->len > 0) 728 { 729 /* There are other items in the bracket expression besides the 730 * multi-character collating symbols, so create a new bracket 731 * expression with only those other itmes. */ 732 const tre_char_t *re; 733 ptrdiff_t i; 734 735 *sp++ = '['; 736 re = ctx->re; 737 for (cp = col_syms + 1; cp->start; cp++) 738 { 739 /* The "- 2" is to account for the "[." */ 740 if ((i = ((cp->start - re) - 2)) > 0) 741 { 742 memcpy(sp, re, sizeof(*sp) * i); 743 sp += i; 744 } 745 /* The "+ 2" is to account for the ".]" */ 746 re = cp->start + cp->len + 2; 747 } 748 i = col_syms->start - re; /* Includes the trailing right bracket */ 749 memcpy(sp, re, sizeof(*sp) * i); 750 sp += i; 751 *sp++ = '|'; 752 } 753 for (cp = col_syms + 1; cp->start; cp++) 754 { 755 memcpy(sp, cp->start, sizeof(*sp) * cp->len); 756 sp += cp->len; 757 if (cp[1].start) 758 *sp++ = '|'; 759 } 760 *sp = 0; 761 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n", 762 str)); 763 764 memcpy(&subctx, ctx, sizeof(subctx)); 765 subctx.re = str; 766 subctx.len = sp - str; 767 subctx.nofirstsub = 1; 768 subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */ 769 status = tre_parse(&subctx); 770 free(str); 771 if (status != REG_OK) 772 { 773 free(col_syms); 774 return status; 775 } 776 ctx->re = col_syms->start; 777 ctx->position = subctx.position; 778 free(col_syms); 779 *result = subctx.result; 780 DPRINT(("tre_parse_bracket: Returning to original string\n")); 781 return REG_OK; 782 } 783 784 DPRINT(("tre_parse_bracket: creating bracket expression literal\n")); 785 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position); 786 if (node == NULL) 787 { 788 status = REG_ESPACE; 789 goto parse_bracket_done; 790 } 791 else 792 { 793 tre_literal_t *l = node->obj; 794 l->u.bracket_match_list = tre_mem_alloc(ctx->mem, 795 SIZEOF_BRACKET_MATCH_LIST(items)); 796 if (l->u.bracket_match_list == NULL) 797 { 798 status = REG_ESPACE; 799 goto parse_bracket_done; 800 } 801 memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items)); 802 } 803 804 #ifdef TRE_DEBUG 805 { 806 int i; 807 tre_bracket_match_t *b; 808 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n", 809 items->num_bracket_matches, items->flags)); 810 for (i = 0, b = items->bracket_matches; 811 i < items->num_bracket_matches; i++, b++) 812 { 813 DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type], 814 b->value)); 815 } 816 } 817 #endif /* TRE_DEBUG */ 818 819 parse_bracket_done: 820 free(items); 821 ctx->position++; 822 *result = node; 823 return status; 824 } 825 826 /* Parses a positive decimal integer. Returns -1 if the string does not 827 contain a valid number. */ 828 static int 829 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) 830 { 831 int num; 832 tre_char_t *endptr; 833 834 errno = 0; 835 num = (int)wcstoul(*regex, (wchar_t **)&endptr, 10); 836 if (errno != 0) 837 return -1; 838 *regex = endptr; 839 return num; 840 } 841 842 static reg_errcode_t 843 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) 844 { 845 int min, max; 846 const tre_char_t *r = ctx->re; 847 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 848 849 /* Parse number (minimum repetition count). */ 850 min = -1; 851 if (r >= ctx->re_end) 852 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 853 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE; 854 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 855 return REG_EBRACE; 856 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 857 if (*r >= L'0' && *r <= L'9') { 858 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); 859 min = tre_parse_int(&r, ctx->re_end); 860 if (min == -1) 861 return REG_BADBR; 862 } 863 else 864 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 865 /* For ERE, return REG_NOMATCH to signal that the lbrace should 866 be treated as a literal */ 867 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR; 868 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 869 return REG_BADBR; 870 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 871 872 /* Parse comma and second number (maximum repetition count). */ 873 max = min; 874 if (r < ctx->re_end && *r == CHAR_COMMA) 875 { 876 r++; 877 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); 878 max = tre_parse_int(&r, ctx->re_end); 879 } 880 881 /* Check that the repeat counts are sane. */ 882 if ((max > 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX) 883 return REG_BADBR; 884 885 /*{*//* Missing }. */ 886 if (r >= ctx->re_end) 887 return REG_EBRACE; 888 889 /* Empty contents of {}. */ 890 if (r == ctx->re) 891 return REG_BADBR; 892 893 /* Parse the ending '}' or '\}'.*/ 894 if (ctx->cflags & REG_EXTENDED) 895 { 896 if (r >= ctx->re_end || *r != CHAR_RBRACE) 897 return REG_BADBR; 898 r++; 899 /* Parse trailing '?' marking minimal repetition. */ 900 if (r < ctx->re_end) 901 { 902 if (*r == CHAR_QUESTIONMARK) 903 { 904 /* Process the question mark only in enhanced mode. 905 Otherwise, the question mark is an error in ERE 906 or a literal in BRE */ 907 if (ctx->cflags & REG_ENHANCED) 908 { 909 minimal = !(ctx->cflags & REG_UNGREEDY); 910 r++; 911 } 912 else return REG_BADRPT; 913 } 914 else if (*r == CHAR_STAR || *r == CHAR_PLUS) 915 { 916 /* These are reserved for future extensions. */ 917 return REG_BADRPT; 918 } 919 } 920 } 921 else 922 { 923 if (r + 1 >= ctx->re_end 924 || *r != CHAR_BACKSLASH 925 || *(r + 1) != CHAR_RBRACE) 926 return REG_BADBR; 927 r += 2; 928 if (r < ctx->re_end && *r == CHAR_STAR) 929 { 930 /* This is reserved for future extensions. */ 931 return REG_BADRPT; 932 } 933 } 934 935 if (minimal) 936 ctx->num_reorder_tags++; 937 938 if (!result) goto parse_bound_exit; 939 /* Create the AST node(s). */ 940 /* Originally, if min == 0 && max == 0, we immediately replace the whole 941 iteration with EMPTY. This unfortunately drops any submatches, and 942 messes up setting the pmatch values (we can get tags of -1, and 943 tag values in the billions). So we leave it and process this case as 944 usual, and wait until tre_expand_ast() to replace with EMPTY */ 945 946 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); 947 if (!*result) 948 return REG_ESPACE; 949 950 parse_bound_exit: 951 DPRINT(("tre_parse_bound: min %d, max %d\n", min, max)); 952 953 ctx->re = r; 954 return REG_OK; 955 } 956 957 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for 958 non-self-contained options, like (?i), this causes ((?i)fu)bar to be 959 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect. 960 Because we now set up tags for even non-capturing parenthesized 961 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we 962 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and 963 have it restore cflags after the subexpression, we don't need to have 964 a separate PARSE_RESTORE_CFLAGS, and then after processing the 965 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE. 966 This has the side-benefit of now matching the perl behavior: the RE 967 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior 968 of foo AND (?i) (bar OR zap). */ 969 typedef enum { 970 PARSE_RE = 0, 971 PARSE_ATOM, 972 PARSE_MARK_FOR_SUBMATCH, 973 PARSE_BRANCH, 974 PARSE_PIECE, 975 PARSE_CATENATION, 976 PARSE_POST_CATENATION, 977 PARSE_UNION, 978 PARSE_POST_UNION, 979 PARSE_POSTFIX, 980 } tre_parse_re_stack_symbol_t; 981 982 reg_errcode_t 983 tre_parse(tre_parse_ctx_t *ctx) 984 { 985 tre_ast_node_t *result = NULL; 986 tre_parse_re_stack_symbol_t symbol; 987 reg_errcode_t status = REG_OK; 988 tre_stack_t *stack = ctx->stack; 989 int bottom = tre_stack_num_objects(stack); 990 int depth = 0; 991 int temporary_cflags = 0; 992 int bre_branch_begin; 993 #ifdef TRE_DEBUG 994 const tre_char_t *tmp_re; 995 #endif 996 997 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n", 998 ctx->len, ctx->re, ctx->len, ctx->cflags)); 999 1000 if (ctx->len <= 0) return REG_EMPTY; 1001 if (!ctx->nofirstsub) 1002 { 1003 STACK_PUSH(stack, int, ctx->cflags); 1004 STACK_PUSH(stack, int, ctx->submatch_id); 1005 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); 1006 ctx->submatch_id++; 1007 } 1008 STACK_PUSH(stack, int, 0); // bre_branch_begin 1009 STACK_PUSH(stack, int, PARSE_RE); 1010 ctx->re_start = ctx->re; 1011 ctx->re_end = ctx->re + ctx->len; 1012 1013 /* The following is basically just a recursive descent parser. I use 1014 an explicit stack instead of recursive functions mostly because of 1015 two reasons: compatibility with systems which have an overflowable 1016 call stack, and efficiency (both in lines of code and speed). */ 1017 while (tre_stack_num_objects(stack) > bottom) 1018 { 1019 symbol = tre_stack_pop_int(stack); 1020 switch (symbol) 1021 { 1022 case PARSE_RE: 1023 /* Parse a full regexp. A regexp is one or more branches, 1024 separated by the union operator `|'. */ 1025 bre_branch_begin = tre_stack_pop_int(stack); 1026 if (!(ctx->cflags & REG_LITERAL) && 1027 ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) 1028 STACK_PUSHX(stack, int, PARSE_UNION); 1029 STACK_PUSHX(stack, int, bre_branch_begin); 1030 STACK_PUSHX(stack, int, PARSE_BRANCH); 1031 break; 1032 1033 case PARSE_BRANCH: 1034 /* Parse a branch. A branch is one or more pieces, concatenated. 1035 A piece is an atom possibly followed by a postfix operator. */ 1036 bre_branch_begin = tre_stack_pop_int(stack); 1037 STACK_PUSHX(stack, int, PARSE_CATENATION); 1038 STACK_PUSHX(stack, int, bre_branch_begin); 1039 STACK_PUSHX(stack, int, PARSE_PIECE); 1040 break; 1041 1042 case PARSE_PIECE: 1043 /* Parse a piece. A piece is an atom possibly followed by one 1044 or more postfix operators. */ 1045 bre_branch_begin = tre_stack_pop_int(stack); 1046 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1047 STACK_PUSHX(stack, int, bre_branch_begin); 1048 STACK_PUSHX(stack, int, PARSE_ATOM); 1049 break; 1050 1051 case PARSE_CATENATION: 1052 /* If the expression has not ended, parse another piece. */ 1053 { 1054 tre_char_t c; 1055 if (ctx->re >= ctx->re_end) 1056 break; 1057 c = *ctx->re; 1058 if (!(ctx->cflags & REG_LITERAL)) 1059 { 1060 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) || 1061 ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED 1062 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH && 1063 *(ctx->re + 1) == CHAR_PIPE)) 1064 break; 1065 if ((ctx->cflags & REG_EXTENDED 1066 && c == CHAR_RPAREN && depth > 0) 1067 || (!(ctx->cflags & REG_EXTENDED) 1068 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH 1069 && *(ctx->re + 1) == CHAR_RPAREN)) 1070 { 1071 if (!(ctx->cflags & REG_EXTENDED) && depth == 0) 1072 return REG_EPAREN; 1073 DPRINT(("tre_parse: group end: '%.*" STRF "'\n", 1074 REST(ctx->re))); 1075 depth--; 1076 if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED))) 1077 ctx->re += 2; 1078 break; 1079 } 1080 } 1081 1082 #ifdef REG_LEFT_ASSOC 1083 if (ctx->cflags & REG_LEFT_ASSOC) 1084 { 1085 /* Left associative concatenation. */ 1086 STACK_PUSHX(stack, int, PARSE_CATENATION); 1087 STACK_PUSHX(stack, voidptr, result); 1088 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1089 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1090 STACK_PUSHX(stack, int, PARSE_PIECE); 1091 } 1092 else 1093 #endif /* REG_LEFT_ASSOC */ 1094 { 1095 /* Default case, right associative concatenation. */ 1096 STACK_PUSHX(stack, voidptr, result); 1097 STACK_PUSHX(stack, int, PARSE_POST_CATENATION); 1098 STACK_PUSHX(stack, int, PARSE_CATENATION); 1099 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1100 STACK_PUSHX(stack, int, PARSE_PIECE); 1101 } 1102 break; 1103 } 1104 1105 case PARSE_POST_CATENATION: 1106 { 1107 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1108 tre_ast_node_t *tmp_node; 1109 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); 1110 if (!tmp_node) 1111 return REG_ESPACE; 1112 result = tmp_node; 1113 break; 1114 } 1115 1116 case PARSE_UNION: 1117 if (ctx->re >= ctx->re_end) 1118 break; 1119 if (ctx->cflags & REG_LITERAL) 1120 break; 1121 if (!(ctx->cflags & REG_EXTENDED)) 1122 { 1123 if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end) 1124 break; 1125 ctx->re++; 1126 } 1127 switch (*ctx->re) 1128 { 1129 case CHAR_PIPE: 1130 DPRINT(("tre_parse: union: '%.*" STRF "'\n", 1131 REST(ctx->re))); 1132 STACK_PUSHX(stack, int, PARSE_UNION); 1133 STACK_PUSHX(stack, voidptr, (void *)ctx->re); 1134 STACK_PUSHX(stack, voidptr, result); 1135 STACK_PUSHX(stack, int, PARSE_POST_UNION); 1136 /* We need to pass a boolean (eventually) to PARSE_ATOM to 1137 indicate if this is the beginning of a BRE extended branch. */ 1138 STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin 1139 STACK_PUSHX(stack, int, PARSE_BRANCH); 1140 ctx->re++; 1141 break; 1142 1143 case CHAR_RPAREN: 1144 ctx->re++; 1145 break; 1146 1147 default: 1148 if (!(ctx->cflags & REG_EXTENDED)) 1149 ctx->re--; 1150 break; 1151 } 1152 break; 1153 1154 case PARSE_POST_UNION: 1155 { 1156 tre_ast_node_t *tmp_node; 1157 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); 1158 const tre_char_t *pipechar = tre_stack_pop_voidptr(stack); 1159 /* error on empty expression at end of union */ 1160 if (pipechar == ctx->re - 1) 1161 { 1162 return REG_EMPTY; 1163 } 1164 tmp_node = tre_ast_new_union(ctx->mem, tree, result); 1165 if (!tmp_node) 1166 return REG_ESPACE; 1167 result = tmp_node; 1168 break; 1169 } 1170 1171 case PARSE_POSTFIX: 1172 /* Parse postfix operators. */ 1173 if (ctx->re >= ctx->re_end) 1174 break; 1175 if (ctx->cflags & REG_LITERAL) 1176 break; 1177 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; 1178 int rep_min = 0; 1179 int rep_max = -1; 1180 #ifdef TRE_DEBUG 1181 int lbrace_off; 1182 #endif 1183 switch (*ctx->re) 1184 { 1185 case CHAR_PLUS: 1186 case CHAR_QUESTIONMARK: 1187 if (!(ctx->cflags & REG_EXTENDED)) 1188 break; 1189 /*FALLTHROUGH*/ 1190 case CHAR_STAR: 1191 { 1192 tre_ast_node_t *tmp_node; 1193 #ifdef TRE_DEBUG 1194 const char *tstr = "star"; 1195 tmp_re = ctx->re; 1196 #endif 1197 1198 handle_plus_or_question: 1199 /* error on iteration of raw assertion (not in subexpression) */ 1200 if (result->type == LITERAL && result->submatch_id < 0 && 1201 IS_ASSERTION((tre_literal_t *)result->obj)) 1202 { 1203 if (!(ctx->cflags & REG_EXTENDED)) break; 1204 return REG_BADRPT; 1205 } 1206 if (*ctx->re == CHAR_PLUS) 1207 { 1208 rep_min = 1; 1209 #ifdef TRE_DEBUG 1210 tstr = "plus"; 1211 #endif 1212 } 1213 if (*ctx->re == CHAR_QUESTIONMARK) 1214 { 1215 rep_max = 1; 1216 #ifdef TRE_DEBUG 1217 tstr = "questionmark"; 1218 #endif 1219 } 1220 1221 if (ctx->cflags & REG_EXTENDED) 1222 { 1223 if (ctx->re + 1 < ctx->re_end) 1224 { 1225 if (*(ctx->re + 1) == CHAR_QUESTIONMARK) 1226 { 1227 /* Process the question mark only in enhanced mode. 1228 Otherwise, the question mark is an error in ERE */ 1229 if (ctx->cflags & REG_ENHANCED) 1230 { 1231 minimal = !(ctx->cflags & REG_UNGREEDY); 1232 ctx->re++; 1233 } 1234 else return REG_BADRPT; 1235 } 1236 else if (*(ctx->re + 1) == CHAR_STAR 1237 || *(ctx->re + 1) == CHAR_PLUS) 1238 { 1239 /* These are reserved for future extensions. */ 1240 return REG_BADRPT; 1241 } 1242 } 1243 } 1244 else 1245 { 1246 if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR) 1247 { 1248 /* This is reserved for future extensions. */ 1249 return REG_BADRPT; 1250 } 1251 if (ctx->re + 2 < ctx->re_end) 1252 { 1253 if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_QUESTIONMARK) 1254 { 1255 /* Process the question mark only in enhanced mode. 1256 Otherwise, the question mark is a literal in BRE */ 1257 if (ctx->cflags & REG_ENHANCED) 1258 { 1259 minimal = !(ctx->cflags & REG_UNGREEDY); 1260 ctx->re += 2; 1261 } 1262 } 1263 else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS) 1264 { 1265 /* This is reserved for future extensions. */ 1266 return REG_BADRPT; 1267 } 1268 } 1269 } 1270 1271 if (minimal) 1272 ctx->num_reorder_tags++; 1273 1274 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n", 1275 minimal ? " minimal" : "greedy", tstr, REST(tmp_re))); 1276 if (result == NULL) 1277 { 1278 if (ctx->cflags & REG_EXTENDED) return REG_BADRPT; 1279 else goto parse_literal; 1280 } 1281 ctx->re++; 1282 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, 1283 minimal); 1284 if (tmp_node == NULL) 1285 return REG_ESPACE; 1286 result = tmp_node; 1287 1288 /* Set the iterator with a submatch id in the invisible range 1289 * (which will be overridden if a real submatch is needed) */ 1290 result->submatch_id = ctx->submatch_id_invisible++; 1291 1292 #if 0 1293 /* We don't allow multiple postfixes, but this might be needed 1294 to support approximate matching */ 1295 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1296 #endif 1297 } 1298 break; 1299 1300 case CHAR_BACKSLASH: 1301 /* "\{" is special without REG_EXTENDED */ 1302 /* "\+" and "\?" are special with REG_ENHANCED for BRE */ 1303 if (!(ctx->cflags & REG_EXTENDED) 1304 && ctx->re + 1 < ctx->re_end) 1305 { 1306 switch (*(ctx->re + 1)) 1307 { 1308 case CHAR_LBRACE: 1309 ctx->re++; 1310 #ifdef TRE_DEBUG 1311 lbrace_off = 2; 1312 #endif 1313 goto parse_brace; 1314 case CHAR_PLUS: 1315 case CHAR_QUESTIONMARK: 1316 if (ctx->cflags & REG_ENHANCED) 1317 { 1318 #ifdef TRE_DEBUG 1319 tmp_re = ctx->re; 1320 #endif 1321 ctx->re++; 1322 goto handle_plus_or_question; 1323 } 1324 break; 1325 } 1326 break; 1327 } 1328 else 1329 break; 1330 1331 case CHAR_LBRACE: 1332 { 1333 int raw_assertion; 1334 1335 /* "{" is literal without REG_EXTENDED */ 1336 if (!(ctx->cflags & REG_EXTENDED)) 1337 break; 1338 #ifdef TRE_DEBUG 1339 lbrace_off = 1; 1340 #endif 1341 1342 parse_brace: 1343 /* error on iteration of raw assertion (not in subexpression), 1344 but wait until after parsing bounds */ 1345 raw_assertion = (result->type == LITERAL 1346 && result->submatch_id < 0 1347 && IS_ASSERTION((tre_literal_t *)result->obj)); 1348 ctx->re++; 1349 1350 status = tre_parse_bound(ctx, &result); 1351 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 1352 /* For ERE, if status is REG_NOMATCH, this mean the lbrace 1353 is to be treated as a literal. */ 1354 if (status == REG_NOMATCH) 1355 { 1356 ctx->re--; 1357 break; 1358 } 1359 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 1360 DPRINT(("tre_parse: bound: '%.*" STRF "'\n", 1361 REST(ctx->re - lbrace_off))); 1362 if (status != REG_OK) 1363 return status; 1364 if (raw_assertion) return REG_BADRPT; 1365 1366 /* Set the iterator with a submatch id in the invisible range 1367 * (which will be overridden if a real submatch is needed) */ 1368 if (result->type == ITERATION) 1369 result->submatch_id = ctx->submatch_id_invisible++; 1370 1371 #if 0 1372 /* We don't allow multiple postfixes, but this might be needed 1373 to support approximate matching */ 1374 STACK_PUSHX(stack, int, PARSE_POSTFIX); 1375 #endif 1376 break; 1377 } 1378 } 1379 break; 1380 1381 case PARSE_ATOM: 1382 { 1383 /* Parse an atom. An atom is a regular expression enclosed in `()', 1384 an empty set of `()', a bracket expression, `.', `^', `$', 1385 a `\' followed by a character, or a single character. */ 1386 1387 /* The stack contains a boolean value, whether PARSE_ATOM is 1388 being called just after the start of a group (left paren) 1389 in a BRE */ 1390 bre_branch_begin = tre_stack_pop_int(stack); 1391 1392 /* End of regexp? (empty string). */ 1393 if (ctx->re >= ctx->re_end) 1394 goto parse_literal; 1395 1396 if (ctx->cflags & REG_LITERAL) 1397 goto parse_literal; 1398 1399 switch (*ctx->re) 1400 { 1401 case CHAR_LPAREN: /* parenthesized subexpression */ 1402 1403 /* Handle "(?...)" extensions. They work in a way similar 1404 to Perls corresponding extensions. */ 1405 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) == 1406 (REG_EXTENDED|REG_ENHANCED) 1407 && *(ctx->re + 1) == CHAR_QUESTIONMARK) 1408 { 1409 int new_cflags = ctx->cflags; 1410 int bit = 1; 1411 int invisible_submatch = 0; 1412 DPRINT(("tre_parse: extension: '%.*" STRF "'\n", 1413 REST(ctx->re))); 1414 ctx->re += 2; 1415 while (/*CONSTCOND*/1) 1416 { 1417 if (*ctx->re == L'i') 1418 { 1419 DPRINT(("tre_parse: icase: '%.*" STRF "'\n", 1420 REST(ctx->re))); 1421 if (bit) 1422 new_cflags |= REG_ICASE; 1423 else 1424 new_cflags &= ~REG_ICASE; 1425 ctx->re++; 1426 } 1427 else if (*ctx->re == L'n') 1428 { 1429 DPRINT(("tre_parse: newline: '%.*" STRF "'\n", 1430 REST(ctx->re))); 1431 if (bit) 1432 new_cflags |= REG_NEWLINE; 1433 else 1434 new_cflags &= ~REG_NEWLINE; 1435 ctx->re++; 1436 } 1437 #ifdef REG_LEFT_ASSOC 1438 else if (*ctx->re == L'l') 1439 { 1440 DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n", 1441 REST(ctx->re))); 1442 if (bit) 1443 new_cflags |= REG_LEFT_ASSOC; 1444 else 1445 new_cflags &= ~REG_LEFT_ASSOC; 1446 ctx->re++; 1447 } 1448 #endif /* REG_LEFT_ASSOC */ 1449 #ifdef REG_UNGREEDY 1450 else if (*ctx->re == L'U') 1451 { 1452 DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n", 1453 REST(ctx->re))); 1454 if (bit) 1455 new_cflags |= REG_UNGREEDY; 1456 else 1457 new_cflags &= ~REG_UNGREEDY; 1458 ctx->re++; 1459 } 1460 #endif /* REG_UNGREEDY */ 1461 else if (*ctx->re == CHAR_MINUS) 1462 { 1463 DPRINT(("tre_parse: turn off: '%.*" STRF "'\n", 1464 REST(ctx->re))); 1465 ctx->re++; 1466 bit = 0; 1467 } 1468 else if (*ctx->re == CHAR_COLON) 1469 { 1470 DPRINT(("tre_parse: no group: '%.*" STRF 1471 "', (invisible submatch %d)\n", 1472 REST(ctx->re), ctx->submatch_id_invisible)); 1473 ctx->re++; 1474 depth++; 1475 invisible_submatch = 1; 1476 break; 1477 } 1478 else if (*ctx->re == CHAR_HASH) 1479 { 1480 DPRINT(("tre_parse: comment: '%.*" STRF "'\n", 1481 REST(ctx->re))); 1482 /* A comment can contain any character except a 1483 right parenthesis */ 1484 while (*ctx->re != CHAR_RPAREN 1485 && ctx->re < ctx->re_end) 1486 ctx->re++; 1487 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) 1488 { 1489 ctx->re++; 1490 break; 1491 } 1492 else 1493 return REG_BADPAT; 1494 } 1495 else if (*ctx->re == CHAR_RPAREN) 1496 { 1497 ctx->re++; 1498 break; 1499 } 1500 else 1501 return REG_BADRPT; 1502 } 1503 1504 /* Turn on the cflags changes for the rest of the 1505 enclosing group. */ 1506 if (invisible_submatch) 1507 { 1508 STACK_PUSHX(stack, int, ctx->cflags); 1509 STACK_PUSHX(stack, int, ctx->submatch_id_invisible); 1510 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); 1511 ctx->submatch_id_invisible++; 1512 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1513 STACK_PUSHX(stack, int, PARSE_RE); 1514 } 1515 else { 1516 STACK_PUSHX(stack, int, 0); // bre_branch_begin 1517 STACK_PUSHX(stack, int, PARSE_ATOM); 1518 } 1519 ctx->cflags = new_cflags; 1520 break; 1521 } 1522 1523 if (ctx->cflags & REG_EXTENDED) 1524 { 1525 parse_bre_lparen: 1526 DPRINT(("tre_parse: group begin: '%.*" STRF 1527 "', submatch %d\n", REST(ctx->re), 1528 ctx->submatch_id)); 1529 ctx->re++; 1530 /* First parse a whole RE, then mark the resulting tree 1531 for submatching. */ 1532 STACK_PUSHX(stack, int, ctx->cflags); 1533 STACK_PUSHX(stack, int, ctx->submatch_id); 1534 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); 1535 /* We need to pass a boolean (eventually) to PARSE_ATOM to 1536 indicate if this is the beginning of a BRE group. */ 1537 STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED)); 1538 STACK_PUSHX(stack, int, PARSE_RE); 1539 ctx->submatch_id++; 1540 depth++; 1541 } 1542 else 1543 goto parse_literal; 1544 break; 1545 1546 case CHAR_RPAREN: /* end of current subexpression */ 1547 if (ctx->cflags & REG_EXTENDED && depth > 0) 1548 { 1549 parse_bre_rparen_empty: 1550 if (!(ctx->cflags & REG_EXTENDED) && depth == 0) 1551 return REG_EPAREN; 1552 DPRINT(("tre_parse: empty: '%.*" STRF "'\n", 1553 REST(ctx->re))); 1554 /* We were expecting an atom, but instead the current 1555 subexpression was closed. POSIX leaves the meaning of 1556 this to be implementation-defined. We interpret this as 1557 an empty expression (which matches an empty string). */ 1558 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1559 if (result == NULL) 1560 return REG_ESPACE; 1561 if (!(ctx->cflags & REG_EXTENDED)) 1562 ctx->re--; 1563 } 1564 else 1565 goto parse_literal; 1566 break; 1567 1568 case CHAR_LBRACKET: /* bracket expression */ 1569 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", 1570 REST(ctx->re))); 1571 ctx->re++; 1572 status = tre_parse_bracket(ctx, &result); 1573 if (status != REG_OK) 1574 return status; 1575 break; 1576 1577 case CHAR_BACKSLASH: 1578 /* Deal with "\(", "\)" or "\{" for BREs */ 1579 if (!(ctx->cflags & REG_EXTENDED) 1580 && ctx->re + 1 < ctx->re_end) 1581 { 1582 if (*(ctx->re + 1) == CHAR_LPAREN) 1583 { 1584 ctx->re++; 1585 goto parse_bre_lparen; 1586 } 1587 else if (*(ctx->re + 1) == CHAR_RPAREN) 1588 { 1589 ctx->re++; 1590 goto parse_bre_rparen_empty; 1591 } 1592 if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal; 1593 } 1594 1595 if (ctx->re + 1 >= ctx->re_end) 1596 /* Trailing backslash. */ 1597 return REG_EESCAPE; 1598 1599 /* XXX illumos: want macro expansion enabled by default (same as TRE) */ 1600 #if 0 1601 if (!(ctx->cflags & REG_ENHANCED)) 1602 { 1603 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re))); 1604 ctx->re++; 1605 goto unenhanced_backslash; 1606 } 1607 #endif 1608 1609 /* If a macro is used, parse the expanded macro recursively. */ 1610 { 1611 tre_char_t buf[64]; 1612 tre_expand_macro(ctx->re + 1, ctx->re_end, 1613 buf, elementsof(buf)); 1614 if (buf[0] != 0) 1615 { 1616 tre_parse_ctx_t subctx; 1617 memcpy(&subctx, ctx, sizeof(subctx)); 1618 subctx.re = buf; 1619 subctx.len = tre_strlen(buf); 1620 subctx.nofirstsub = 1; 1621 status = tre_parse(&subctx); 1622 if (status != REG_OK) 1623 return status; 1624 ctx->re += 2; 1625 ctx->position = subctx.position; 1626 result = subctx.result; 1627 break; 1628 } 1629 } 1630 1631 if (*(ctx->re + 1) == L'Q') 1632 { 1633 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", 1634 REST(ctx->re))); 1635 ctx->cflags |= REG_LITERAL; 1636 temporary_cflags |= REG_LITERAL; 1637 ctx->re += 2; 1638 STACK_PUSHX(stack, int, 0); 1639 STACK_PUSHX(stack, int, PARSE_ATOM); 1640 break; 1641 } 1642 1643 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); 1644 ctx->re++; 1645 switch (*ctx->re) 1646 { 1647 case L'b': 1648 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1649 ASSERT_AT_WB, -1); 1650 ctx->re++; 1651 break; 1652 case L'B': 1653 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1654 ASSERT_AT_WB_NEG, -1); 1655 ctx->re++; 1656 break; 1657 case L'<': 1658 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1659 ASSERT_AT_BOW, -1); 1660 ctx->re++; 1661 break; 1662 case L'>': 1663 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1664 ASSERT_AT_EOW, -1); 1665 ctx->re++; 1666 break; 1667 case L'x': 1668 ctx->re++; 1669 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) 1670 { 1671 /* 8 bit hex char. */ 1672 char tmp[3] = {0, 0, 0}; 1673 long val; 1674 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", 1675 REST(ctx->re - 2))); 1676 1677 if (tre_isxdigit_l(ctx->re[0], ctx->loc) && 1678 ctx->re < ctx->re_end) 1679 { 1680 tmp[0] = (char)ctx->re[0]; 1681 ctx->re++; 1682 } 1683 if (tre_isxdigit_l(ctx->re[0], ctx->loc) && 1684 ctx->re < ctx->re_end) 1685 { 1686 tmp[1] = (char)ctx->re[0]; 1687 ctx->re++; 1688 } 1689 val = strtol(tmp, NULL, 16); 1690 result = tre_ast_new_literal(ctx->mem, (int)val, 1691 (int)val, ctx->position); 1692 ctx->position++; 1693 break; 1694 } 1695 else if (ctx->re < ctx->re_end) 1696 { 1697 /* Wide char. */ 1698 char tmp[32]; 1699 long val; 1700 int i = 0; 1701 ctx->re++; 1702 while (ctx->re_end - ctx->re >= 0) 1703 { 1704 if (ctx->re[0] == CHAR_RBRACE) 1705 break; 1706 if (tre_isxdigit_l(ctx->re[0], ctx->loc)) 1707 { 1708 tmp[i] = (char)ctx->re[0]; 1709 i++; 1710 ctx->re++; 1711 continue; 1712 } 1713 return REG_EBRACE; 1714 } 1715 ctx->re++; 1716 tmp[i] = 0; 1717 val = strtol(tmp, NULL, 16); 1718 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, 1719 ctx->position); 1720 ctx->position++; 1721 break; 1722 } 1723 /*FALLTHROUGH*/ 1724 1725 default: 1726 unenhanced_backslash: 1727 if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) != 1728 REG_EXTENDED && 1729 tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0') 1730 { 1731 /* Back reference (only in BRE or enhanced). */ 1732 int val = *ctx->re - L'0'; 1733 DPRINT(("tre_parse: backref: '%.*" STRF "'\n", 1734 REST(ctx->re - 1))); 1735 result = tre_ast_new_literal(ctx->mem, BACKREF, val, 1736 ctx->position); 1737 if (result == NULL) 1738 return REG_ESPACE; 1739 1740 /* Set the backref with a submatch id in the invisible 1741 * range (which will be overridden if a real submatch 1742 * is needed) */ 1743 result->submatch_id = ctx->submatch_id_invisible++; 1744 1745 ctx->position++; 1746 ctx->num_reorder_tags++; 1747 ctx->max_backref = MAX(val, ctx->max_backref); 1748 ctx->re++; 1749 } 1750 else 1751 { 1752 /* Escaped character. */ 1753 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", 1754 REST(ctx->re - 1))); 1755 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 1756 ctx->position); 1757 ctx->position++; 1758 ctx->re++; 1759 } 1760 break; 1761 } 1762 if (result == NULL) 1763 return REG_ESPACE; 1764 break; 1765 1766 case CHAR_PERIOD: /* the any-symbol */ 1767 DPRINT(("tre_parse: any: '%.*" STRF "'\n", 1768 REST(ctx->re))); 1769 if (ctx->cflags & REG_NEWLINE) 1770 { 1771 tre_ast_node_t *tmp1; 1772 tre_ast_node_t *tmp2; 1773 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, 1774 ctx->position); 1775 if (!tmp1) 1776 return REG_ESPACE; 1777 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, 1778 ctx->position + 1); 1779 if (!tmp2) 1780 return REG_ESPACE; 1781 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 1782 if (!result) 1783 return REG_ESPACE; 1784 ctx->position += 2; 1785 } 1786 else 1787 { 1788 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, 1789 ctx->position); 1790 if (!result) 1791 return REG_ESPACE; 1792 ctx->position++; 1793 } 1794 ctx->re++; 1795 break; 1796 1797 case CHAR_CARET: /* beginning of line assertion */ 1798 /* '^' has a special meaning everywhere in EREs, at the 1799 beginning of the RE and after \( is BREs. It is also 1800 special in enhanced BREs at the beginning of each branches 1801 of a union */ 1802 if (ctx->cflags & REG_EXTENDED 1803 || bre_branch_begin 1804 || ctx->re == ctx->re_start) 1805 { 1806 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", 1807 REST(ctx->re))); 1808 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1809 ASSERT_AT_BOL, -1); 1810 if (result == NULL) 1811 return REG_ESPACE; 1812 ctx->re++; 1813 } 1814 else 1815 goto parse_literal; 1816 break; 1817 1818 case CHAR_DOLLAR: /* end of line assertion. */ 1819 /* '$' is special everywhere in EREs, and in the end of the 1820 string and before \) is BREs. */ 1821 if (ctx->cflags & REG_EXTENDED 1822 || (ctx->re + 2 < ctx->re_end 1823 && *(ctx->re + 1) == CHAR_BACKSLASH 1824 && *(ctx->re + 2) == CHAR_RPAREN) 1825 || ctx->re + 1 == ctx->re_end) 1826 { 1827 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", 1828 REST(ctx->re))); 1829 result = tre_ast_new_literal(ctx->mem, ASSERTION, 1830 ASSERT_AT_EOL, -1); 1831 if (result == NULL) 1832 return REG_ESPACE; 1833 ctx->re++; 1834 } 1835 else 1836 goto parse_literal; 1837 break; 1838 1839 default: 1840 parse_literal: 1841 1842 if (temporary_cflags && ctx->re + 1 < ctx->re_end 1843 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') 1844 { 1845 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", 1846 REST(ctx->re))); 1847 ctx->cflags &= ~temporary_cflags; 1848 temporary_cflags = 0; 1849 ctx->re += 2; 1850 if (ctx->re < ctx->re_end) 1851 { 1852 STACK_PUSHX(stack, int, 0); 1853 STACK_PUSHX(stack, int, PARSE_ATOM); 1854 } 1855 else 1856 { 1857 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1858 if (!result) return REG_ESPACE; 1859 } 1860 break; 1861 } 1862 1863 /* We are expecting an atom. If the subexpression (or the whole 1864 regexp ends here, we interpret it as an empty expression 1865 (which matches an empty string), which is an error. 1866 Iterations of an empty expression is also an error. */ 1867 if (!(ctx->cflags & REG_LITERAL)) 1868 { 1869 /* error on end of string */ 1870 if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN 1871 : REG_EMPTY; 1872 /* error on unions and iterations of empty expressions */ 1873 if (ctx->cflags & REG_EXTENDED) 1874 { 1875 if (ctx->re < ctx->re_end) 1876 { 1877 if (*ctx->re == CHAR_PIPE) return REG_EMPTY; 1878 if (*ctx->re == CHAR_LBRACE) 1879 { 1880 ctx->re++; 1881 empty_parse_bound: 1882 /* We need to parse the bound first and return 1883 any error, before returning REG_BADRPT */ 1884 status = tre_parse_bound(ctx, NULL); 1885 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 1886 /* For ERE, if REG_NOMATCH is returned, we 1887 treat the lbrace as a literal. */ 1888 if (status == REG_NOMATCH) 1889 { 1890 ctx->re--; 1891 /* Drop down to literal-handling code */ 1892 } 1893 else 1894 { 1895 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 1896 if (status != REG_OK) 1897 return status; 1898 return REG_BADRPT; 1899 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 1900 } 1901 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 1902 } 1903 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND 1904 else 1905 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ 1906 if (*ctx->re == CHAR_STAR 1907 || *ctx->re == CHAR_PLUS 1908 || *ctx->re == CHAR_QUESTIONMARK) 1909 { 1910 return REG_BADRPT; 1911 } 1912 } 1913 } 1914 else if (ctx->re + 1 < ctx->re_end 1915 && *ctx->re == CHAR_BACKSLASH 1916 && *(ctx->re + 1) == CHAR_LBRACE) 1917 { 1918 ctx->re += 2; 1919 goto empty_parse_bound; 1920 } 1921 } 1922 1923 DPRINT(("tre_parse: literal: '%.*" STRF "'\n", 1924 REST(ctx->re))); 1925 /* Note that we can't use an tre_isalpha() test here, since there 1926 may be characters which are alphabetic but neither upper or 1927 lower case. */ 1928 if (ctx->cflags & REG_ICASE 1929 && (tre_isupper_l(*ctx->re, ctx->loc) || 1930 tre_islower_l(*ctx->re, ctx->loc))) 1931 { 1932 tre_ast_node_t *tmp1; 1933 tre_ast_node_t *tmp2; 1934 1935 /* XXX - Can there be more than one opposite-case 1936 counterpoints for some character in some locale? Or 1937 more than two characters which all should be regarded 1938 the same character if case is ignored? If yes, there 1939 does not seem to be a portable way to detect it. I guess 1940 that at least for multi-character collating elements there 1941 could be several opposite-case counterpoints, but they 1942 cannot be supported portably anyway. */ 1943 tmp1 = tre_ast_new_literal(ctx->mem, 1944 tre_toupper_l(*ctx->re, ctx->loc), 1945 tre_toupper_l(*ctx->re, ctx->loc), 1946 ctx->position); 1947 if (!tmp1) 1948 return REG_ESPACE; 1949 tmp2 = tre_ast_new_literal(ctx->mem, 1950 tre_tolower_l(*ctx->re, ctx->loc), 1951 tre_tolower_l(*ctx->re, ctx->loc), 1952 ctx->position); 1953 if (!tmp2) 1954 return REG_ESPACE; 1955 result = tre_ast_new_union(ctx->mem, tmp1, tmp2); 1956 if (!result) 1957 return REG_ESPACE; 1958 } 1959 else 1960 { 1961 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, 1962 ctx->position); 1963 if (!result) 1964 return REG_ESPACE; 1965 } 1966 ctx->position++; 1967 ctx->re++; 1968 break; 1969 } 1970 break; 1971 } 1972 1973 case PARSE_MARK_FOR_SUBMATCH: 1974 { 1975 int submatch_id = tre_stack_pop_int(stack); 1976 1977 ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */ 1978 if (result->submatch_id >= 0 && 1979 result->submatch_id < SUBMATCH_ID_INVISIBLE_START) 1980 { 1981 tre_ast_node_t *n, *tmp_node; 1982 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START) 1983 break; 1984 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); 1985 if (n == NULL) 1986 return REG_ESPACE; 1987 tmp_node = tre_ast_new_catenation(ctx->mem, n, result); 1988 if (tmp_node == NULL) 1989 return REG_ESPACE; 1990 tmp_node->num_submatches = result->num_submatches; 1991 result = tmp_node; 1992 } 1993 result->submatch_id = submatch_id; 1994 if (submatch_id < SUBMATCH_ID_INVISIBLE_START) 1995 result->num_submatches++; 1996 break; 1997 } 1998 1999 default: 2000 assert(0); 2001 break; 2002 } 2003 } 2004 2005 /* Check for missing closing parentheses. */ 2006 if (depth > 0) 2007 return REG_EPAREN; 2008 2009 ctx->result = result; 2010 2011 return REG_OK; 2012 }