1 /* 2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 tre-match-backtrack.c - TRE backtracking regex matching engine 31 */ 32 33 /* 34 This matcher is for regexps that use back referencing. Regexp matching 35 with back referencing is an NP-complete problem on the number of back 36 references. The easiest way to match them is to use a backtracking 37 routine which basically goes through all possible paths in the TNFA 38 and chooses the one which results in the best (leftmost and longest) 39 match. This can be spectacularly expensive and may run out of stack 40 space, but there really is no better known generic algorithm. Quoting 41 Henry Spencer from comp.compilers: 42 <URL: http://compilers.iecc.com/comparch/article/93-03-102> 43 44 POSIX.2 REs require longest match, which is really exciting to 45 implement since the obsolete ("basic") variant also includes 46 \<digit>. I haven't found a better way of tackling this than doing 47 a preliminary match using a DFA (or simulation) on a modified RE 48 that just replicates subREs for \<digit>, and then doing a 49 backtracking match to determine whether the subRE matches were 50 right. This can be rather slow, but I console myself with the 51 thought that people who use \<digit> deserve very slow execution. 52 (Pun unintentional but very appropriate.) 53 54 */ 55 56 #include <assert.h> 57 #include <stdlib.h> 58 #include <string.h> 59 #include <wchar.h> 60 #include <wctype.h> 61 62 #include "tre-internal.h" 63 #include "tre-mem.h" 64 #include "tre-match-utils.h" 65 #include "tre.h" 66 #include "malloc.h" 67 68 typedef struct { 69 int pos; 70 unsigned int pos_add_next; 71 const char *str_byte; 72 const wchar_t *str_wide; 73 tre_tnfa_transition_t *state; 74 int state_id; 75 int next_c; 76 tre_tag_t *tags; 77 mbstate_t mbstate; 78 } tre_backtrack_item_t; 79 80 typedef struct tre_backtrack_struct { 81 tre_backtrack_item_t item; 82 struct tre_backtrack_struct *prev; 83 struct tre_backtrack_struct *next; 84 } *tre_backtrack_t; 85 86 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide) 87 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide 88 89 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate) 90 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate 91 92 #define tre_bt_mem_new tre_mem_new 93 #define tre_bt_mem_alloc tre_mem_alloc 94 #define tre_bt_mem_destroy tre_mem_destroy 95 96 #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ 97 do \ 98 { \ 99 if (!stack->next) \ 100 { \ 101 tre_backtrack_t s; \ 102 s = tre_bt_mem_alloc(mem, sizeof(*s)); \ 103 if (!s) \ 104 { \ 105 tre_bt_mem_destroy(mem); \ 106 if (tags) \ 107 free(tags); \ 108 if (pmatch) \ 109 free(pmatch); \ 110 if (states_seen) \ 111 free(states_seen); \ 112 return REG_ESPACE; \ 113 } \ 114 s->prev = stack; \ 115 s->next = NULL; \ 116 s->item.tags = tre_bt_mem_alloc(mem, \ 117 num_tags * sizeof(*tags)); \ 118 if (!s->item.tags) \ 119 { \ 120 tre_bt_mem_destroy(mem); \ 121 if (tags) \ 122 free(tags); \ 123 if (pmatch) \ 124 free(pmatch); \ 125 if (states_seen) \ 126 free(states_seen); \ 127 return REG_ESPACE; \ 128 } \ 129 stack->next = s; \ 130 stack = s; \ 131 } \ 132 else \ 133 stack = stack->next; \ 134 stack->item.pos = (_pos); \ 135 stack->item.pos_add_next = (_pos_add_next); \ 136 stack->item.str_byte = (_str_byte); \ 137 BT_STACK_WIDE_IN(_str_wide); \ 138 stack->item.state = (_state); \ 139 stack->item.state_id = (_state_id); \ 140 stack->item.next_c = (_next_c); \ 141 memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \ 142 BT_STACK_MBSTATE_IN; \ 143 } \ 144 while (/*CONSTCOND*/0) 145 146 #define BT_STACK_POP() \ 147 do \ 148 { \ 149 assert(stack->prev); \ 150 pos = stack->item.pos; \ 151 pos_add_next = stack->item.pos_add_next; \ 152 str_byte = stack->item.str_byte; \ 153 BT_STACK_WIDE_OUT; \ 154 state = stack->item.state; \ 155 next_c = stack->item.next_c; \ 156 memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \ 157 BT_STACK_MBSTATE_OUT; \ 158 stack = stack->prev; \ 159 } \ 160 while (/*CONSTCOND*/0) 161 162 #undef MIN 163 #define MIN(a, b) ((a) <= (b) ? (a) : (b)) 164 165 reg_errcode_t 166 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, 167 int len, tre_str_type_t type, tre_tag_t *match_tags, 168 int eflags, int *match_end_ofs) 169 { 170 /* State variables required by GET_NEXT_WCHAR. */ 171 tre_char_t prev_c = 0, next_c = 0; 172 const char *str_byte = string; 173 int pos = 0; 174 unsigned int pos_add_next = 1; 175 const wchar_t *str_wide = string; 176 mbstate_t mbstate; 177 int reg_notbol = eflags & REG_NOTBOL; 178 int reg_noteol = eflags & REG_NOTEOL; 179 int reg_newline = tnfa->cflags & REG_NEWLINE; 180 int i; 181 182 /* These are used to remember the necessary values of the above 183 variables to return to the position where the current search 184 started from. */ 185 int next_c_start; 186 const char *str_byte_start; 187 int pos_start = -1; 188 const wchar_t *str_wide_start; 189 mbstate_t mbstate_start; 190 191 /* End offset of best match so far, or -1 if no match found yet. */ 192 int match_eo = -1; 193 /* Tag arrays. */ 194 int *next_tags; 195 tre_tag_t *tags = NULL; 196 /* Current TNFA state. */ 197 tre_tnfa_transition_t *state; 198 int *states_seen = NULL; 199 200 /* Memory allocator to for allocating the backtracking stack. */ 201 tre_mem_t mem = tre_bt_mem_new(); 202 203 /* The backtracking stack. */ 204 tre_backtrack_t stack; 205 206 tre_tnfa_transition_t *trans_i; 207 regmatch_t *pmatch = NULL; 208 reg_errcode_t ret; 209 210 int num_tags = tnfa->num_tags; 211 int touch = 1; 212 char *buf; 213 int tbytes; 214 215 memset(&mbstate, '\0', sizeof(mbstate)); 216 buf = NULL; 217 218 if (!mem) 219 return REG_ESPACE; 220 stack = tre_bt_mem_alloc(mem, sizeof(*stack)); 221 if (!stack) 222 { 223 ret = REG_ESPACE; 224 goto error_exit; 225 } 226 stack->prev = NULL; 227 stack->next = NULL; 228 229 DPRINT(("tnfa_execute_backtrack, input type %d\n", type)); 230 DPRINT(("len = %d\n", len)); 231 232 { 233 int pbytes, sbytes, total_bytes; 234 char *tmp_buf; 235 /* Compute the length of the block we need. */ 236 tbytes = sizeof(*tags) * num_tags; 237 pbytes = sizeof(*pmatch) * tnfa->num_submatches; 238 sbytes = sizeof(*states_seen) * tnfa->num_states; 239 total_bytes = 240 (sizeof(long) - 1) * 2 /* for alignment paddings */ 241 + tbytes + pbytes + sbytes; 242 243 DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes)); 244 /* Allocate the memory. */ 245 buf = malloc((unsigned)total_bytes); 246 if (buf == NULL) 247 return REG_ESPACE; 248 249 /* Get the various pointers within tmp_buf (properly aligned). */ 250 tags = (void *)buf; 251 tmp_buf = buf + tbytes; 252 tmp_buf += ALIGN(tmp_buf, long); 253 pmatch = (void *)tmp_buf; 254 tmp_buf += pbytes; 255 tmp_buf += ALIGN(tmp_buf, long); 256 states_seen = (void *)tmp_buf; 257 } 258 259 retry: 260 { 261 memset(tags, 0, num_tags * sizeof(*tags)); 262 if (match_tags) 263 memset(match_tags, 0, num_tags * sizeof(*match_tags)); 264 for (i = 0; i < tnfa->num_states; i++) 265 states_seen[i] = 0; 266 } 267 268 state = NULL; 269 pos = pos_start; 270 GET_NEXT_WCHAR(); 271 pos_start = pos; 272 next_c_start = next_c; 273 str_byte_start = str_byte; 274 str_wide_start = str_wide; 275 mbstate_start = mbstate; 276 277 /* Handle initial states. */ 278 next_tags = NULL; 279 for (trans_i = tnfa->initial; trans_i->state; trans_i++) 280 { 281 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c)); 282 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) 283 { 284 DPRINT(("assert failed\n")); 285 continue; 286 } 287 if (state == NULL) 288 { 289 /* Start from this state. */ 290 state = trans_i->state; 291 next_tags = trans_i->tags; 292 } 293 else 294 { 295 /* Backtrack to this state. */ 296 DPRINT(("saving state %d for backtracking\n", trans_i->state_id)); 297 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state, 298 trans_i->state_id, next_c, tags, mbstate); 299 { 300 int *tmp = trans_i->tags; 301 if (tmp) 302 { 303 while (*tmp >= 0) 304 tre_tag_set(stack->item.tags, *tmp++, pos, touch); 305 touch++; 306 } 307 } 308 } 309 } 310 311 if (next_tags) 312 { 313 for (; *next_tags >= 0; next_tags++) 314 tre_tag_set(tags, *next_tags, pos, touch); 315 touch++; 316 } 317 318 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte)); 319 DPRINT(("pos:chr/code | state and tags\n")); 320 DPRINT(("-------------+------------------------------------------------\n")); 321 322 if (state == NULL) 323 goto backtrack; 324 325 while (/*CONSTCOND*/1) 326 { 327 tre_tnfa_transition_t *next_state; 328 int empty_br_match; 329 330 DPRINT(("start loop\n")); 331 332 if (match_eo >= 0 && tnfa->num_minimals) 333 { 334 int skip = 0; 335 #ifdef TRE_DEBUG 336 DPRINT(("Checking minimal conditions: match_eo=%d match_tags=", 337 match_eo)); 338 tre_print_tags(match_tags, tnfa->num_tags); 339 DPRINT(("\n")); 340 #endif /* TRE_DEBUG */ 341 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 342 { 343 int end = tnfa->minimal_tags[i]; 344 int start = tnfa->minimal_tags[i + 1]; 345 DPRINT((" Minimal start %d, end %d\n", start, end)); 346 if (tre_minimal_tag_order(start, end, match_tags, tags) > 0) 347 { 348 skip = 1; 349 break; 350 } 351 } 352 if (!skip) 353 { 354 #ifdef TRE_DEBUG 355 DPRINT((" Keeping tags=")); 356 tre_print_tags(tags, tnfa->num_tags); 357 DPRINT(("\n")); 358 #endif /* TRE_DEBUG */ 359 } 360 else 361 { 362 #ifdef TRE_DEBUG 363 DPRINT((" Throwing out tags=")); 364 tre_print_tags(tags, tnfa->num_tags); 365 DPRINT(("\n")); 366 #endif /* TRE_DEBUG */ 367 goto backtrack; 368 } 369 } 370 371 if (state == tnfa->final) 372 { 373 DPRINT((" match found, match_eo=%d pos=%d\n", match_eo, pos)); 374 375 if (match_eo >= 0 && tnfa->num_minimals) 376 { 377 int compare = 0; 378 #ifdef TRE_DEBUG 379 DPRINT(("Checking minimal conditions: match_eo=%d " 380 "match_tags=", match_eo)); 381 tre_print_tags(match_tags, tnfa->num_tags); 382 DPRINT(("\n")); 383 #endif /* TRE_DEBUG */ 384 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 385 { 386 int end = tnfa->minimal_tags[i]; 387 int start = tnfa->minimal_tags[i + 1]; 388 DPRINT((" Minimal start %d, end %d\n", start, end)); 389 if ((compare = tre_minimal_tag_order(start, end, 390 match_tags, tags)) != 0) 391 break; 392 } 393 if (compare > 0) 394 { 395 #ifdef TRE_DEBUG 396 DPRINT((" Throwing out new match, tags=")); 397 tre_print_tags(tags, tnfa->num_tags); 398 DPRINT(("\n")); 399 #endif /* TRE_DEBUG */ 400 goto backtrack; 401 } 402 else if (compare < 0) 403 { 404 #ifdef TRE_DEBUG 405 DPRINT((" Throwing out old match, tags=")); 406 tre_print_tags(match_tags, tnfa->num_tags); 407 DPRINT(("\n")); 408 #endif /* TRE_DEBUG */ 409 match_eo = -1; 410 } 411 } 412 413 if (match_eo < pos 414 || (match_eo == pos 415 && match_tags 416 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, 417 tags, match_tags))) 418 { 419 /* This match wins the previous match. */ 420 #ifdef TRE_DEBUG 421 DPRINT((" win previous tags=")); 422 tre_print_tags(tags, tnfa->num_tags); 423 DPRINT(("\n")); 424 #endif /* TRE_DEBUG */ 425 match_eo = pos; 426 if (match_tags) 427 memcpy(match_tags, tags, num_tags * sizeof(*tags)); 428 } 429 /* Our TNFAs never have transitions leaving from the final state, 430 so we jump right to backtracking. */ 431 goto backtrack; 432 } 433 434 #ifdef TRE_DEBUG 435 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c, 436 state)); 437 tre_print_tags(tags, tnfa->num_tags); 438 DPRINT(("\n")); 439 #endif /* TRE_DEBUG */ 440 441 /* Go to the next character in the input string. */ 442 empty_br_match = 0; 443 trans_i = state; 444 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) 445 { 446 /* This is a back reference state. All transitions leaving from 447 this state have the same back reference "assertion". Instead 448 of reading the next character, we match the back reference. */ 449 int so, eo, bt = trans_i->u.backref; 450 int bt_len; 451 int result; 452 453 DPRINT((" should match back reference %d\n", bt)); 454 /* Get the substring we need to match against. Remember to 455 turn off REG_NOSUB temporarily. */ 456 ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB, 457 tnfa, tags, pos); 458 if (ret != REG_OK) goto error_exit; 459 so = pmatch[bt].rm_so; 460 eo = pmatch[bt].rm_eo; 461 bt_len = eo - so; 462 463 #ifdef TRE_DEBUG 464 { 465 int slen; 466 if (len < 0) 467 slen = bt_len; 468 else 469 slen = MIN(bt_len, len - pos); 470 471 if (type == STR_BYTE) 472 { 473 DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n", 474 bt_len, so, eo, bt_len, (char*)string + so)); 475 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1)); 476 } 477 else if (type == STR_WIDE) 478 { 479 DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF "'\n", 480 bt_len, so, eo, bt_len, (wchar_t*)string + so)); 481 DPRINT((" current string is '%.*" STRF "'\n", 482 slen, str_wide - 1)); 483 } 484 } 485 #endif 486 487 if (so < 0) 488 { 489 result = 1; /* Back reference of nomatch doesn't match */ 490 } 491 else if (len < 0) 492 { 493 if (type == STR_WIDE) 494 result = wcsncmp((const wchar_t*)string + so, str_wide - 1, 495 (size_t)bt_len); 496 else 497 result = strncmp((const char*)string + so, str_byte - 1, 498 (size_t)bt_len); 499 } 500 else if (len - pos < bt_len) 501 result = 1; 502 else if (type == STR_WIDE) 503 result = wmemcmp((const wchar_t*)string + so, str_wide - 1, 504 (size_t)bt_len); 505 else 506 result = memcmp((const char*)string + so, str_byte - 1, 507 (size_t)bt_len); 508 509 if (result == 0) 510 { 511 /* Back reference matched. Check for infinite loop. */ 512 if (bt_len == 0) 513 empty_br_match = 1; 514 if (empty_br_match && states_seen[trans_i->state_id]) 515 { 516 DPRINT((" avoid loop\n")); 517 goto backtrack; 518 } 519 520 states_seen[trans_i->state_id] = empty_br_match; 521 522 /* Advance in input string and resync `prev_c', `next_c' 523 and pos. */ 524 DPRINT((" back reference matched\n")); 525 str_byte += bt_len - 1; 526 str_wide += bt_len - 1; 527 pos += bt_len - 1; 528 GET_NEXT_WCHAR(); 529 DPRINT((" pos now %d\n", pos)); 530 } 531 else 532 { 533 DPRINT((" back reference did not match\n")); 534 goto backtrack; 535 } 536 } 537 else 538 { 539 /* Check for end of string. */ 540 if (len < 0) 541 { 542 if (next_c == L'\0') 543 goto backtrack; 544 } 545 else 546 { 547 if (pos >= len) 548 goto backtrack; 549 } 550 551 /* Read the next character. */ 552 GET_NEXT_WCHAR(); 553 } 554 555 next_state = NULL; 556 for (trans_i = state; trans_i->state; trans_i++) 557 { 558 DPRINT((" transition %d-%d (%c-%c) %d to %d\n", 559 trans_i->code_min, trans_i->code_max, 560 trans_i->code_min, trans_i->code_max, 561 trans_i->assertions, trans_i->state_id)); 562 if (trans_i->code_min <= (tre_cint_t)prev_c 563 && trans_i->code_max >= (tre_cint_t)prev_c) 564 { 565 if (trans_i->assertions 566 && (CHECK_ASSERTIONS(trans_i->assertions) 567 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 568 { 569 DPRINT((" assertion failed\n")); 570 continue; 571 } 572 573 if (next_state == NULL) 574 { 575 /* First matching transition. */ 576 DPRINT((" Next state is %d\n", trans_i->state_id)); 577 next_state = trans_i->state; 578 next_tags = trans_i->tags; 579 } 580 else 581 { 582 /* Second matching transition. We may need to backtrack here 583 to take this transition instead of the first one, so we 584 push this transition in the backtracking stack so we can 585 jump back here if needed. */ 586 DPRINT((" saving state %d for backtracking\n", 587 trans_i->state_id)); 588 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, 589 trans_i->state, trans_i->state_id, next_c, 590 tags, mbstate); 591 { 592 int *tmp; 593 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) 594 tre_tag_set(stack->item.tags, *tmp, pos, touch); 595 touch++; 596 } 597 #if 0 /* XXX - it's important not to look at all transitions here to keep 598 the stack small! */ 599 break; 600 #endif 601 } 602 } 603 } 604 605 if (next_state != NULL) 606 { 607 /* Matching transitions were found. Take the first one. */ 608 state = next_state; 609 610 /* Update the tag values. */ 611 if (next_tags) 612 { 613 while (*next_tags >= 0) 614 tre_tag_set(tags, *next_tags++, pos, touch); 615 touch++; 616 } 617 } 618 else 619 { 620 backtrack: 621 /* A matching transition was not found. Try to backtrack. */ 622 if (stack->prev) 623 { 624 DPRINT((" backtracking\n")); 625 if (stack->item.state->assertions & ASSERT_BACKREF) 626 { 627 DPRINT((" states_seen[%d] = 0\n", 628 stack->item.state_id)); 629 states_seen[stack->item.state_id] = 0; 630 } 631 632 BT_STACK_POP(); 633 } 634 else if (match_eo < 0) 635 { 636 /* Try starting from a later position in the input string. */ 637 /* Check for end of string. */ 638 if (pos == pos_start) 639 { 640 if (len < 0) 641 { 642 if (next_c == L'\0') 643 { 644 DPRINT(("end of string.\n")); 645 break; 646 } 647 } 648 else 649 { 650 if (pos >= len) 651 { 652 DPRINT(("end of string.\n")); 653 break; 654 } 655 } 656 } 657 DPRINT(("restarting from next start position\n")); 658 next_c = next_c_start; 659 mbstate = mbstate_start; 660 str_byte = str_byte_start; 661 str_wide = str_wide_start; 662 goto retry; 663 } 664 else 665 { 666 DPRINT(("finished\n")); 667 break; 668 } 669 } 670 } 671 672 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; 673 *match_end_ofs = match_eo; 674 675 error_exit: 676 tre_bt_mem_destroy(mem); 677 if (buf) 678 free(buf); 679 680 return ret; 681 }