1 /*
2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 tre-match-backtrack.c - TRE backtracking regex matching engine
31 */
32
33 /*
34 This matcher is for regexps that use back referencing. Regexp matching
35 with back referencing is an NP-complete problem on the number of back
36 references. The easiest way to match them is to use a backtracking
37 routine which basically goes through all possible paths in the TNFA
38 and chooses the one which results in the best (leftmost and longest)
39 match. This can be spectacularly expensive and may run out of stack
40 space, but there really is no better known generic algorithm. Quoting
41 Henry Spencer from comp.compilers:
42 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
43
44 POSIX.2 REs require longest match, which is really exciting to
45 implement since the obsolete ("basic") variant also includes
46 \<digit>. I haven't found a better way of tackling this than doing
47 a preliminary match using a DFA (or simulation) on a modified RE
48 that just replicates subREs for \<digit>, and then doing a
49 backtracking match to determine whether the subRE matches were
50 right. This can be rather slow, but I console myself with the
51 thought that people who use \<digit> deserve very slow execution.
52 (Pun unintentional but very appropriate.)
53
54 */
55
56 #include <assert.h>
57 #include <stdlib.h>
58 #include <string.h>
59 #include <wchar.h>
60 #include <wctype.h>
61
62 #include "tre-internal.h"
63 #include "tre-mem.h"
64 #include "tre-match-utils.h"
65 #include "tre.h"
66 #include "malloc.h"
67
68 typedef struct {
69 int pos;
70 unsigned int pos_add_next;
71 const char *str_byte;
72 const wchar_t *str_wide;
73 tre_tnfa_transition_t *state;
74 int state_id;
75 int next_c;
76 tre_tag_t *tags;
77 mbstate_t mbstate;
78 } tre_backtrack_item_t;
79
80 typedef struct tre_backtrack_struct {
81 tre_backtrack_item_t item;
82 struct tre_backtrack_struct *prev;
83 struct tre_backtrack_struct *next;
84 } *tre_backtrack_t;
85
86 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
87 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
88
89 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
90 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
91
92 #define tre_bt_mem_new tre_mem_new
93 #define tre_bt_mem_alloc tre_mem_alloc
94 #define tre_bt_mem_destroy tre_mem_destroy
95
96 #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
97 do \
98 { \
99 if (!stack->next) \
100 { \
101 tre_backtrack_t s; \
102 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
103 if (!s) \
104 { \
105 tre_bt_mem_destroy(mem); \
106 if (tags) \
107 free(tags); \
108 if (pmatch) \
109 free(pmatch); \
110 if (states_seen) \
111 free(states_seen); \
112 return REG_ESPACE; \
113 } \
114 s->prev = stack; \
115 s->next = NULL; \
116 s->item.tags = tre_bt_mem_alloc(mem, \
117 num_tags * sizeof(*tags)); \
118 if (!s->item.tags) \
119 { \
120 tre_bt_mem_destroy(mem); \
121 if (tags) \
122 free(tags); \
123 if (pmatch) \
124 free(pmatch); \
125 if (states_seen) \
126 free(states_seen); \
127 return REG_ESPACE; \
128 } \
129 stack->next = s; \
130 stack = s; \
131 } \
132 else \
133 stack = stack->next; \
134 stack->item.pos = (_pos); \
135 stack->item.pos_add_next = (_pos_add_next); \
136 stack->item.str_byte = (_str_byte); \
137 BT_STACK_WIDE_IN(_str_wide); \
138 stack->item.state = (_state); \
139 stack->item.state_id = (_state_id); \
140 stack->item.next_c = (_next_c); \
141 memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \
142 BT_STACK_MBSTATE_IN; \
143 } \
144 while (/*CONSTCOND*/0)
145
146 #define BT_STACK_POP() \
147 do \
148 { \
149 assert(stack->prev); \
150 pos = stack->item.pos; \
151 pos_add_next = stack->item.pos_add_next; \
152 str_byte = stack->item.str_byte; \
153 BT_STACK_WIDE_OUT; \
154 state = stack->item.state; \
155 next_c = stack->item.next_c; \
156 memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \
157 BT_STACK_MBSTATE_OUT; \
158 stack = stack->prev; \
159 } \
160 while (/*CONSTCOND*/0)
161
162 #undef MIN
163 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
164
165 reg_errcode_t
166 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
167 int len, tre_str_type_t type, tre_tag_t *match_tags,
168 int eflags, int *match_end_ofs)
169 {
170 /* State variables required by GET_NEXT_WCHAR. */
171 tre_char_t prev_c = 0, next_c = 0;
172 const char *str_byte = string;
173 int pos = 0;
174 unsigned int pos_add_next = 1;
175 const wchar_t *str_wide = string;
176 mbstate_t mbstate;
177 int reg_notbol = eflags & REG_NOTBOL;
178 int reg_noteol = eflags & REG_NOTEOL;
179 int reg_newline = tnfa->cflags & REG_NEWLINE;
180 int i;
181
182 /* These are used to remember the necessary values of the above
183 variables to return to the position where the current search
184 started from. */
185 int next_c_start;
186 const char *str_byte_start;
187 int pos_start = -1;
188 const wchar_t *str_wide_start;
189 mbstate_t mbstate_start;
190
191 /* End offset of best match so far, or -1 if no match found yet. */
192 int match_eo = -1;
193 /* Tag arrays. */
194 int *next_tags;
195 tre_tag_t *tags = NULL;
196 /* Current TNFA state. */
197 tre_tnfa_transition_t *state;
198 int *states_seen = NULL;
199
200 /* Memory allocator to for allocating the backtracking stack. */
201 tre_mem_t mem = tre_bt_mem_new();
202
203 /* The backtracking stack. */
204 tre_backtrack_t stack;
205
206 tre_tnfa_transition_t *trans_i;
207 regmatch_t *pmatch = NULL;
208 reg_errcode_t ret;
209
210 int num_tags = tnfa->num_tags;
211 int touch = 1;
212 char *buf;
213 int tbytes;
214
215 memset(&mbstate, '\0', sizeof(mbstate));
216 buf = NULL;
217
218 if (!mem)
219 return REG_ESPACE;
220 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
221 if (!stack)
222 {
223 ret = REG_ESPACE;
224 goto error_exit;
225 }
226 stack->prev = NULL;
227 stack->next = NULL;
228
229 DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
230 DPRINT(("len = %d\n", len));
231
232 {
233 int pbytes, sbytes, total_bytes;
234 char *tmp_buf;
235 /* Compute the length of the block we need. */
236 tbytes = sizeof(*tags) * num_tags;
237 pbytes = sizeof(*pmatch) * tnfa->num_submatches;
238 sbytes = sizeof(*states_seen) * tnfa->num_states;
239 total_bytes =
240 (sizeof(long) - 1) * 2 /* for alignment paddings */
241 + tbytes + pbytes + sbytes;
242
243 DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes));
244 /* Allocate the memory. */
245 buf = malloc((unsigned)total_bytes);
246 if (buf == NULL)
247 return REG_ESPACE;
248
249 /* Get the various pointers within tmp_buf (properly aligned). */
250 tags = (void *)buf;
251 tmp_buf = buf + tbytes;
252 tmp_buf += ALIGN(tmp_buf, long);
253 pmatch = (void *)tmp_buf;
254 tmp_buf += pbytes;
255 tmp_buf += ALIGN(tmp_buf, long);
256 states_seen = (void *)tmp_buf;
257 }
258
259 retry:
260 {
261 memset(tags, 0, num_tags * sizeof(*tags));
262 if (match_tags)
263 memset(match_tags, 0, num_tags * sizeof(*match_tags));
264 for (i = 0; i < tnfa->num_states; i++)
265 states_seen[i] = 0;
266 }
267
268 state = NULL;
269 pos = pos_start;
270 GET_NEXT_WCHAR();
271 pos_start = pos;
272 next_c_start = next_c;
273 str_byte_start = str_byte;
274 str_wide_start = str_wide;
275 mbstate_start = mbstate;
276
277 /* Handle initial states. */
278 next_tags = NULL;
279 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
280 {
281 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
282 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
283 {
284 DPRINT(("assert failed\n"));
285 continue;
286 }
287 if (state == NULL)
288 {
289 /* Start from this state. */
290 state = trans_i->state;
291 next_tags = trans_i->tags;
292 }
293 else
294 {
295 /* Backtrack to this state. */
296 DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
297 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state,
298 trans_i->state_id, next_c, tags, mbstate);
299 {
300 int *tmp = trans_i->tags;
301 if (tmp)
302 {
303 while (*tmp >= 0)
304 tre_tag_set(stack->item.tags, *tmp++, pos, touch);
305 touch++;
306 }
307 }
308 }
309 }
310
311 if (next_tags)
312 {
313 for (; *next_tags >= 0; next_tags++)
314 tre_tag_set(tags, *next_tags, pos, touch);
315 touch++;
316 }
317
318 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
319 DPRINT(("pos:chr/code | state and tags\n"));
320 DPRINT(("-------------+------------------------------------------------\n"));
321
322 if (state == NULL)
323 goto backtrack;
324
325 while (/*CONSTCOND*/1)
326 {
327 tre_tnfa_transition_t *next_state;
328 int empty_br_match;
329
330 DPRINT(("start loop\n"));
331
332 if (match_eo >= 0 && tnfa->num_minimals)
333 {
334 int skip = 0;
335 #ifdef TRE_DEBUG
336 DPRINT(("Checking minimal conditions: match_eo=%d match_tags=",
337 match_eo));
338 tre_print_tags(match_tags, tnfa->num_tags);
339 DPRINT(("\n"));
340 #endif /* TRE_DEBUG */
341 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
342 {
343 int end = tnfa->minimal_tags[i];
344 int start = tnfa->minimal_tags[i + 1];
345 DPRINT((" Minimal start %d, end %d\n", start, end));
346 if (tre_minimal_tag_order(start, end, match_tags, tags) > 0)
347 {
348 skip = 1;
349 break;
350 }
351 }
352 if (!skip)
353 {
354 #ifdef TRE_DEBUG
355 DPRINT((" Keeping tags="));
356 tre_print_tags(tags, tnfa->num_tags);
357 DPRINT(("\n"));
358 #endif /* TRE_DEBUG */
359 }
360 else
361 {
362 #ifdef TRE_DEBUG
363 DPRINT((" Throwing out tags="));
364 tre_print_tags(tags, tnfa->num_tags);
365 DPRINT(("\n"));
366 #endif /* TRE_DEBUG */
367 goto backtrack;
368 }
369 }
370
371 if (state == tnfa->final)
372 {
373 DPRINT((" match found, match_eo=%d pos=%d\n", match_eo, pos));
374
375 if (match_eo >= 0 && tnfa->num_minimals)
376 {
377 int compare = 0;
378 #ifdef TRE_DEBUG
379 DPRINT(("Checking minimal conditions: match_eo=%d "
380 "match_tags=", match_eo));
381 tre_print_tags(match_tags, tnfa->num_tags);
382 DPRINT(("\n"));
383 #endif /* TRE_DEBUG */
384 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
385 {
386 int end = tnfa->minimal_tags[i];
387 int start = tnfa->minimal_tags[i + 1];
388 DPRINT((" Minimal start %d, end %d\n", start, end));
389 if ((compare = tre_minimal_tag_order(start, end,
390 match_tags, tags)) != 0)
391 break;
392 }
393 if (compare > 0)
394 {
395 #ifdef TRE_DEBUG
396 DPRINT((" Throwing out new match, tags="));
397 tre_print_tags(tags, tnfa->num_tags);
398 DPRINT(("\n"));
399 #endif /* TRE_DEBUG */
400 goto backtrack;
401 }
402 else if (compare < 0)
403 {
404 #ifdef TRE_DEBUG
405 DPRINT((" Throwing out old match, tags="));
406 tre_print_tags(match_tags, tnfa->num_tags);
407 DPRINT(("\n"));
408 #endif /* TRE_DEBUG */
409 match_eo = -1;
410 }
411 }
412
413 if (match_eo < pos
414 || (match_eo == pos
415 && match_tags
416 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
417 tags, match_tags)))
418 {
419 /* This match wins the previous match. */
420 #ifdef TRE_DEBUG
421 DPRINT((" win previous tags="));
422 tre_print_tags(tags, tnfa->num_tags);
423 DPRINT(("\n"));
424 #endif /* TRE_DEBUG */
425 match_eo = pos;
426 if (match_tags)
427 memcpy(match_tags, tags, num_tags * sizeof(*tags));
428 }
429 /* Our TNFAs never have transitions leaving from the final state,
430 so we jump right to backtracking. */
431 goto backtrack;
432 }
433
434 #ifdef TRE_DEBUG
435 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
436 state));
437 tre_print_tags(tags, tnfa->num_tags);
438 DPRINT(("\n"));
439 #endif /* TRE_DEBUG */
440
441 /* Go to the next character in the input string. */
442 empty_br_match = 0;
443 trans_i = state;
444 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
445 {
446 /* This is a back reference state. All transitions leaving from
447 this state have the same back reference "assertion". Instead
448 of reading the next character, we match the back reference. */
449 int so, eo, bt = trans_i->u.backref;
450 int bt_len;
451 int result;
452
453 DPRINT((" should match back reference %d\n", bt));
454 /* Get the substring we need to match against. Remember to
455 turn off REG_NOSUB temporarily. */
456 ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
457 tnfa, tags, pos);
458 if (ret != REG_OK) goto error_exit;
459 so = pmatch[bt].rm_so;
460 eo = pmatch[bt].rm_eo;
461 bt_len = eo - so;
462
463 #ifdef TRE_DEBUG
464 {
465 int slen;
466 if (len < 0)
467 slen = bt_len;
468 else
469 slen = MIN(bt_len, len - pos);
470
471 if (type == STR_BYTE)
472 {
473 DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n",
474 bt_len, so, eo, bt_len, (char*)string + so));
475 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
476 }
477 else if (type == STR_WIDE)
478 {
479 DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF "'\n",
480 bt_len, so, eo, bt_len, (wchar_t*)string + so));
481 DPRINT((" current string is '%.*" STRF "'\n",
482 slen, str_wide - 1));
483 }
484 }
485 #endif
486
487 if (so < 0)
488 {
489 result = 1; /* Back reference of nomatch doesn't match */
490 }
491 else if (len < 0)
492 {
493 if (type == STR_WIDE)
494 result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
495 (size_t)bt_len);
496 else
497 result = strncmp((const char*)string + so, str_byte - 1,
498 (size_t)bt_len);
499 }
500 else if (len - pos < bt_len)
501 result = 1;
502 else if (type == STR_WIDE)
503 result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
504 (size_t)bt_len);
505 else
506 result = memcmp((const char*)string + so, str_byte - 1,
507 (size_t)bt_len);
508
509 if (result == 0)
510 {
511 /* Back reference matched. Check for infinite loop. */
512 if (bt_len == 0)
513 empty_br_match = 1;
514 if (empty_br_match && states_seen[trans_i->state_id])
515 {
516 DPRINT((" avoid loop\n"));
517 goto backtrack;
518 }
519
520 states_seen[trans_i->state_id] = empty_br_match;
521
522 /* Advance in input string and resync `prev_c', `next_c'
523 and pos. */
524 DPRINT((" back reference matched\n"));
525 str_byte += bt_len - 1;
526 str_wide += bt_len - 1;
527 pos += bt_len - 1;
528 GET_NEXT_WCHAR();
529 DPRINT((" pos now %d\n", pos));
530 }
531 else
532 {
533 DPRINT((" back reference did not match\n"));
534 goto backtrack;
535 }
536 }
537 else
538 {
539 /* Check for end of string. */
540 if (len < 0)
541 {
542 if (next_c == L'\0')
543 goto backtrack;
544 }
545 else
546 {
547 if (pos >= len)
548 goto backtrack;
549 }
550
551 /* Read the next character. */
552 GET_NEXT_WCHAR();
553 }
554
555 next_state = NULL;
556 for (trans_i = state; trans_i->state; trans_i++)
557 {
558 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
559 trans_i->code_min, trans_i->code_max,
560 trans_i->code_min, trans_i->code_max,
561 trans_i->assertions, trans_i->state_id));
562 if (trans_i->code_min <= (tre_cint_t)prev_c
563 && trans_i->code_max >= (tre_cint_t)prev_c)
564 {
565 if (trans_i->assertions
566 && (CHECK_ASSERTIONS(trans_i->assertions)
567 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
568 {
569 DPRINT((" assertion failed\n"));
570 continue;
571 }
572
573 if (next_state == NULL)
574 {
575 /* First matching transition. */
576 DPRINT((" Next state is %d\n", trans_i->state_id));
577 next_state = trans_i->state;
578 next_tags = trans_i->tags;
579 }
580 else
581 {
582 /* Second matching transition. We may need to backtrack here
583 to take this transition instead of the first one, so we
584 push this transition in the backtracking stack so we can
585 jump back here if needed. */
586 DPRINT((" saving state %d for backtracking\n",
587 trans_i->state_id));
588 BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide,
589 trans_i->state, trans_i->state_id, next_c,
590 tags, mbstate);
591 {
592 int *tmp;
593 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
594 tre_tag_set(stack->item.tags, *tmp, pos, touch);
595 touch++;
596 }
597 #if 0 /* XXX - it's important not to look at all transitions here to keep
598 the stack small! */
599 break;
600 #endif
601 }
602 }
603 }
604
605 if (next_state != NULL)
606 {
607 /* Matching transitions were found. Take the first one. */
608 state = next_state;
609
610 /* Update the tag values. */
611 if (next_tags)
612 {
613 while (*next_tags >= 0)
614 tre_tag_set(tags, *next_tags++, pos, touch);
615 touch++;
616 }
617 }
618 else
619 {
620 backtrack:
621 /* A matching transition was not found. Try to backtrack. */
622 if (stack->prev)
623 {
624 DPRINT((" backtracking\n"));
625 if (stack->item.state->assertions & ASSERT_BACKREF)
626 {
627 DPRINT((" states_seen[%d] = 0\n",
628 stack->item.state_id));
629 states_seen[stack->item.state_id] = 0;
630 }
631
632 BT_STACK_POP();
633 }
634 else if (match_eo < 0)
635 {
636 /* Try starting from a later position in the input string. */
637 /* Check for end of string. */
638 if (pos == pos_start)
639 {
640 if (len < 0)
641 {
642 if (next_c == L'\0')
643 {
644 DPRINT(("end of string.\n"));
645 break;
646 }
647 }
648 else
649 {
650 if (pos >= len)
651 {
652 DPRINT(("end of string.\n"));
653 break;
654 }
655 }
656 }
657 DPRINT(("restarting from next start position\n"));
658 next_c = next_c_start;
659 mbstate = mbstate_start;
660 str_byte = str_byte_start;
661 str_wide = str_wide_start;
662 goto retry;
663 }
664 else
665 {
666 DPRINT(("finished\n"));
667 break;
668 }
669 }
670 }
671
672 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
673 *match_end_ofs = match_eo;
674
675 error_exit:
676 tre_bt_mem_destroy(mem);
677 if (buf)
678 free(buf);
679
680 return ret;
681 }