1 /*
2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 tre-match-parallel.c - TRE parallel regex matching engine
31 */
32
33 /*
34 This algorithm searches for matches basically by reading characters
35 in the searched string one by one, starting at the beginning. All
36 matching paths in the TNFA are traversed in parallel. When two or
37 more paths reach the same state, exactly one is chosen according to
38 tag ordering rules; if returning submatches is not required it does
39 not matter which path is chosen.
40
41 The worst case time required for finding the leftmost and longest
42 match, or determining that there is no match, is always linearly
43 dependent on the length of the text being searched.
44
45 This algorithm cannot handle TNFAs with back referencing nodes.
46 See `tre-match-backtrack.c'.
47 */
48
49 #include <assert.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <wchar.h>
53 #include <wctype.h>
54
55 #include "tre-internal.h"
56 #include "tre-match-utils.h"
57 #include "tre.h"
58
59 typedef struct {
60 tre_tnfa_transition_t *state;
61 tre_tag_t *tags;
62 } tre_tnfa_reach_t;
63
64 typedef struct {
65 int pos;
66 tre_tag_t **tags;
67 } tre_reach_pos_t;
68
69 #ifdef TRE_DEBUG
70 static void
71 tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags)
72 {
73 DPRINT((" %p", (void *)state));
74 if (num_tags > 0)
75 {
76 DPRINT(("/"));
77 tre_print_tags(tags, num_tags);
78 }
79 }
80
81 static void
82 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
83 {
84 while (reach->state != NULL)
85 {
86 tre_print_reach1(reach->state, reach->tags, num_tags);
87 reach++;
88 }
89 DPRINT(("\n"));
90
91 }
92 #endif /* TRE_DEBUG */
93
94 reg_errcode_t
95 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
96 tre_str_type_t type, tre_tag_t *match_tags, int eflags,
97 int *match_end_ofs)
98 {
99 /* State variables required by GET_NEXT_WCHAR. */
100 tre_char_t prev_c = 0, next_c = 0;
101 const char *str_byte = string;
102 int pos = -1;
103 unsigned int pos_add_next = 1;
104 const wchar_t *str_wide = string;
105 mbstate_t mbstate;
106 int reg_notbol = eflags & REG_NOTBOL;
107 int reg_noteol = eflags & REG_NOTEOL;
108 int reg_newline = tnfa->cflags & REG_NEWLINE;
109
110 char *buf;
111 tre_tnfa_transition_t *trans_i;
112 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
113 tre_reach_pos_t *reach_pos;
114 int *tag_i;
115 int num_tags, i;
116
117 int match_eo = -1; /* end offset of match (-1 if no match found yet) */
118 #ifdef TRE_DEBUG
119 int once;
120 #endif /* TRE_DEBUG */
121 tre_tag_t *tmp_tags = NULL;
122 tre_tag_t *tmp_iptr;
123 int tbytes;
124 int touch = 1;
125
126 memset(&mbstate, '\0', sizeof(mbstate));
127
128 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
129
130 if (!match_tags)
131 num_tags = 0;
132 else
133 num_tags = tnfa->num_tags;
134
135 /* Allocate memory for temporary data required for matching. This needs to
136 be done for every matching operation to be thread safe. This allocates
137 everything in a single large block from the stack frame using alloca()
138 or with malloc() if alloca is unavailable. */
139 {
140 int rbytes, pbytes, total_bytes;
141 char *tmp_buf;
142 /* Compute the length of the block we need. */
143 tbytes = sizeof(*tmp_tags) * num_tags;
144 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
145 pbytes = sizeof(*reach_pos) * tnfa->num_states;
146 total_bytes =
147 (sizeof(long) - 1) * 4 /* for alignment paddings */
148 + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes;
149
150 DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes));
151 /* Allocate the memory. */
152 buf = malloc((unsigned)total_bytes);
153 if (buf == NULL)
154 return REG_ESPACE;
155 memset(buf, 0, (size_t)total_bytes);
156
157 /* Get the various pointers within tmp_buf (properly aligned). */
158 tmp_tags = (void *)buf;
159 tmp_buf = buf + tbytes;
160 tmp_buf += ALIGN(tmp_buf, long);
161 reach_next = (void *)tmp_buf;
162 tmp_buf += rbytes;
163 tmp_buf += ALIGN(tmp_buf, long);
164 reach = (void *)tmp_buf;
165 tmp_buf += rbytes;
166 tmp_buf += ALIGN(tmp_buf, long);
167 reach_pos = (void *)tmp_buf;
168 tmp_buf += pbytes;
169 tmp_buf += ALIGN(tmp_buf, long);
170 for (i = 0; i < tnfa->num_states; i++)
171 {
172 reach[i].tags = (void *)tmp_buf;
173 tmp_buf += tbytes;
174 reach_next[i].tags = (void *)tmp_buf;
175 tmp_buf += tbytes;
176 }
177 }
178
179 for (i = 0; i < tnfa->num_states; i++)
180 reach_pos[i].pos = -1;
181
182 /* If only one character can start a match, find it first. */
183 if (tnfa->first_char >= 0 && str_byte)
184 {
185 const char *orig_str = str_byte;
186 int first = tnfa->first_char;
187 int found_high_bit = 0;
188
189 if (type == STR_BYTE)
190 {
191 if (len >= 0)
192 str_byte = memchr(orig_str, first, (size_t)len);
193 else
194 str_byte = strchr(orig_str, first);
195 }
196 else if (type == STR_MBS)
197 {
198 /*
199 * If the match character is ASCII, try to match the character
200 * directly, but if a high bit character is found, we stop there.
201 */
202 if (first < 0x80)
203 {
204 if (len >= 0)
205 {
206 int i;
207 for (i = 0; ; str_byte++, i++)
208 {
209 if (i >= len)
210 {
211 str_byte = NULL;
212 break;
213 }
214 if (*str_byte == first)
215 break;
216 if (*str_byte & 0x80)
217 {
218 found_high_bit = 1;
219 break;
220 }
221 }
222 }
223 else
224 {
225 for (; ; str_byte++)
226 {
227 if (!*str_byte)
228 {
229 str_byte = NULL;
230 break;
231 }
232 if (*str_byte == first)
233 break;
234 if (*str_byte & 0x80)
235 {
236 found_high_bit = 1;
237 break;
238 }
239 }
240 }
241 }
242 else
243 {
244 if (len >= 0)
245 {
246 int i;
247 for (i = 0; ; str_byte++, i++)
248 {
249 if (i >= len)
250 {
251 str_byte = NULL;
252 break;
253 }
254 if (*str_byte & 0x80)
255 {
256 found_high_bit = 1;
257 break;
258 }
259 }
260 }
261 else
262 {
263 for (; ; str_byte++)
264 {
265 if (!*str_byte)
266 {
267 str_byte = NULL;
268 break;
269 }
270 if (*str_byte & 0x80)
271 {
272 found_high_bit = 1;
273 break;
274 }
275 }
276 }
277 }
278 }
279 if (str_byte == NULL)
280 {
281 if (buf)
282 free(buf);
283 return REG_NOMATCH;
284 }
285 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
286 if (!found_high_bit)
287 {
288 if (str_byte >= orig_str + 1)
289 prev_c = (unsigned char)*(str_byte - 1);
290 next_c = (unsigned char)*str_byte;
291 pos = str_byte - orig_str;
292 if (len < 0 || pos < len)
293 str_byte++;
294 }
295 else
296 {
297 if (str_byte == orig_str)
298 goto no_first_optimization;
299 /*
300 * Back up one character, fix up the position, then call
301 * GET_NEXT_WCHAR() to process the multibyte character.
302 */
303 /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */
304 next_c = (unsigned char)*(str_byte - 1);
305 pos = (str_byte - 1) - orig_str;
306 GET_NEXT_WCHAR();
307 }
308 }
309 else
310 {
311 no_first_optimization:
312 GET_NEXT_WCHAR();
313 pos = 0;
314 }
315
316 DPRINT(("length: %d\n", len));
317 DPRINT(("pos:chr/code | states and tags\n"));
318 DPRINT(("-------------+------------------------------------------------\n"));
319
320 reach_next_i = reach_next;
321 while (/*CONSTCOND*/1)
322 {
323 /* If no match found yet, add the initial states to `reach_next'. */
324 if (match_eo < 0)
325 {
326 DPRINT((" init >"));
327 trans_i = tnfa->initial;
328 while (trans_i->state != NULL)
329 {
330 if (reach_pos[trans_i->state_id].pos < pos)
331 {
332 if (trans_i->assertions
333 && CHECK_ASSERTIONS(trans_i->assertions))
334 {
335 DPRINT(("assertion failed\n"));
336 trans_i++;
337 continue;
338 }
339
340 DPRINT((" %p", (void *)trans_i->state));
341 reach_next_i->state = trans_i->state;
342 memset(reach_next_i->tags, 0, tbytes);
343 tag_i = trans_i->tags;
344 if (tag_i)
345 {
346 while (*tag_i >= 0)
347 {
348 if (*tag_i < num_tags)
349 tre_tag_set(reach_next_i->tags, *tag_i, pos, touch);
350 tag_i++;
351 }
352 touch++;
353 }
354 if (reach_next_i->state == tnfa->final)
355 {
356 DPRINT((" found empty match\n"));
357 match_eo = pos;
358 memcpy(match_tags, reach_next_i->tags, tbytes);
359 }
360 reach_pos[trans_i->state_id].pos = pos;
361 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
362 reach_next_i++;
363 }
364 trans_i++;
365 }
366 DPRINT(("\n"));
367 reach_next_i->state = NULL;
368 }
369 else
370 {
371 if (num_tags == 0 || reach_next_i == reach_next)
372 /* We have found a match. */
373 break;
374 }
375
376 /* Check for end of string. */
377 if (len < 0)
378 {
379 if (next_c == L'\0')
380 break;
381 }
382 else
383 {
384 if (pos >= len)
385 break;
386 }
387
388 GET_NEXT_WCHAR();
389
390 #ifdef TRE_DEBUG
391 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
392 tre_print_reach(tnfa, reach_next, num_tags);
393 #endif /* TRE_DEBUG */
394
395 /* Swap `reach' and `reach_next'. */
396 reach_i = reach;
397 reach = reach_next;
398 reach_next = reach_i;
399
400 #ifdef TRE_DEBUG
401 once = 0;
402 #endif /* TRE_DEBUG */
403
404 /* For each state in `reach' see if there is a transition leaving with
405 the current input symbol to a state not yet in `reach_next', and
406 add the destination states to `reach_next'. */
407 reach_next_i = reach_next;
408 for (reach_i = reach; reach_i->state; reach_i++)
409 {
410 for (trans_i = reach_i->state; trans_i->state; trans_i++)
411 {
412 /* Does this transition match the input symbol? */
413 if (trans_i->code_min <= (tre_cint_t)prev_c &&
414 trans_i->code_max >= (tre_cint_t)prev_c)
415 {
416 if (trans_i->assertions
417 && (CHECK_ASSERTIONS(trans_i->assertions)
418 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
419 {
420 DPRINT(("assertion failed\n"));
421 continue;
422 }
423
424 /* Compute the tags after this transition. */
425 memcpy(tmp_tags, reach_i->tags, tbytes);
426 tag_i = trans_i->tags;
427 if (tag_i != NULL)
428 {
429 while (*tag_i >= 0)
430 {
431 if (*tag_i < num_tags)
432 tre_tag_set(tmp_tags, *tag_i, pos, touch);
433 tag_i++;
434 }
435 touch++;
436 }
437
438 /* For each new transition, weed out those that don't
439 fulfill the minimal matching conditions. */
440 if (tnfa->num_minimals && match_eo >= 0)
441 {
442 int skip = 0;
443 #ifdef TRE_DEBUG
444 if (!once)
445 {
446 DPRINT(("Checking minimal conditions: match_eo=%d "
447 "match_tags=", match_eo));
448 tre_print_tags(match_tags, num_tags);
449 DPRINT(("\n"));
450 once++;
451 }
452 #endif /* TRE_DEBUG */
453 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
454 {
455 int end = tnfa->minimal_tags[i];
456 int start = tnfa->minimal_tags[i + 1];
457 DPRINT((" Minimal start %d, end %d\n", start, end));
458 if (tre_minimal_tag_order(start, end, match_tags,
459 tmp_tags) > 0)
460 {
461 skip = 1;
462 break;
463 }
464 }
465 if (skip)
466 {
467 #ifdef TRE_DEBUG
468 DPRINT((" Throwing out"));
469 tre_print_reach1(reach_i->state, tmp_tags,
470 num_tags);
471 DPRINT(("\n"));
472 #endif /* TRE_DEBUG */
473 continue;
474 }
475 }
476
477 if (reach_pos[trans_i->state_id].pos < pos)
478 {
479 /* Found an unvisited node. */
480 reach_next_i->state = trans_i->state;
481 tmp_iptr = reach_next_i->tags;
482 reach_next_i->tags = tmp_tags;
483 tmp_tags = tmp_iptr;
484 reach_pos[trans_i->state_id].pos = pos;
485 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
486
487 if (reach_next_i->state == tnfa->final
488 && (match_eo == -1
489 || (num_tags > 0
490 && tre_tag_get(reach_next_i->tags, 0) <=
491 tre_tag_get(match_tags, 0))))
492 {
493 #ifdef TRE_DEBUG
494 DPRINT((" found match"));
495 tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags);
496 DPRINT(("\n"));
497 #endif /* TRE_DEBUG */
498 match_eo = pos;
499 memcpy(match_tags, reach_next_i->tags, tbytes);
500 }
501 reach_next_i++;
502
503 }
504 else
505 {
506 assert(reach_pos[trans_i->state_id].pos == pos);
507 /* Another path has also reached this state. We choose
508 the winner by examining the tag values for both
509 paths. */
510 if (tre_tag_order(num_tags, tnfa->tag_directions,
511 tmp_tags,
512 *reach_pos[trans_i->state_id].tags))
513 {
514 /* The new path wins. */
515 tmp_iptr = *reach_pos[trans_i->state_id].tags;
516 *reach_pos[trans_i->state_id].tags = tmp_tags;
517 if (trans_i->state == tnfa->final)
518 {
519 #ifdef TRE_DEBUG
520 DPRINT((" found better match"));
521 tre_print_reach1(trans_i->state, tmp_tags, num_tags);
522 DPRINT(("\n"));
523 #endif /* TRE_DEBUG */
524 match_eo = pos;
525 memcpy(match_tags, tmp_tags, tbytes);
526 }
527 tmp_tags = tmp_iptr;
528 }
529 }
530 }
531 }
532 }
533 reach_next_i->state = NULL;
534 }
535
536 DPRINT(("match end offset = %d\n", match_eo));
537
538 *match_end_ofs = match_eo;
539 #ifdef TRE_DEBUG
540 if (match_tags)
541 {
542 DPRINT(("Winning tags="));
543 tre_print_tags_all(match_tags, num_tags);
544 DPRINT((" touch=%d\n", touch));
545 }
546 #endif /* TRE_DEBUG */
547
548 if (buf)
549 free(buf);
550
551 return match_eo >= 0 ? REG_OK : REG_NOMATCH;
552 }