1 /* 2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 tre-match-parallel.c - TRE parallel regex matching engine 31 */ 32 33 /* 34 This algorithm searches for matches basically by reading characters 35 in the searched string one by one, starting at the beginning. All 36 matching paths in the TNFA are traversed in parallel. When two or 37 more paths reach the same state, exactly one is chosen according to 38 tag ordering rules; if returning submatches is not required it does 39 not matter which path is chosen. 40 41 The worst case time required for finding the leftmost and longest 42 match, or determining that there is no match, is always linearly 43 dependent on the length of the text being searched. 44 45 This algorithm cannot handle TNFAs with back referencing nodes. 46 See `tre-match-backtrack.c'. 47 */ 48 49 #include <assert.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <wchar.h> 53 #include <wctype.h> 54 55 #include "tre-internal.h" 56 #include "tre-match-utils.h" 57 #include "tre.h" 58 59 typedef struct { 60 tre_tnfa_transition_t *state; 61 tre_tag_t *tags; 62 } tre_tnfa_reach_t; 63 64 typedef struct { 65 int pos; 66 tre_tag_t **tags; 67 } tre_reach_pos_t; 68 69 #ifdef TRE_DEBUG 70 static void 71 tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags) 72 { 73 DPRINT((" %p", (void *)state)); 74 if (num_tags > 0) 75 { 76 DPRINT(("/")); 77 tre_print_tags(tags, num_tags); 78 } 79 } 80 81 static void 82 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) 83 { 84 while (reach->state != NULL) 85 { 86 tre_print_reach1(reach->state, reach->tags, num_tags); 87 reach++; 88 } 89 DPRINT(("\n")); 90 91 } 92 #endif /* TRE_DEBUG */ 93 94 reg_errcode_t 95 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 96 tre_str_type_t type, tre_tag_t *match_tags, int eflags, 97 int *match_end_ofs) 98 { 99 /* State variables required by GET_NEXT_WCHAR. */ 100 tre_char_t prev_c = 0, next_c = 0; 101 const char *str_byte = string; 102 int pos = -1; 103 unsigned int pos_add_next = 1; 104 const wchar_t *str_wide = string; 105 mbstate_t mbstate; 106 int reg_notbol = eflags & REG_NOTBOL; 107 int reg_noteol = eflags & REG_NOTEOL; 108 int reg_newline = tnfa->cflags & REG_NEWLINE; 109 110 char *buf; 111 tre_tnfa_transition_t *trans_i; 112 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; 113 tre_reach_pos_t *reach_pos; 114 int *tag_i; 115 int num_tags, i; 116 117 int match_eo = -1; /* end offset of match (-1 if no match found yet) */ 118 #ifdef TRE_DEBUG 119 int once; 120 #endif /* TRE_DEBUG */ 121 tre_tag_t *tmp_tags = NULL; 122 tre_tag_t *tmp_iptr; 123 int tbytes; 124 int touch = 1; 125 126 memset(&mbstate, '\0', sizeof(mbstate)); 127 128 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); 129 130 if (!match_tags) 131 num_tags = 0; 132 else 133 num_tags = tnfa->num_tags; 134 135 /* Allocate memory for temporary data required for matching. This needs to 136 be done for every matching operation to be thread safe. This allocates 137 everything in a single large block from the stack frame using alloca() 138 or with malloc() if alloca is unavailable. */ 139 { 140 int rbytes, pbytes, total_bytes; 141 char *tmp_buf; 142 /* Compute the length of the block we need. */ 143 tbytes = sizeof(*tmp_tags) * num_tags; 144 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); 145 pbytes = sizeof(*reach_pos) * tnfa->num_states; 146 total_bytes = 147 (sizeof(long) - 1) * 4 /* for alignment paddings */ 148 + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes; 149 150 DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes)); 151 /* Allocate the memory. */ 152 buf = malloc((unsigned)total_bytes); 153 if (buf == NULL) 154 return REG_ESPACE; 155 memset(buf, 0, (size_t)total_bytes); 156 157 /* Get the various pointers within tmp_buf (properly aligned). */ 158 tmp_tags = (void *)buf; 159 tmp_buf = buf + tbytes; 160 tmp_buf += ALIGN(tmp_buf, long); 161 reach_next = (void *)tmp_buf; 162 tmp_buf += rbytes; 163 tmp_buf += ALIGN(tmp_buf, long); 164 reach = (void *)tmp_buf; 165 tmp_buf += rbytes; 166 tmp_buf += ALIGN(tmp_buf, long); 167 reach_pos = (void *)tmp_buf; 168 tmp_buf += pbytes; 169 tmp_buf += ALIGN(tmp_buf, long); 170 for (i = 0; i < tnfa->num_states; i++) 171 { 172 reach[i].tags = (void *)tmp_buf; 173 tmp_buf += tbytes; 174 reach_next[i].tags = (void *)tmp_buf; 175 tmp_buf += tbytes; 176 } 177 } 178 179 for (i = 0; i < tnfa->num_states; i++) 180 reach_pos[i].pos = -1; 181 182 /* If only one character can start a match, find it first. */ 183 if (tnfa->first_char >= 0 && str_byte) 184 { 185 const char *orig_str = str_byte; 186 int first = tnfa->first_char; 187 int found_high_bit = 0; 188 189 if (type == STR_BYTE) 190 { 191 if (len >= 0) 192 str_byte = memchr(orig_str, first, (size_t)len); 193 else 194 str_byte = strchr(orig_str, first); 195 } 196 else if (type == STR_MBS) 197 { 198 /* 199 * If the match character is ASCII, try to match the character 200 * directly, but if a high bit character is found, we stop there. 201 */ 202 if (first < 0x80) 203 { 204 if (len >= 0) 205 { 206 int i; 207 for (i = 0; ; str_byte++, i++) 208 { 209 if (i >= len) 210 { 211 str_byte = NULL; 212 break; 213 } 214 if (*str_byte == first) 215 break; 216 if (*str_byte & 0x80) 217 { 218 found_high_bit = 1; 219 break; 220 } 221 } 222 } 223 else 224 { 225 for (; ; str_byte++) 226 { 227 if (!*str_byte) 228 { 229 str_byte = NULL; 230 break; 231 } 232 if (*str_byte == first) 233 break; 234 if (*str_byte & 0x80) 235 { 236 found_high_bit = 1; 237 break; 238 } 239 } 240 } 241 } 242 else 243 { 244 if (len >= 0) 245 { 246 int i; 247 for (i = 0; ; str_byte++, i++) 248 { 249 if (i >= len) 250 { 251 str_byte = NULL; 252 break; 253 } 254 if (*str_byte & 0x80) 255 { 256 found_high_bit = 1; 257 break; 258 } 259 } 260 } 261 else 262 { 263 for (; ; str_byte++) 264 { 265 if (!*str_byte) 266 { 267 str_byte = NULL; 268 break; 269 } 270 if (*str_byte & 0x80) 271 { 272 found_high_bit = 1; 273 break; 274 } 275 } 276 } 277 } 278 } 279 if (str_byte == NULL) 280 { 281 if (buf) 282 free(buf); 283 return REG_NOMATCH; 284 } 285 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); 286 if (!found_high_bit) 287 { 288 if (str_byte >= orig_str + 1) 289 prev_c = (unsigned char)*(str_byte - 1); 290 next_c = (unsigned char)*str_byte; 291 pos = str_byte - orig_str; 292 if (len < 0 || pos < len) 293 str_byte++; 294 } 295 else 296 { 297 if (str_byte == orig_str) 298 goto no_first_optimization; 299 /* 300 * Back up one character, fix up the position, then call 301 * GET_NEXT_WCHAR() to process the multibyte character. 302 */ 303 /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */ 304 next_c = (unsigned char)*(str_byte - 1); 305 pos = (str_byte - 1) - orig_str; 306 GET_NEXT_WCHAR(); 307 } 308 } 309 else 310 { 311 no_first_optimization: 312 GET_NEXT_WCHAR(); 313 pos = 0; 314 } 315 316 DPRINT(("length: %d\n", len)); 317 DPRINT(("pos:chr/code | states and tags\n")); 318 DPRINT(("-------------+------------------------------------------------\n")); 319 320 reach_next_i = reach_next; 321 while (/*CONSTCOND*/1) 322 { 323 /* If no match found yet, add the initial states to `reach_next'. */ 324 if (match_eo < 0) 325 { 326 DPRINT((" init >")); 327 trans_i = tnfa->initial; 328 while (trans_i->state != NULL) 329 { 330 if (reach_pos[trans_i->state_id].pos < pos) 331 { 332 if (trans_i->assertions 333 && CHECK_ASSERTIONS(trans_i->assertions)) 334 { 335 DPRINT(("assertion failed\n")); 336 trans_i++; 337 continue; 338 } 339 340 DPRINT((" %p", (void *)trans_i->state)); 341 reach_next_i->state = trans_i->state; 342 memset(reach_next_i->tags, 0, tbytes); 343 tag_i = trans_i->tags; 344 if (tag_i) 345 { 346 while (*tag_i >= 0) 347 { 348 if (*tag_i < num_tags) 349 tre_tag_set(reach_next_i->tags, *tag_i, pos, touch); 350 tag_i++; 351 } 352 touch++; 353 } 354 if (reach_next_i->state == tnfa->final) 355 { 356 DPRINT((" found empty match\n")); 357 match_eo = pos; 358 memcpy(match_tags, reach_next_i->tags, tbytes); 359 } 360 reach_pos[trans_i->state_id].pos = pos; 361 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 362 reach_next_i++; 363 } 364 trans_i++; 365 } 366 DPRINT(("\n")); 367 reach_next_i->state = NULL; 368 } 369 else 370 { 371 if (num_tags == 0 || reach_next_i == reach_next) 372 /* We have found a match. */ 373 break; 374 } 375 376 /* Check for end of string. */ 377 if (len < 0) 378 { 379 if (next_c == L'\0') 380 break; 381 } 382 else 383 { 384 if (pos >= len) 385 break; 386 } 387 388 GET_NEXT_WCHAR(); 389 390 #ifdef TRE_DEBUG 391 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c)); 392 tre_print_reach(tnfa, reach_next, num_tags); 393 #endif /* TRE_DEBUG */ 394 395 /* Swap `reach' and `reach_next'. */ 396 reach_i = reach; 397 reach = reach_next; 398 reach_next = reach_i; 399 400 #ifdef TRE_DEBUG 401 once = 0; 402 #endif /* TRE_DEBUG */ 403 404 /* For each state in `reach' see if there is a transition leaving with 405 the current input symbol to a state not yet in `reach_next', and 406 add the destination states to `reach_next'. */ 407 reach_next_i = reach_next; 408 for (reach_i = reach; reach_i->state; reach_i++) 409 { 410 for (trans_i = reach_i->state; trans_i->state; trans_i++) 411 { 412 /* Does this transition match the input symbol? */ 413 if (trans_i->code_min <= (tre_cint_t)prev_c && 414 trans_i->code_max >= (tre_cint_t)prev_c) 415 { 416 if (trans_i->assertions 417 && (CHECK_ASSERTIONS(trans_i->assertions) 418 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) 419 { 420 DPRINT(("assertion failed\n")); 421 continue; 422 } 423 424 /* Compute the tags after this transition. */ 425 memcpy(tmp_tags, reach_i->tags, tbytes); 426 tag_i = trans_i->tags; 427 if (tag_i != NULL) 428 { 429 while (*tag_i >= 0) 430 { 431 if (*tag_i < num_tags) 432 tre_tag_set(tmp_tags, *tag_i, pos, touch); 433 tag_i++; 434 } 435 touch++; 436 } 437 438 /* For each new transition, weed out those that don't 439 fulfill the minimal matching conditions. */ 440 if (tnfa->num_minimals && match_eo >= 0) 441 { 442 int skip = 0; 443 #ifdef TRE_DEBUG 444 if (!once) 445 { 446 DPRINT(("Checking minimal conditions: match_eo=%d " 447 "match_tags=", match_eo)); 448 tre_print_tags(match_tags, num_tags); 449 DPRINT(("\n")); 450 once++; 451 } 452 #endif /* TRE_DEBUG */ 453 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) 454 { 455 int end = tnfa->minimal_tags[i]; 456 int start = tnfa->minimal_tags[i + 1]; 457 DPRINT((" Minimal start %d, end %d\n", start, end)); 458 if (tre_minimal_tag_order(start, end, match_tags, 459 tmp_tags) > 0) 460 { 461 skip = 1; 462 break; 463 } 464 } 465 if (skip) 466 { 467 #ifdef TRE_DEBUG 468 DPRINT((" Throwing out")); 469 tre_print_reach1(reach_i->state, tmp_tags, 470 num_tags); 471 DPRINT(("\n")); 472 #endif /* TRE_DEBUG */ 473 continue; 474 } 475 } 476 477 if (reach_pos[trans_i->state_id].pos < pos) 478 { 479 /* Found an unvisited node. */ 480 reach_next_i->state = trans_i->state; 481 tmp_iptr = reach_next_i->tags; 482 reach_next_i->tags = tmp_tags; 483 tmp_tags = tmp_iptr; 484 reach_pos[trans_i->state_id].pos = pos; 485 reach_pos[trans_i->state_id].tags = &reach_next_i->tags; 486 487 if (reach_next_i->state == tnfa->final 488 && (match_eo == -1 489 || (num_tags > 0 490 && tre_tag_get(reach_next_i->tags, 0) <= 491 tre_tag_get(match_tags, 0)))) 492 { 493 #ifdef TRE_DEBUG 494 DPRINT((" found match")); 495 tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags); 496 DPRINT(("\n")); 497 #endif /* TRE_DEBUG */ 498 match_eo = pos; 499 memcpy(match_tags, reach_next_i->tags, tbytes); 500 } 501 reach_next_i++; 502 503 } 504 else 505 { 506 assert(reach_pos[trans_i->state_id].pos == pos); 507 /* Another path has also reached this state. We choose 508 the winner by examining the tag values for both 509 paths. */ 510 if (tre_tag_order(num_tags, tnfa->tag_directions, 511 tmp_tags, 512 *reach_pos[trans_i->state_id].tags)) 513 { 514 /* The new path wins. */ 515 tmp_iptr = *reach_pos[trans_i->state_id].tags; 516 *reach_pos[trans_i->state_id].tags = tmp_tags; 517 if (trans_i->state == tnfa->final) 518 { 519 #ifdef TRE_DEBUG 520 DPRINT((" found better match")); 521 tre_print_reach1(trans_i->state, tmp_tags, num_tags); 522 DPRINT(("\n")); 523 #endif /* TRE_DEBUG */ 524 match_eo = pos; 525 memcpy(match_tags, tmp_tags, tbytes); 526 } 527 tmp_tags = tmp_iptr; 528 } 529 } 530 } 531 } 532 } 533 reach_next_i->state = NULL; 534 } 535 536 DPRINT(("match end offset = %d\n", match_eo)); 537 538 *match_end_ofs = match_eo; 539 #ifdef TRE_DEBUG 540 if (match_tags) 541 { 542 DPRINT(("Winning tags=")); 543 tre_print_tags_all(match_tags, num_tags); 544 DPRINT((" touch=%d\n", touch)); 545 } 546 #endif /* TRE_DEBUG */ 547 548 if (buf) 549 free(buf); 550 551 return match_eo >= 0 ? REG_OK : REG_NOMATCH; 552 }