1 /* 2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 tre-compile.c - TRE regex compiler 31 */ 32 33 /* 34 TODO: 35 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive 36 function calls. 37 */ 38 39 #include <stdio.h> 40 #include <assert.h> 41 #include <string.h> 42 #include <limits.h> 43 44 #include "tre-internal.h" 45 #include "tre-mem.h" 46 #include "tre-stack.h" 47 #include "tre-ast.h" 48 #include "tre-parse.h" 49 #include "tre-compile.h" 50 #include "tre.h" 51 52 /* 53 Algorithms to setup tags so that submatch addressing can be done. 54 */ 55 56 #ifdef TRE_DEBUG 57 static const char *tag_dir_str[] = { 58 "minimize", 59 "maximize", 60 "left-maximize" 61 }; 62 63 static const char _indent[] = " "; 64 65 static void 66 print_indent(int indent) 67 { 68 while (indent-- > 0) 69 DPRINT((_indent)); 70 } 71 72 static void print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, 73 int num_tags); 74 static void 75 print_last_match_branch_pre(tre_last_matched_branch_pre_t *branch, int indent, 76 int num_tags) 77 { 78 tre_last_matched_pre_t *u = branch->last_matched; 79 int n_last_matched = 0; 80 81 while (u) 82 { 83 n_last_matched++; 84 u = u->next; 85 } 86 87 print_indent(indent); 88 DPRINT(("BRANCH: tot_branches=%d tot_last_matched=%d tot_tags=%d\n", 89 branch->tot_branches, branch->tot_last_matched, branch->tot_tags)); 90 print_indent(indent); 91 DPRINT(("..n_last_matched=%d last_matched=%d\n", branch->n_last_matched, 92 n_last_matched)); 93 if (branch->n_last_matched != n_last_matched) 94 DPRINT(("*** mismatch between n_last_matched and unions ***\n")); 95 if (branch->cmp_tag > 0) 96 { 97 int i; 98 const char *sep = " tags="; 99 print_indent(indent); 100 DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags)); 101 for (i = 0; i < num_tags; i++) 102 if (bit_test(branch->tags, i)) 103 { 104 DPRINT(("%s%d", sep, i)); 105 sep = ","; 106 } 107 DPRINT(("\n")); 108 } 109 110 u = branch->last_matched; 111 indent++; 112 while (u) 113 { 114 print_last_matched_pre(u, indent, num_tags); 115 u = u->next; 116 } 117 } 118 119 static void 120 print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, int num_tags) 121 { 122 tre_last_matched_branch_pre_t *b = lm->branches; 123 int n_branches = 0; 124 125 while (b) 126 { 127 n_branches++; 128 b = b->next; 129 } 130 131 print_indent(indent); 132 DPRINT(("LAST_MATCHED: tot_branches=%d tot_last_matched=%d tot_tags=%d\n", 133 lm->tot_branches, lm->tot_last_matched, lm->tot_tags)); 134 print_indent(indent); 135 DPRINT(("..start_tag=%d n_branches=%d branches=%d\n", lm->start_tag, 136 lm->n_branches, n_branches)); 137 if (lm->n_branches != n_branches) 138 DPRINT(("*** mismatch between n and branches ***\n")); 139 140 b = lm->branches; 141 indent++; 142 while (b) 143 { 144 print_last_match_branch_pre(b, indent, num_tags); 145 b = b->next; 146 } 147 } 148 149 static void print_last_matched(tre_last_matched_t *lm, int indent); 150 static void 151 print_last_match_branch(tre_last_matched_branch_t *branch, int indent) 152 { 153 tre_last_matched_t *u; 154 int i; 155 156 print_indent(indent); 157 DPRINT(("BRANCH: n_last_matched=%d\n", branch->n_last_matched)); 158 if (branch->cmp_tag > 0) 159 { 160 print_indent(indent); 161 DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags)); 162 if (branch->n_tags > 0) 163 { 164 const char *sep = " tags="; 165 for (i = 0; i < branch->n_tags; i++) 166 { 167 DPRINT(("%s%d", sep, branch->tags[i])); 168 sep = ","; 169 } 170 } 171 DPRINT(("\n")); 172 } 173 174 u = branch->last_matched; 175 indent++; 176 for (i = branch->n_last_matched; i > 0; i--, u++) 177 print_last_matched(u, indent); 178 } 179 180 static void 181 print_last_matched(tre_last_matched_t *lm, int indent) 182 { 183 int i; 184 tre_last_matched_branch_t *b; 185 186 print_indent(indent); 187 DPRINT(("LAST_MATCHED: n_branches=%d start_tag=%d\n", lm->n_branches, 188 lm->start_tag)); 189 190 b = lm->branches; 191 indent++; 192 for (i = lm->n_branches; i > 0; i--, b++) 193 print_last_match_branch(b, indent); 194 } 195 #endif /* TRE_DEBUG */ 196 197 /* Merge the tre_last_matched_branch_pre_t of src into dst, creating a new 198 one if needed. If tag_id > 0, add that tag as well (a negative tag_id will 199 create an unset tre_last_matched_branch_pre_t. */ 200 static reg_errcode_t 201 tre_merge_branches(tre_mem_t mem, tre_ast_node_t *dst, tre_ast_node_t *src, 202 int tag_id, int num_tags) 203 { 204 tre_last_matched_branch_pre_t *db = dst->last_matched_branch; 205 tre_last_matched_branch_pre_t *sb = (src ? src->last_matched_branch : NULL); 206 207 if (db) 208 { 209 if (sb) 210 { 211 bitstr_t *l = db->tags; 212 bitstr_t *r = sb->tags; 213 int i = bitstr_size(num_tags); 214 215 while(i-- > 0) 216 *l++ |= *r++; 217 /* db and sb are the info from two parallel sub-trees, so the tags 218 must be mutually exclusive, and we can just add their numbers */ 219 db->n_tags += sb->n_tags; 220 db->tot_tags += sb->tot_tags; 221 if (db->last_matched) 222 { 223 if (sb->last_matched) 224 { 225 tre_last_matched_pre_t *u = db->last_matched; 226 227 while(u->next) 228 u = u->next; 229 u->next = sb->last_matched; 230 db->n_last_matched += sb->n_last_matched; 231 db->tot_branches += sb->tot_branches; 232 db->tot_last_matched += sb->tot_last_matched; 233 } 234 } 235 else if (sb->last_matched) 236 { 237 db->last_matched = sb->last_matched; 238 db->n_last_matched = sb->n_last_matched; 239 db->tot_branches = sb->tot_branches; 240 db->tot_last_matched = sb->tot_last_matched; 241 } 242 } 243 } 244 else 245 db = sb; 246 247 if (tag_id != 0) 248 { 249 if (!db) 250 { 251 db = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t) 252 + bitstr_size(num_tags)); 253 if (db == NULL) 254 return REG_ESPACE; 255 db->tot_branches = 1; 256 } 257 if (tag_id > 0) 258 { 259 /* tag_id is a new tag, and shouldn't exist in db's tags, 260 so we can always increment n_tags */ 261 bit_set(db->tags, tag_id); 262 db->n_tags++; 263 db->tot_tags++; 264 } 265 } 266 dst->last_matched_branch = db; 267 return REG_OK; 268 } 269 270 /* Inserts a catenation node to the root of the tree given in `node'. 271 As the left child a new tag with number `tag_id' to `node' is added, 272 and the right child is the old root. */ 273 static reg_errcode_t 274 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) 275 { 276 tre_catenation_t *c; 277 278 DPRINT(("add_tag_left: tag %d\n", tag_id)); 279 280 c = tre_mem_alloc(mem, sizeof(*c)); 281 if (c == NULL) 282 return REG_ESPACE; 283 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1); 284 if (c->left == NULL) 285 return REG_ESPACE; 286 c->right = tre_mem_calloc(mem, sizeof(tre_ast_node_t)); 287 if (c->right == NULL) 288 return REG_ESPACE; 289 290 c->right->obj = node->obj; 291 c->right->type = node->type; 292 c->right->last_matched_branch = node->last_matched_branch; 293 c->right->nullable = -1; 294 c->right->submatch_id = -1; 295 node->obj = c; 296 node->type = CATENATION; 297 node->original = c->right; 298 return REG_OK; 299 } 300 301 /* Inserts a catenation node to the root of the tree given in `node'. 302 As the right child a new tag with number `tag_id' to `node' is added, 303 and the left child is the old root. */ 304 static reg_errcode_t 305 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) 306 { 307 tre_catenation_t *c; 308 309 DPRINT(("tre_add_tag_right: tag %d\n", tag_id)); 310 311 c = tre_mem_alloc(mem, sizeof(*c)); 312 if (c == NULL) 313 return REG_ESPACE; 314 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1); 315 if (c->right == NULL) 316 return REG_ESPACE; 317 c->left = tre_mem_calloc(mem, sizeof(tre_ast_node_t)); 318 if (c->left == NULL) 319 return REG_ESPACE; 320 321 c->left->obj = node->obj; 322 c->left->type = node->type; 323 c->left->last_matched_branch = node->last_matched_branch; 324 c->left->nullable = -1; 325 c->left->submatch_id = -1; 326 node->obj = c; 327 node->type = CATENATION; 328 node->original = c->left; 329 return REG_OK; 330 } 331 332 typedef enum { 333 ADDTAGS_RECURSE, 334 ADDTAGS_RECURSE_NOT_TOP_UNION, 335 ADDTAGS_AFTER_ITERATION, 336 ADDTAGS_AFTER_UNION_LEFT, 337 ADDTAGS_AFTER_UNION_RIGHT, 338 ADDTAGS_AFTER_CAT_LEFT, 339 ADDTAGS_AFTER_CAT_RIGHT, 340 ADDTAGS_SET_SUBMATCH_END, 341 ADDTAGS_UNION_RECURSE, 342 ADDTAGS_UNION_RIGHT_RECURSE, 343 ADDTAGS_AFTER_UNION_TOP, 344 } tre_addtags_symbol_t; 345 346 enum { 347 COPY_LAST_MATCHED_BRANCH, 348 COPY_LAST_MATCHED_BRANCH_NEXT, 349 COPY_LAST_MATCHED, 350 COPY_LAST_MATCHED_NEXT, 351 }; 352 353 #define REGSET_UNSET ((unsigned)-1) 354 355 /* Go through `regset' and set submatch data for submatches that are 356 using this tag. */ 357 static void 358 tre_purge_regset(unsigned *regset, tre_tnfa_t *tnfa, int tag) 359 { 360 int i; 361 362 for (i = 0; regset[i] != REGSET_UNSET; i++) 363 { 364 int id = regset[i] / 2; 365 int start = !(regset[i] % 2); 366 if (id >= SUBMATCH_ID_INVISIBLE_START) 367 continue; 368 DPRINT((" Using tag %d for %s offset of " 369 "submatch %d\n", tag, 370 start ? "start" : "end", id)); 371 if (start) 372 tnfa->submatch_data[id].so_tag = tag; 373 else 374 tnfa->submatch_data[id].eo_tag = tag; 375 } 376 regset[0] = -1; 377 } 378 379 #define REGSET_HAS_STARTS 0x1 380 #define REGSET_HAS_ENDS 0x2 381 382 /* Adds tags to appropriate locations in the parse tree in `tree', so that 383 subexpressions marked for submatch addressing can be traced. */ 384 static reg_errcode_t 385 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, 386 tre_tnfa_t *tnfa) 387 { 388 reg_errcode_t status = REG_OK; 389 tre_addtags_symbol_t symbol; 390 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */ 391 int bottom = tre_stack_num_objects(stack); 392 /* True for first pass (counting number of needed tags) */ 393 int first_pass = (mem == NULL || tnfa == NULL); 394 unsigned *regset, *orig_regset; 395 int regset_contains = 0; 396 int num_tags = 0; /* Total number of tags. */ 397 int num_minimals = 0; /* Number of special minimal tags. */ 398 int tag = 0; /* The tag that is to be added next. */ 399 int next_tag = 1; /* Next tag to use after this one. */ 400 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */ 401 int *reorder_tags = NULL; /* Tag reorder array: a pair for each reorder, 402 * the first is the tag to reorder, the second 403 * is the tag after which the first is reordered */ 404 int *rtp; /* Pointer used to fill in reorder_tags and 405 * tag_order */ 406 int *to_reorder; /* Transform array converting sequential order to 407 * that specified by reorder_tags */ 408 int id; 409 410 tre_tag_direction_t direction = TRE_TAG_LEFT_MAXIMIZE; 411 if (!first_pass) 412 { 413 DPRINT(("Initializing direction to %s\n", tag_dir_str[direction])); 414 tnfa->end_tag = 0; 415 tnfa->minimal_tags[0] = -1; 416 } 417 418 regset = malloc(sizeof(*regset) * ((tnfa->num_submatches 419 + tnfa->num_submatches_invisible + 1) * 2)); 420 if (regset == NULL) 421 { 422 status = REG_ESPACE; 423 goto error_regset; 424 } 425 regset[0] = REGSET_UNSET; 426 orig_regset = regset; 427 428 if (!first_pass) 429 { 430 /* Allocate all memory for reorder_tags, tag_order, to_seq_order and 431 * to_reorder in one batch (assuming all are the same type) */ 432 rtp = reorder_tags = malloc(sizeof(*reorder_tags) * 433 ((2 * tnfa->num_reorder_tags + 1) + 434 tnfa->num_tags)); 435 if (reorder_tags == NULL) 436 { 437 status = REG_ESPACE; 438 goto error_reorder_tags; 439 } 440 to_reorder = reorder_tags + (2 * tnfa->num_reorder_tags + 1); 441 } 442 443 STACK_PUSH(stack, voidptr, node); 444 STACK_PUSH(stack, int, ADDTAGS_RECURSE); 445 446 while (tre_stack_num_objects(stack) > bottom) 447 { 448 if (status != REG_OK) 449 break; 450 451 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); 452 switch (symbol) 453 { 454 int top_union; 455 456 case ADDTAGS_SET_SUBMATCH_END: 457 { 458 int i; 459 460 id = tre_stack_pop_int(stack); 461 node = tre_stack_pop_voidptr(stack); 462 /* Add end of this submatch to regset. */ 463 for (i = 0; regset[i] != REGSET_UNSET; i++); 464 regset[i] = id * 2 + 1; 465 regset[i + 1] = -1; 466 regset_contains |= REGSET_HAS_ENDS; 467 468 /* Always put a tag after a minimal iterator. */ 469 if (minimal_tag >= 0) 470 { 471 if (first_pass) 472 { 473 node->num_tags++; 474 DPRINT((" ADDTAGS_SET_SUBMATCH_END: node->num_tags = %d\n", 475 node->num_tags)); 476 } 477 else 478 { 479 int i; 480 status = tre_merge_branches(mem, node, NULL, tag, 481 tnfa->num_tags); 482 if (status != REG_OK) 483 break; 484 status = tre_add_tag_right(mem, node, tag); 485 if (status != REG_OK) 486 break; 487 tnfa->tag_directions[tag] = TRE_TAG_MINIMIZE; 488 DPRINT(("Setting t%d direction to %s\n", tag, 489 tag_dir_str[tnfa->tag_directions[tag]])); 490 DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); 491 for (i = 0; tnfa->minimal_tags[i] >= 0; i++); 492 tnfa->minimal_tags[i] = tag; 493 tnfa->minimal_tags[i + 1] = minimal_tag; 494 tnfa->minimal_tags[i + 2] = -1; 495 496 DPRINT((" Minimal end: t%d reordered to " 497 "after t%d\n", tag, minimal_tag)); 498 /* Append to tag_order, move "tag" after 499 * "minimal_tag" */ 500 *rtp++ = tag; 501 *rtp++ = minimal_tag; 502 503 num_minimals++; 504 tre_purge_regset(regset, tnfa, tag); 505 } 506 507 minimal_tag = -1; 508 DPRINT((" ADDTAGS_SET_SUBMATCH_END num_tags++ tag=%d\n", tag)); 509 regset[0] = REGSET_UNSET; 510 regset_contains = 0; 511 tag = next_tag; 512 num_tags++; 513 next_tag++; 514 } 515 break; 516 } 517 518 case ADDTAGS_RECURSE_NOT_TOP_UNION: 519 /* Like ADDTAGS_RECURSE, except that top_union is set to zero, 520 * indicating that if a union is being processed, it is not the 521 * top-most of a series */ 522 top_union = 0; 523 goto do_addtags_recurse; 524 525 case ADDTAGS_RECURSE: 526 /* Setting top_union to 1 means that if a union is begin processed, 527 * it is the top-most of a series, and should recurse through the 528 * series to set the left_tag and right_tag values */ 529 top_union = 1; 530 531 do_addtags_recurse: 532 node = tre_stack_pop_voidptr(stack); 533 534 id = node->submatch_id; 535 if (id >= 0) 536 { 537 int i; 538 539 /* Add start of this submatch to regset. */ 540 for (i = 0; regset[i] != REGSET_UNSET; i++); 541 regset[i] = id * 2; 542 regset[i + 1] = -1; 543 regset_contains |= REGSET_HAS_STARTS; 544 545 /* Add end of this submatch to regset after processing this 546 node. */ 547 STACK_PUSH(stack, voidptr, node); 548 STACK_PUSHX(stack, int, id); 549 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END); 550 } 551 552 switch (node->type) 553 { 554 case LITERAL: 555 { 556 tre_literal_t *lit = node->obj; 557 558 if (!IS_SPECIAL(lit) || IS_BACKREF(lit) || IS_EMPTY(lit) || IS_ASSERTION(lit)) 559 { 560 DPRINT(("Literal %d-%d\n", 561 (int)lit->code_min, (int)lit->code_max)); 562 if (regset_contains) 563 { 564 /* Regset is not empty, so add a tag before the 565 literal or backref. */ 566 if (first_pass) 567 { 568 DPRINT((" ADDTAGS_RECURSE:LITERAL node->num_tags = 1\n")); 569 node->num_tags = 1; 570 } 571 else 572 { 573 status = tre_merge_branches(mem, node, NULL, tag, 574 tnfa->num_tags); 575 if (status != REG_OK) 576 break; 577 status = tre_add_tag_left(mem, node, tag); 578 if (status != REG_OK) 579 break; 580 if (regset_contains == REGSET_HAS_STARTS) 581 tnfa->tag_directions[tag] = TRE_TAG_LEFT_MAXIMIZE; 582 else 583 tnfa->tag_directions[tag] = direction; 584 DPRINT(("Setting t%d direction to %s\n", tag, 585 tag_dir_str[tnfa->tag_directions[tag]])); 586 tre_purge_regset(regset, tnfa, tag); 587 588 if (IS_BACKREF(lit)) 589 { 590 int b = lit->code_max; 591 int t = tnfa->submatch_data[b].so_tag; 592 /* Fail if the referenced submatch hasn't been 593 * completed yet */ 594 if (tnfa->submatch_data[b].eo_tag < 0) 595 { 596 status = REG_ESUBREG; 597 break; 598 } 599 if (t < tag) 600 { 601 DPRINT((" Backref %d start: " 602 "t%d reordered to before t%d\n", 603 b, tag, t)); 604 if(t > 0) 605 t--; 606 /* Append to tag_order, move "tag" after 607 * "t" */ 608 *rtp++ = tag; 609 *rtp++ = t; 610 } 611 #if TRE_DEBUG 612 else 613 DPRINT((" Backref %d start: " 614 "(t%d already before t%d)\n", 615 b, tag, t)); 616 #endif /* TRE_DEBUG */ 617 } 618 } 619 620 DPRINT((" ADDTAGS_RECURSE:LITERAL num_tags++ tag=%d\n", 621 tag)); 622 regset[0] = REGSET_UNSET; 623 regset_contains = 0; 624 tag = next_tag; 625 num_tags++; 626 next_tag++; 627 } 628 } 629 else 630 { 631 assert(!IS_TAG(lit)); 632 } 633 break; 634 } 635 case CATENATION: 636 { 637 tre_catenation_t *cat = node->obj; 638 tre_ast_node_t *left = cat->left; 639 tre_ast_node_t *right = cat->right; 640 int reserved_tag = -1; 641 DPRINT(("Catenation, next_tag = %d\n", next_tag)); 642 643 /* After processing right child. */ 644 STACK_PUSHX(stack, voidptr, node); 645 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT); 646 647 /* Process right child. */ 648 STACK_PUSHX(stack, voidptr, right); 649 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 650 651 /* After processing left child. */ 652 STACK_PUSHX(stack, int, next_tag + left->num_tags); 653 DPRINT((" Pushing %d for after left\n", 654 next_tag + left->num_tags)); 655 if (left->num_tags > 0 && right->num_tags > 0) 656 { 657 /* Reserve the next tag to the right child. */ 658 DPRINT((" ADDTAGS_RECURSE:CATENATION num_tags++ " 659 "Reserving next_tag %d to right child\n", 660 next_tag)); 661 reserved_tag = next_tag; 662 next_tag++; 663 } 664 STACK_PUSHX(stack, int, reserved_tag); 665 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT); 666 667 /* Process left child. */ 668 STACK_PUSHX(stack, voidptr, left); 669 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 670 671 } 672 break; 673 case ITERATION: 674 { 675 tre_iteration_t *iter = node->obj; 676 DPRINT(("Iteration\n")); 677 678 if (first_pass) 679 STACK_PUSHX(stack, int, regset_contains != 0); 680 STACK_PUSHX(stack, int, tag); 681 STACK_PUSHX(stack, voidptr, node); 682 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION); 683 684 STACK_PUSHX(stack, voidptr, iter->arg); 685 STACK_PUSHX(stack, int, ADDTAGS_RECURSE); 686 687 /* Regset is not empty, so add a tag here (this always happens 688 because iterators always get submatch id, even if in the 689 invisible range) */ 690 if (regset_contains) 691 { 692 if (!first_pass) 693 { 694 status = tre_merge_branches(mem, node, NULL, tag, 695 tnfa->num_tags); 696 if (status != REG_OK) 697 break; 698 status = tre_add_tag_left(mem, node, tag); 699 if (status != REG_OK) 700 break; 701 if (regset_contains == REGSET_HAS_STARTS && tag != 0) 702 tnfa->tag_directions[tag] = iter->minimal ? 703 TRE_TAG_MINIMIZE : 704 TRE_TAG_LEFT_MAXIMIZE; 705 else 706 tnfa->tag_directions[tag] = direction; 707 DPRINT(("Setting t%d direction to %s\n", tag, 708 tag_dir_str[tnfa->tag_directions[tag]])); 709 tre_purge_regset(regset, tnfa, tag); 710 } 711 712 DPRINT((" ADDTAGS_RECURSE:ITERATION num_tags++ tag=%d\n", 713 tag)); 714 regset[0] = REGSET_UNSET; 715 regset_contains = 0; 716 tag = next_tag; 717 num_tags++; 718 next_tag++; 719 } 720 direction = TRE_TAG_LEFT_MAXIMIZE; 721 DPRINT((" Setting direction to %s\n", tag_dir_str[direction])); 722 } 723 break; 724 case UNION: 725 { 726 tre_union_t *uni; 727 tre_ast_node_t *left; 728 tre_ast_node_t *right; 729 int front_tag = -1; 730 731 DPRINT(("Union\n")); 732 733 if (regset_contains) 734 { 735 DPRINT((" UNION num_tags++ tag=%d\n", tag)); 736 front_tag = tag; 737 tag = next_tag; 738 num_tags++; 739 next_tag++; 740 } 741 742 /* For the top union, walk the tree of consecutive unions, 743 * setting the left_tag and right_tag values in increasing 744 * order (left to right priority) */ 745 if (top_union && 746 (node->num_submatches - 747 (node->submatch_id >= 0 && 748 node->submatch_id < SUBMATCH_ID_INVISIBLE_START)) > 0) 749 { 750 tre_ast_node_t *n; 751 int last = tre_stack_num_objects(stack); 752 753 STACK_PUSH(stack, voidptr, node); 754 STACK_PUSH(stack, int, ADDTAGS_UNION_RECURSE); 755 756 while (tre_stack_num_objects(stack) > last) 757 { 758 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); 759 switch (symbol) 760 { 761 case ADDTAGS_UNION_RECURSE: 762 n = tre_stack_pop_voidptr(stack); 763 uni = n->obj; 764 left = uni->left; 765 766 /* Since the top union has num_submatches > 0, 767 * we set all the consecutive union's 768 * make_branches to 1 to force the generation 769 * of end tags for each union branch. */ 770 n->make_branches = 1; 771 772 STACK_PUSH(stack, voidptr, n); 773 STACK_PUSH(stack, int, 774 ADDTAGS_UNION_RIGHT_RECURSE); 775 776 if (left->type == UNION) 777 { 778 STACK_PUSH(stack, voidptr, left); 779 STACK_PUSH(stack, int, 780 ADDTAGS_UNION_RECURSE); 781 } 782 else 783 { 784 DPRINT((" ADDTAGS_UNION_RECURSE " 785 "num_tags++ tag=%d\n", tag)); 786 uni->left_tag = tag; 787 tag = next_tag; 788 num_tags++; 789 next_tag++; 790 } 791 break; 792 793 case ADDTAGS_UNION_RIGHT_RECURSE: 794 n = tre_stack_pop_voidptr(stack); 795 uni = n->obj; 796 right = uni->right; 797 798 if (right->type == UNION) 799 { 800 STACK_PUSH(stack, voidptr, right); 801 STACK_PUSH(stack, int, 802 ADDTAGS_UNION_RECURSE); 803 } 804 else 805 { 806 DPRINT((" ADDTAGS_UNION_RIGHT_RECURSE " 807 "num_tags++ tag=%d\n", tag)); 808 uni->right_tag = tag; 809 tag = next_tag; 810 num_tags++; 811 next_tag++; 812 } 813 814 break; 815 816 default: 817 assert(0); 818 break; 819 820 } /* end switch(symbol) */ 821 } /* end while(tre_stack_num_objects(stack) > last */ 822 if (!first_pass) 823 { 824 STACK_PUSHX(stack, int, front_tag); 825 STACK_PUSHX(stack, voidptr, node); 826 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_TOP); 827 } 828 } /* end if (top_union && ...) */ 829 830 uni = node->obj; 831 left = uni->left; 832 right = uni->right; 833 834 /* After processing right child. */ 835 STACK_PUSHX(stack, voidptr, regset); 836 STACK_PUSHX(stack, int, regset_contains != 0); 837 STACK_PUSHX(stack, voidptr, node); 838 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT); 839 840 /* Process right child. */ 841 STACK_PUSHX(stack, voidptr, right); 842 STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION); 843 844 /* After processing left child. */ 845 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT); 846 847 /* Process left child. */ 848 STACK_PUSHX(stack, voidptr, left); 849 STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION); 850 851 /* Regset is not empty, so add a tag here. */ 852 if (regset_contains) 853 { 854 if (!first_pass) 855 { 856 status = tre_merge_branches(mem, node, NULL, front_tag, 857 tnfa->num_tags); 858 if (status != REG_OK) 859 break; 860 status = tre_add_tag_left(mem, node, front_tag); 861 if (status != REG_OK) 862 break; 863 if (regset_contains == REGSET_HAS_STARTS) 864 tnfa->tag_directions[front_tag] = TRE_TAG_LEFT_MAXIMIZE; 865 else 866 tnfa->tag_directions[front_tag] = direction; 867 DPRINT(("Setting t%d direction to %s\n", front_tag, 868 tag_dir_str[tnfa->tag_directions[front_tag]])); 869 tre_purge_regset(regset, tnfa, front_tag); 870 } 871 872 regset[0] = REGSET_UNSET; 873 regset_contains = 0; 874 } 875 876 break; 877 } 878 } /* end switch (node->type) */ 879 880 break; /* end case: ADDTAGS_RECURSE */ 881 882 case ADDTAGS_AFTER_ITERATION: 883 { 884 tre_iteration_t *iter; 885 tre_ast_node_t *orig; 886 int enter_tag; 887 888 node = tre_stack_pop_voidptr(stack); 889 orig = node->original ? node->original : node; 890 iter = (tre_iteration_t *)orig->obj; 891 enter_tag = tre_stack_pop_int(stack); 892 if (iter->minimal) 893 minimal_tag = enter_tag; 894 895 DPRINT(("After iteration\n")); 896 if (first_pass) 897 { 898 node->num_tags = iter->arg->num_tags + tre_stack_pop_int(stack); 899 DPRINT((" ADDTAGS_AFTER_ITERATION: node->num_tags = %d\n", 900 node->num_tags)); 901 } 902 else 903 { 904 /* node->last_matched_branch will have the start tag (the tag 905 just *before* the iteration). iter->arg->last_matched_branch 906 will have the tag(s) inside the iteration, the ones that 907 may need to be reset if the iteration doesn't match. So 908 before we merge iter->arg into node, we need to set up 909 a new tre_last_matched_t and tre_last_matched_branch_t, 910 using any of the inside tags as cmp_tag (we choose the first 911 tag found by bit_ffs). If there are no inside tags, we 912 don't bother creating the extra structures. */ 913 tre_last_matched_branch_pre_t *b = 914 iter->arg->last_matched_branch; 915 916 if (b && b->n_tags > 0) 917 { 918 tre_last_matched_pre_t *u; 919 920 bit_ffs(b->tags, num_tags, &b->cmp_tag); 921 DPRINT((" ADDTAGS_AFTER_ITERATION: n_tags=%d " 922 "cmp_tag = %d\n", b->n_tags, b->cmp_tag)); 923 924 u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t) + 925 sizeof(tre_last_matched_branch_pre_t) 926 + bitstr_size(tnfa->num_tags)); 927 if (!u) 928 { 929 status = REG_ESPACE; 930 break; 931 } 932 u->branches = b; 933 u->n_branches = 1; 934 u->start_tag = b->cmp_tag; 935 u->tot_branches = b->tot_branches; 936 u->tot_last_matched = 1 + b->tot_last_matched; 937 u->tot_tags = b->tot_tags; 938 939 b = (tre_last_matched_branch_pre_t *)(u + 1); 940 b->last_matched = u; 941 b->n_last_matched = 1; 942 b->tot_branches = 1 + u->tot_branches; 943 b->tot_last_matched = u->tot_last_matched; 944 b->tot_tags = u->tot_tags; 945 946 iter->arg->last_matched_branch = b; 947 } 948 status = tre_merge_branches(mem, node, iter->arg, 0, 949 tnfa->num_tags); 950 if (status != REG_OK) 951 break; 952 953 if (iter->minimal) 954 { 955 /* Add a union with a left EMPTY literal and the right 956 being iter->arg. This should force the tags inside 957 the minimal iteration to prefer being unset */ 958 if (iter->min == 0 && iter->max <= 1) 959 { 960 tre_ast_node_t *u, *e; 961 962 e = tre_ast_new_literal(mem, EMPTY, -1, -1); 963 if (e == NULL) 964 { 965 status = REG_ESPACE; 966 break; 967 } 968 u = tre_ast_new_union(mem, e, iter->arg); 969 if (u == NULL) 970 { 971 status = REG_ESPACE; 972 break; 973 } 974 iter->arg = u; 975 } 976 977 direction = TRE_TAG_MINIMIZE; 978 } 979 else 980 direction = TRE_TAG_MAXIMIZE; 981 DPRINT((" Setting direction to %s\n", tag_dir_str[direction])); 982 } 983 break; 984 } 985 986 case ADDTAGS_AFTER_CAT_LEFT: 987 { 988 int new_tag = tre_stack_pop_int(stack); 989 next_tag = tre_stack_pop_int(stack); 990 DPRINT(("After cat left, tag = %d, next_tag = %d\n", 991 tag, next_tag)); 992 if (new_tag >= 0) 993 { 994 DPRINT((" Setting tag to %d\n", new_tag)); 995 tag = new_tag; 996 } 997 break; 998 } 999 1000 case ADDTAGS_AFTER_CAT_RIGHT: 1001 { 1002 tre_catenation_t *cat; 1003 1004 DPRINT(("After cat right\n")); 1005 node = tre_stack_pop_voidptr(stack); 1006 cat = node->obj; 1007 if (first_pass) 1008 { 1009 node->num_tags = cat->left->num_tags + cat->right->num_tags; 1010 DPRINT((" ADDTAGS_AFTER_CAT_RIGHT: node->num_tags = %d\n", 1011 node->num_tags)); 1012 } 1013 else 1014 { 1015 status = tre_merge_branches(mem, cat->left, cat->right, 0, 1016 tnfa->num_tags); 1017 if (status != REG_OK) 1018 break; 1019 status = tre_merge_branches(mem, node, cat->left, 0, 1020 tnfa->num_tags); 1021 } 1022 break; 1023 } 1024 1025 case ADDTAGS_AFTER_UNION_LEFT: 1026 DPRINT(("After union left\n")); 1027 /* Lift the bottom of the `regset' array so that when processing 1028 the right operand the items currently in the array are 1029 invisible. The original bottom was saved at ADDTAGS_UNION and 1030 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */ 1031 while (*regset != REGSET_UNSET) 1032 regset++; 1033 regset_contains = 0; 1034 break; 1035 1036 case ADDTAGS_AFTER_UNION_RIGHT: 1037 { 1038 int added_tags; 1039 tre_ast_node_t *orig; 1040 tre_union_t *uni; 1041 /* Note: node may not be a UNION, but a CATENATION with a left 1042 * tag. So that is why we pass the original node->obj on the 1043 * stack, to get the union's true values. */ 1044 1045 DPRINT(("After union right\n")); 1046 node = tre_stack_pop_voidptr(stack); 1047 orig = node->original ? node->original : node; 1048 uni = (tre_union_t *)orig->obj; 1049 added_tags = tre_stack_pop_int(stack); 1050 if (first_pass) 1051 { 1052 node->num_tags = uni->left->num_tags + uni->right->num_tags 1053 + added_tags; 1054 if (uni->left_tag > 0) 1055 node->num_tags++; 1056 if (uni->right_tag > 0) 1057 node->num_tags++; 1058 DPRINT((" ADDTAGS_AFTER_UNION_RIGHT: node->num_tags = %d\n", 1059 node->num_tags)); 1060 } 1061 regset = tre_stack_pop_voidptr(stack); 1062 1063 /* Add tags after both children, the left child gets a smaller 1064 tag than the right child. This guarantees that we prefer 1065 the left child over the right child. */ 1066 /* XXX - This is not always necessary (if the children have 1067 tags which must be seen for every match of that child). */ 1068 if (!first_pass && node->make_branches) 1069 { 1070 tre_last_matched_branch_pre_t *lb = 1071 uni->left->last_matched_branch; 1072 tre_last_matched_branch_pre_t *rb = 1073 uni->right->last_matched_branch; 1074 tre_last_matched_pre_t *lu = 1075 uni->left->last_matched_in_progress; 1076 tre_last_matched_pre_t *ru = 1077 uni->right->last_matched_in_progress; 1078 tre_last_matched_pre_t *u; 1079 /* We don't need to call tre_merge_branches because these 1080 * tags don't participate in submatch ranges, so don't need 1081 * to be recorded. But we do set the cmp_tag entry of the 1082 * tre_last_matched_branch_pre_t, so we might call 1083 * tre_merge_branches if we need to create an empty 1084 * tre_last_matched_branch_pre_t. */ 1085 if (uni->left_tag > 0) 1086 { 1087 DPRINT(("Setting t%d direction to maximize\n", 1088 uni->left_tag)); 1089 status = tre_add_tag_right(mem, uni->left, uni->left_tag); 1090 if (status != REG_OK) 1091 break; 1092 tnfa->tag_directions[uni->left_tag] = TRE_TAG_MAXIMIZE; 1093 if (!lb) 1094 { 1095 status = tre_merge_branches(mem, uni->left, NULL, -1, 1096 tnfa->num_tags); 1097 if (status != REG_OK) 1098 break; 1099 lb = uni->left->last_matched_branch; 1100 } 1101 lb->cmp_tag = uni->left_tag; 1102 } 1103 if (uni->right_tag > 0) 1104 { 1105 DPRINT(("Setting t%d direction to maximize\n", 1106 uni->right_tag)); 1107 status = tre_add_tag_right(mem, uni->right, uni->right_tag); 1108 if (status != REG_OK) 1109 break; 1110 tnfa->tag_directions[uni->right_tag] = TRE_TAG_MAXIMIZE; 1111 if (!rb) 1112 { 1113 status = tre_merge_branches(mem, uni->right, NULL, -1, 1114 tnfa->num_tags); 1115 if (status != REG_OK) 1116 break; 1117 rb = uni->right->last_matched_branch; 1118 } 1119 rb->cmp_tag = uni->right_tag; 1120 } 1121 /* Now merge the tre_last_matched_branch_pre_t into a 1122 tre_last_matched_pre_t */ 1123 if (lu == NULL) 1124 { 1125 if (ru == NULL) 1126 { 1127 /* Create a new tre_last_matched_pre_t */ 1128 u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t)); 1129 if (!u) 1130 { 1131 status = REG_ESPACE; 1132 break; 1133 } 1134 u->tot_last_matched = 1; 1135 1136 if (lb) 1137 { 1138 u->branches = lb; 1139 u->n_branches = 1; 1140 u->tot_branches += lb->tot_branches; 1141 u->tot_last_matched += lb->tot_last_matched; 1142 u->tot_tags += lb->tot_tags; 1143 if (rb) 1144 { 1145 lb->next = rb; 1146 u->n_branches++; 1147 u->tot_branches += rb->tot_branches; 1148 u->tot_last_matched += rb->tot_last_matched; 1149 u->tot_tags += rb->tot_tags; 1150 } 1151 } 1152 else if (rb) 1153 { 1154 u->branches = rb; 1155 u->n_branches = 1; 1156 u->tot_branches += rb->tot_branches; 1157 u->tot_last_matched += rb->tot_last_matched; 1158 u->tot_tags += rb->tot_tags; 1159 } 1160 } 1161 else 1162 { 1163 /* Use ru, and add lb */ 1164 u = ru; 1165 if (lb) 1166 { 1167 lb->next = u->branches; 1168 u->branches = lb; 1169 u->n_branches++; 1170 u->tot_branches += lb->tot_branches; 1171 u->tot_last_matched += lb->tot_last_matched; 1172 u->tot_tags += lb->tot_tags; 1173 } 1174 } 1175 } 1176 else if (ru == NULL) 1177 { 1178 /* Use lu, and add rb */ 1179 u = lu; 1180 if (rb) 1181 { 1182 rb->next = u->branches; 1183 u->branches = rb; 1184 u->n_branches++; 1185 u->tot_branches += rb->tot_branches; 1186 u->tot_last_matched += rb->tot_last_matched; 1187 u->tot_tags += rb->tot_tags; 1188 } 1189 } 1190 else 1191 { 1192 /* Merge lu and ru into lu */ 1193 if (lu->branches) 1194 { 1195 if (ru->branches) 1196 { 1197 tre_last_matched_branch_pre_t *b = lu->branches; 1198 while (b->next) b = b->next; 1199 b->next = ru->branches; 1200 lu->n_branches += ru->n_branches; 1201 } 1202 } 1203 else if (ru->branches) 1204 { 1205 lu->branches = ru->branches; 1206 lu->n_branches = ru->n_branches; 1207 } 1208 lu->tot_branches += ru->tot_branches; 1209 lu->tot_last_matched += ru->tot_last_matched - 1; 1210 lu->tot_tags += ru->tot_tags; 1211 u = lu; 1212 } 1213 node->last_matched_in_progress = u; 1214 } 1215 direction = TRE_TAG_MAXIMIZE; 1216 break; 1217 } 1218 1219 case ADDTAGS_AFTER_UNION_TOP: /* only called when not first_pass */ 1220 { 1221 tre_last_matched_branch_pre_t *b; 1222 tre_last_matched_pre_t *u; 1223 int start_tag; 1224 1225 DPRINT(("After union top\n")); 1226 node = tre_stack_pop_voidptr(stack); 1227 start_tag = tre_stack_pop_int(stack); 1228 b = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t) 1229 + bitstr_size(tnfa->num_tags)); 1230 if (!b) 1231 { 1232 status = REG_ESPACE; 1233 break; 1234 } 1235 1236 u = node->last_matched_in_progress; 1237 u->start_tag = start_tag; 1238 b->tot_branches = 1 + u->tot_branches; 1239 b->tot_last_matched = u->tot_last_matched; 1240 b->tot_tags = u->tot_tags; 1241 b->last_matched = u; 1242 b->n_last_matched = 1; 1243 node->last_matched_branch = b; 1244 node->last_matched_in_progress = NULL; 1245 break; 1246 } 1247 1248 default: 1249 assert(0); 1250 break; 1251 1252 } /* end switch(symbol) */ 1253 } /* end while(tre_stack_num_objects(stack) > bottom) */ 1254 1255 if (status != REG_OK) 1256 { 1257 DPRINT(("Error during %s pass\n", first_pass ? "first" : "second")); 1258 goto error_post_compile; 1259 } 1260 1261 if (!first_pass) 1262 { 1263 int i; 1264 if (num_tags != tnfa->num_tags) 1265 { 1266 DPRINT(("num_tags(%d) != tnfa->num_tags(%d)\n", num_tags, 1267 tnfa->num_tags)); 1268 status = REG_BADPAT; 1269 goto error_post_compile; 1270 } 1271 1272 tre_purge_regset(regset, tnfa, tag); 1273 DPRINT(("Setting t%d to %s\n", num_tags, 1274 tag_dir_str[direction])); 1275 tnfa->tag_directions[num_tags] = direction; 1276 1277 if (rtp > reorder_tags + 2 * tnfa->num_reorder_tags) 1278 { 1279 DPRINT(("Processed %d reorder tags instead of %d\n", 1280 (int)(rtp - reorder_tags) / 2, tnfa->num_reorder_tags)); 1281 status = REG_BADPAT; 1282 goto error_post_compile; 1283 } 1284 *rtp = -1; 1285 #if TRE_DEBUG 1286 if (reorder_tags[0] >= 0) 1287 { 1288 DPRINT(("reorder_tags:\n")); 1289 for (rtp = reorder_tags; *rtp >= 0;) 1290 { 1291 DPRINT(("%d after ", *rtp++)); 1292 DPRINT(("%d\n", *rtp++)); 1293 } 1294 } 1295 else 1296 DPRINT(("No reorder_tags\n")); 1297 #endif /* TRE_DEBUG */ 1298 1299 /* Initialize to_reorder */ 1300 for (i = 0; i < num_tags; i++) 1301 to_reorder[i] = i; 1302 /* Use to_seq_order to convert reorder_tags values, and use those to 1303 * reorder to_reorder */ 1304 for (rtp = reorder_tags; *rtp >= 0;) 1305 { 1306 int j, high, low; 1307 int ti = *rtp++; 1308 1309 /* Skip reordering the final tag */ 1310 if (ti >= num_tags) 1311 { 1312 DPRINT(("Skipping reorder of %d\n", ti)); 1313 rtp++; 1314 continue; 1315 } 1316 /* The number of the tag to reorder */ 1317 high = to_reorder[ti]; 1318 /* Reorder after this tag */ 1319 low = to_reorder[*rtp++]; 1320 1321 DPRINT(("ti=%d high=%d low=%d\n", ti, high, low)); 1322 if (low > high) 1323 { 1324 DPRINT(("Tag %d already before %d\n", high, low)); 1325 continue; 1326 } 1327 for (j = 0; j < num_tags; j++) 1328 if (to_reorder[j] > low && to_reorder[j] < high) 1329 to_reorder[j]++; 1330 to_reorder[ti] = low + 1; 1331 #ifdef TRE_DEBUG 1332 DPRINT(("to_reorder=(")); 1333 for (j = 0; j < num_tags; j++) 1334 { 1335 DPRINT(("%d", to_reorder[j])); 1336 if (j < num_tags - 1) 1337 DPRINT((",")); 1338 } 1339 DPRINT((")\n")); 1340 #endif /* TRE_DEBUG */ 1341 } 1342 /* Determine if reordering in really necessary */ 1343 { 1344 int need_reorder = 0; 1345 for (i = 0; i < num_tags; i++) 1346 if(to_reorder[i] != i) 1347 { 1348 need_reorder = 1; 1349 break; 1350 } 1351 /* If need_reorder is not set, free reorder_tags, and set to NULL, 1352 * indicating no reordering is needed */ 1353 if (!need_reorder) 1354 { 1355 DPRINT(("Don't need to reorder\n")); 1356 free(reorder_tags); 1357 reorder_tags = NULL; 1358 } 1359 } 1360 } 1361 1362 if (reorder_tags) 1363 { 1364 int i; 1365 tre_tag_direction_t *new_tag_directions; 1366 #if TRE_DEBUG 1367 DPRINT(("to_reorder:")); 1368 for (i = 0; i < num_tags; i++) 1369 DPRINT((" %d->%d", i, to_reorder[i])); 1370 DPRINT(("\n")); 1371 #endif /* TRE_DEBUG */ 1372 1373 DPRINT(("Reordering submatch_data\n")); 1374 for (i = 0; i < tnfa->num_submatches; i++) 1375 { 1376 #if TRE_DEBUG 1377 int so = tnfa->submatch_data[i].so_tag; 1378 int eo = tnfa->submatch_data[i].eo_tag; 1379 #endif /* TRE_DEBUG */ 1380 tnfa->submatch_data[i].so_tag = 1381 to_reorder[tnfa->submatch_data[i].so_tag]; 1382 tnfa->submatch_data[i].eo_tag = 1383 tnfa->submatch_data[i].eo_tag < num_tags ? 1384 to_reorder[tnfa->submatch_data[i].eo_tag] : 1385 tnfa->submatch_data[i].eo_tag; 1386 DPRINT(("pmatch[%d]: {%d, %d}->{%d, %d}\n", i, so, eo, 1387 tnfa->submatch_data[i].so_tag, 1388 tnfa->submatch_data[i].eo_tag)); 1389 } 1390 1391 DPRINT(("Reordering tag_directions\n")); 1392 /* We only allocate num_tags directions and reorder them. The 1393 * num_tags-th direction (end tag) is left unchanged. */ 1394 new_tag_directions = malloc(sizeof(*new_tag_directions) * num_tags); 1395 if (new_tag_directions == NULL) 1396 { 1397 status = REG_ESPACE; 1398 goto error_post_compile; 1399 } 1400 for (i = 0; i < num_tags; i++) 1401 { 1402 new_tag_directions[to_reorder[i]] = tnfa->tag_directions[i]; 1403 } 1404 #if TRE_DEBUG 1405 for (i = 0; i < num_tags; i++) 1406 { 1407 DPRINT(("t%d %s->%s\n", i, 1408 tag_dir_str[tnfa->tag_directions[i]], 1409 tag_dir_str[new_tag_directions[i]])); 1410 } 1411 DPRINT(("t%d %s->%s\n", num_tags, 1412 tag_dir_str[tnfa->tag_directions[num_tags]], 1413 tag_dir_str[tnfa->tag_directions[num_tags]])); 1414 #endif /* TRE_DEBUG */ 1415 memcpy(tnfa->tag_directions, new_tag_directions, sizeof(*new_tag_directions) * num_tags); 1416 free(new_tag_directions); 1417 1418 DPRINT(("Reordering minimal_tags\n")); 1419 for (i = 0; tnfa->minimal_tags[i] >= 0; i++) 1420 tnfa->minimal_tags[i] = tnfa->minimal_tags[i] < num_tags ? 1421 to_reorder[tnfa->minimal_tags[i]] : 1422 tnfa->minimal_tags[i]; 1423 1424 DPRINT(("Reordering AST tags\n")); 1425 STACK_PUSH(stack, voidptr, tree); 1426 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 1427 { 1428 node = tre_stack_pop_voidptr(stack); 1429 1430 switch (node->type) 1431 { 1432 case LITERAL: 1433 { 1434 tre_literal_t *lit = (tre_literal_t *)node->obj; 1435 if (IS_TAG(lit)) 1436 lit->code_max = to_reorder[lit->code_max]; 1437 break; 1438 } 1439 1440 case UNION: 1441 { 1442 tre_union_t *uni = (tre_union_t *)node->obj; 1443 STACK_PUSHX(stack, voidptr, uni->right); 1444 STACK_PUSHX(stack, voidptr, uni->left); 1445 break; 1446 } 1447 1448 case CATENATION: 1449 { 1450 tre_catenation_t *cat = (tre_catenation_t *)node->obj; 1451 STACK_PUSHX(stack, voidptr, cat->right); 1452 STACK_PUSHX(stack, voidptr, cat->left); 1453 break; 1454 } 1455 1456 case ITERATION: 1457 { 1458 tre_iteration_t *iter = (tre_iteration_t *)node->obj; 1459 STACK_PUSHX(stack, voidptr, iter->arg); 1460 break; 1461 } 1462 1463 default: 1464 assert(0); 1465 break; 1466 } 1467 } 1468 if (status != REG_OK) 1469 { 1470 DPRINT(("Error while reordering tags\n")); 1471 goto error_post_compile; 1472 } 1473 } 1474 1475 if (!first_pass) 1476 { 1477 if (tree->last_matched_branch) 1478 { 1479 tre_last_matched_branch_t *buf, *b, *bb; 1480 tre_last_matched_branch_pre_t *bp; 1481 tre_last_matched_t *u, *uu; 1482 tre_last_matched_pre_t *up; 1483 int *t; 1484 int i; 1485 #ifdef TRE_DEBUG 1486 tre_last_matched_branch_t *_b; 1487 tre_last_matched_t *_u; 1488 int *_t; 1489 1490 DPRINT(("last_match_branch_pre:\n")); 1491 print_last_match_branch_pre(tree->last_matched_branch, 0, num_tags); 1492 #endif /* TRE_DEBUG */ 1493 buf = (tre_last_matched_branch_t *)calloc(1, 1494 tree->last_matched_branch->tot_branches 1495 * sizeof(tre_last_matched_branch_t) + 1496 tree->last_matched_branch->tot_last_matched 1497 * sizeof(tre_last_matched_t) + 1498 tree->last_matched_branch->tot_tags * 1499 sizeof(int)); 1500 if (!buf) 1501 { 1502 status = REG_ESPACE; 1503 goto error_post_compile; 1504 } 1505 1506 b = buf; 1507 u = (tre_last_matched_t *)(b + 1508 tree->last_matched_branch->tot_branches); 1509 t = (int *)(u + tree->last_matched_branch->tot_last_matched); 1510 #ifdef TRE_DEBUG 1511 _b = b; 1512 _u = u; 1513 _t = t; 1514 #endif /* TRE_DEBUG */ 1515 DPRINT(("Copying info_pre to info\n")); 1516 STACK_PUSH(stack, voidptr, tree->last_matched_branch); 1517 STACK_PUSH(stack, int, 1); 1518 STACK_PUSH(stack, int, COPY_LAST_MATCHED_BRANCH); 1519 1520 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 1521 { 1522 switch (tre_stack_pop_int(stack)) 1523 { 1524 case COPY_LAST_MATCHED_BRANCH: 1525 i = tre_stack_pop_int(stack); 1526 /* The tre_last_matched_branch_pre_t * is still on the 1527 stack */ 1528 STACK_PUSHX(stack, voidptr, b); 1529 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT); 1530 b += i; 1531 break; 1532 1533 case COPY_LAST_MATCHED_BRANCH_NEXT: 1534 bb = tre_stack_pop_voidptr(stack); 1535 bp = tre_stack_pop_voidptr(stack); 1536 bb->n_last_matched = bp->n_last_matched; 1537 bb->cmp_tag = bp->cmp_tag; 1538 if (bp->n_tags > 0) 1539 { 1540 int n; 1541 n = bb->n_tags = bp->n_tags; 1542 bb->tags = t; 1543 for (i = 0; i < num_tags; i++) 1544 if (bit_test(bp->tags, i)) 1545 { 1546 *t++ = i; 1547 if (--n <= 0) 1548 break; 1549 } 1550 } 1551 if (bp->next) 1552 { 1553 STACK_PUSHX(stack, voidptr, bp->next); 1554 STACK_PUSHX(stack, voidptr, bb + 1); 1555 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT); 1556 } 1557 if (bp->n_last_matched > 0) 1558 { 1559 bb->last_matched = u; 1560 STACK_PUSHX(stack, voidptr, bp->last_matched); 1561 STACK_PUSHX(stack, int, bp->n_last_matched); 1562 STACK_PUSHX(stack, int, COPY_LAST_MATCHED); 1563 } 1564 break; 1565 1566 case COPY_LAST_MATCHED: 1567 i = tre_stack_pop_int(stack); 1568 /* The tre_last_matched_pre_t * is still on the stack */ 1569 STACK_PUSHX(stack, voidptr, u); 1570 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT); 1571 u += i; 1572 break; 1573 1574 case COPY_LAST_MATCHED_NEXT: 1575 uu = tre_stack_pop_voidptr(stack); 1576 up = tre_stack_pop_voidptr(stack); 1577 uu->n_branches = up->n_branches; 1578 uu->branches = b; 1579 uu->start_tag = up->start_tag; 1580 if (up->next) 1581 { 1582 STACK_PUSHX(stack, voidptr, up->next); 1583 STACK_PUSHX(stack, voidptr, uu + 1); 1584 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT); 1585 } 1586 STACK_PUSHX(stack, voidptr, up->branches); 1587 STACK_PUSHX(stack, int, up->n_branches); 1588 STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH); 1589 break; 1590 } 1591 } 1592 if (status != REG_OK) 1593 goto error_post_compile; 1594 #ifdef TRE_DEBUG 1595 DPRINT(("last_matched_branch:\n")); 1596 print_last_match_branch(buf, 0); 1597 if (b != _b + tree->last_matched_branch->tot_branches) 1598 DPRINT(("b/%p != _b + tree->last_matched_branch->tot_branches/%p\n", 1599 b, _b + tree->last_matched_branch->tot_branches)); 1600 if (u != _u + tree->last_matched_branch->tot_last_matched) 1601 DPRINT(("u/%p != _u + " 1602 "tree->last_matched_branch->tot_last_matched/%p\n", 1603 u, _u + tree->last_matched_branch->tot_last_matched)); 1604 if (t != _t + tree->last_matched_branch->tot_tags) 1605 DPRINT(("t/%p != _t + tree->last_matched_branch->tot_tags/%p\n", 1606 t, _t + tree->last_matched_branch->tot_tags)); 1607 #endif /* TRE_DEBUG */ 1608 tnfa->last_matched_branch = buf; 1609 } 1610 #ifdef TRE_DEBUG 1611 else 1612 DPRINT(("No last_match_branch_pre\n")); 1613 #endif /* TRE_DEBUG */ 1614 } 1615 1616 DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n", 1617 first_pass? "First pass" : "Second pass", num_tags)); 1618 #ifdef TRE_DEBUG 1619 tre_ast_print(tree); 1620 #endif /* TRE_DEBUG */ 1621 DPRINT(("tre_add_tags: tree->num_tags=%d num_tags=%d\n", tree->num_tags, 1622 num_tags)); 1623 assert(tree->num_tags == num_tags); 1624 tnfa->end_tag = num_tags; 1625 tnfa->num_tags = num_tags; 1626 tnfa->num_minimals = num_minimals; 1627 error_post_compile: 1628 free(reorder_tags); 1629 error_reorder_tags: 1630 free(orig_regset); 1631 error_regset: 1632 return status; 1633 } 1634 1635 /* 1636 AST to TNFA compilation routines. 1637 */ 1638 1639 typedef enum { 1640 COPY_RECURSE, 1641 COPY_SET_RESULT_PTR 1642 } tre_copyast_symbol_t; 1643 1644 /* Flags for tre_copy_ast(). */ 1645 #define COPY_REMOVE_TAGS 1 1646 #define COPY_MAXIMIZE_FIRST_TAG 2 1647 1648 static reg_errcode_t 1649 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, 1650 int flags, int *pos_add, tre_tag_direction_t *tag_directions, 1651 tre_ast_node_t **copy, int *max_pos) 1652 { 1653 reg_errcode_t status = REG_OK; 1654 int bottom = tre_stack_num_objects(stack); 1655 int num_copied = 0; 1656 int first_tag = 1; 1657 tre_ast_node_t **result = copy; 1658 tre_copyast_symbol_t symbol; 1659 1660 STACK_PUSH(stack, voidptr, ast); 1661 STACK_PUSH(stack, int, COPY_RECURSE); 1662 1663 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 1664 { 1665 tre_ast_node_t *node; 1666 if (status != REG_OK) 1667 break; 1668 1669 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack); 1670 switch (symbol) 1671 { 1672 case COPY_SET_RESULT_PTR: 1673 result = tre_stack_pop_voidptr(stack); 1674 break; 1675 case COPY_RECURSE: 1676 node = tre_stack_pop_voidptr(stack); 1677 switch (node->type) 1678 { 1679 case LITERAL: 1680 { 1681 tre_literal_t *lit = node->obj; 1682 int pos = lit->position; 1683 int min = lit->code_min; 1684 int max = lit->code_max; 1685 tre_bracket_match_list_t *list = !IS_SPECIAL(lit) ? 1686 lit->u.bracket_match_list : 1687 NULL; 1688 if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) 1689 { 1690 /* XXX - e.g. [ab] has only one position but two 1691 nodes, so we are creating holes in the state space 1692 here. Not fatal, just wastes memory. */ 1693 pos += *pos_add; 1694 num_copied++; 1695 } 1696 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS)) 1697 { 1698 /* Change this tag to empty. */ 1699 min = EMPTY; 1700 max = pos = -1; 1701 } 1702 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG) 1703 && first_tag) 1704 { 1705 /* Maximize the first tag. */ 1706 if (tag_directions[max] == TRE_TAG_LEFT_MAXIMIZE) 1707 tag_directions[max] = TRE_TAG_MAXIMIZE; 1708 first_tag = 0; 1709 } 1710 *result = tre_ast_new_literal(mem, min, max, pos); 1711 if (*result == NULL) 1712 status = REG_ESPACE; 1713 1714 if (pos > *max_pos) 1715 *max_pos = pos; 1716 1717 if (!IS_SPECIAL(lit)) 1718 ((tre_literal_t *)(*result)->obj)->u.bracket_match_list 1719 = list; 1720 break; 1721 } 1722 case UNION: 1723 { 1724 tre_union_t *uni = node->obj; 1725 tre_union_t *tmp; 1726 *result = tre_ast_new_union(mem, uni->left, uni->right); 1727 if (*result == NULL) 1728 { 1729 status = REG_ESPACE; 1730 break; 1731 } 1732 tmp = (*result)->obj; 1733 result = &tmp->left; 1734 STACK_PUSHX(stack, voidptr, uni->right); 1735 STACK_PUSHX(stack, int, COPY_RECURSE); 1736 STACK_PUSHX(stack, voidptr, &tmp->right); 1737 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); 1738 STACK_PUSHX(stack, voidptr, uni->left); 1739 STACK_PUSHX(stack, int, COPY_RECURSE); 1740 break; 1741 } 1742 case CATENATION: 1743 { 1744 tre_catenation_t *cat = node->obj; 1745 tre_catenation_t *tmp; 1746 *result = tre_ast_new_catenation(mem, cat->left, cat->right); 1747 if (*result == NULL) 1748 { 1749 status = REG_ESPACE; 1750 break; 1751 } 1752 tmp = (*result)->obj; 1753 tmp->left = NULL; 1754 tmp->right = NULL; 1755 result = &tmp->left; 1756 1757 STACK_PUSHX(stack, voidptr, cat->right); 1758 STACK_PUSHX(stack, int, COPY_RECURSE); 1759 STACK_PUSHX(stack, voidptr, &tmp->right); 1760 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); 1761 STACK_PUSHX(stack, voidptr, cat->left); 1762 STACK_PUSHX(stack, int, COPY_RECURSE); 1763 break; 1764 } 1765 case ITERATION: 1766 { 1767 tre_iteration_t *iter = node->obj; 1768 STACK_PUSHX(stack, voidptr, iter->arg); 1769 STACK_PUSHX(stack, int, COPY_RECURSE); 1770 *result = tre_ast_new_iter(mem, iter->arg, iter->min, 1771 iter->max, iter->minimal); 1772 if (*result == NULL) 1773 { 1774 status = REG_ESPACE; 1775 break; 1776 } 1777 iter = (*result)->obj; 1778 result = &iter->arg; 1779 break; 1780 } 1781 default: 1782 assert(0); 1783 break; 1784 } 1785 break; 1786 } 1787 } 1788 *pos_add += num_copied; 1789 return status; 1790 } 1791 1792 typedef enum { 1793 EXPAND_RECURSE, 1794 EXPAND_AFTER_ITER 1795 } tre_expand_ast_symbol_t; 1796 1797 /* Expands each iteration node that has a finite nonzero minimum or maximum 1798 iteration count to a catenated sequence of copies of the node. */ 1799 static reg_errcode_t 1800 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, 1801 int *position, tre_tag_direction_t *tag_directions, 1802 int *max_depth) 1803 { 1804 reg_errcode_t status = REG_OK; 1805 int bottom = tre_stack_num_objects(stack); 1806 int pos_add = 0; 1807 int pos_add_total = 0; 1808 int max_pos = 0; 1809 int iter_depth = 0; 1810 1811 STACK_PUSHR(stack, voidptr, ast); 1812 STACK_PUSHR(stack, int, EXPAND_RECURSE); 1813 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 1814 { 1815 tre_ast_node_t *node; 1816 tre_expand_ast_symbol_t symbol; 1817 1818 if (status != REG_OK) 1819 break; 1820 1821 DPRINT(("pos_add %d\n", pos_add)); 1822 1823 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack); 1824 node = tre_stack_pop_voidptr(stack); 1825 switch (symbol) 1826 { 1827 case EXPAND_RECURSE: 1828 switch (node->type) 1829 { 1830 case LITERAL: 1831 { 1832 tre_literal_t *lit= node->obj; 1833 if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) 1834 { 1835 lit->position += pos_add; 1836 if (lit->position > max_pos) 1837 max_pos = lit->position; 1838 } 1839 break; 1840 } 1841 case UNION: 1842 { 1843 tre_union_t *uni = node->obj; 1844 STACK_PUSHX(stack, voidptr, uni->right); 1845 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1846 STACK_PUSHX(stack, voidptr, uni->left); 1847 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1848 break; 1849 } 1850 case CATENATION: 1851 { 1852 tre_catenation_t *cat = node->obj; 1853 STACK_PUSHX(stack, voidptr, cat->right); 1854 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1855 STACK_PUSHX(stack, voidptr, cat->left); 1856 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1857 break; 1858 } 1859 case ITERATION: 1860 { 1861 tre_iteration_t *iter = node->obj; 1862 STACK_PUSHX(stack, int, pos_add); 1863 STACK_PUSHX(stack, voidptr, node); 1864 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER); 1865 STACK_PUSHX(stack, voidptr, iter->arg); 1866 STACK_PUSHX(stack, int, EXPAND_RECURSE); 1867 /* If we are going to expand this node at EXPAND_AFTER_ITER 1868 then don't increase the `pos' fields of the nodes now, it 1869 will get done when expanding. */ 1870 if (iter->min > 1 || iter->max > 1) 1871 pos_add = 0; 1872 iter_depth++; 1873 DPRINT(("iter\n")); 1874 break; 1875 } 1876 default: 1877 assert(0); 1878 break; 1879 } 1880 break; 1881 case EXPAND_AFTER_ITER: 1882 { 1883 tre_iteration_t *iter = node->obj; 1884 int pos_add_last; 1885 pos_add = tre_stack_pop_int(stack); 1886 pos_add_last = pos_add; 1887 /* Originally (in tre_parse_bound), if min == 0 && max == 0, we 1888 immediate replace the whole iteration with EMPTY. This 1889 unfortunately drops any submatches, and messes up setting the 1890 pmatch values (we can get tags of -1, and tag values in the 1891 billions). So we left it there and replace with EMPTY here. */ 1892 if (iter->min == 0 && iter->max == 0) 1893 { 1894 tre_ast_node_t *empty = tre_ast_new_literal(mem, EMPTY, -1, -1); 1895 if (empty == NULL) 1896 return REG_ESPACE; 1897 node->obj = empty->obj; 1898 node->type = empty->type; 1899 } 1900 else if (iter->min > 1 || iter->max > 1) 1901 { 1902 tre_ast_node_t *seq1 = NULL, *seq2 = NULL; 1903 int j; 1904 int pos_add_save = pos_add; 1905 1906 /* Create a catenated sequence of copies of the node. */ 1907 for (j = 0; j < iter->min; j++) 1908 { 1909 tre_ast_node_t *copy; 1910 /* Remove tags from all but the last copy. */ 1911 int flags = ((j + 1 < iter->min) 1912 ? COPY_REMOVE_TAGS 1913 : COPY_MAXIMIZE_FIRST_TAG); 1914 DPRINT((" pos_add %d\n", pos_add)); 1915 pos_add_save = pos_add; 1916 status = tre_copy_ast(mem, stack, iter->arg, flags, 1917 &pos_add, tag_directions, ©, 1918 &max_pos); 1919 if (status != REG_OK) 1920 return status; 1921 if (seq1 != NULL) 1922 seq1 = tre_ast_new_catenation(mem, seq1, copy); 1923 else 1924 seq1 = copy; 1925 if (seq1 == NULL) 1926 return REG_ESPACE; 1927 } 1928 1929 if (iter->max == -1) 1930 { 1931 /* No upper limit. */ 1932 pos_add_save = pos_add; 1933 status = tre_copy_ast(mem, stack, iter->arg, 0, 1934 &pos_add, NULL, &seq2, &max_pos); 1935 if (status != REG_OK) 1936 return status; 1937 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0); 1938 if (seq2 == NULL) 1939 return REG_ESPACE; 1940 } 1941 else 1942 { 1943 for (j = iter->min; j < iter->max; j++) 1944 { 1945 tre_ast_node_t *tmp, *copy; 1946 pos_add_save = pos_add; 1947 status = tre_copy_ast(mem, stack, iter->arg, 0, 1948 &pos_add, NULL, ©, &max_pos); 1949 if (status != REG_OK) 1950 return status; 1951 if (seq2 != NULL) 1952 seq2 = tre_ast_new_catenation(mem, copy, seq2); 1953 else 1954 seq2 = copy; 1955 if (seq2 == NULL) 1956 return REG_ESPACE; 1957 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1); 1958 if (tmp == NULL) 1959 return REG_ESPACE; 1960 seq2 = tre_ast_new_union(mem, tmp, seq2); 1961 if (seq2 == NULL) 1962 return REG_ESPACE; 1963 } 1964 } 1965 1966 pos_add = pos_add_save; 1967 if (seq1 == NULL) 1968 seq1 = seq2; 1969 else if (seq2 != NULL) 1970 seq1 = tre_ast_new_catenation(mem, seq1, seq2); 1971 if (seq1 == NULL) 1972 return REG_ESPACE; 1973 node->obj = seq1->obj; 1974 node->type = seq1->type; 1975 } 1976 1977 iter_depth--; 1978 pos_add_total += pos_add - pos_add_last; 1979 if (iter_depth == 0) 1980 pos_add = pos_add_total; 1981 1982 break; 1983 } 1984 default: 1985 assert(0); 1986 break; 1987 } 1988 } 1989 1990 *position += pos_add_total; 1991 1992 /* `max_pos' should never be larger than `*position' if the above 1993 code works, but just an extra safeguard let's make sure 1994 `*position' is set large enough so enough memory will be 1995 allocated for the transition table. */ 1996 if (max_pos > *position) 1997 *position = max_pos; 1998 1999 #ifdef TRE_DEBUG 2000 DPRINT(("Expanded AST:\n")); 2001 tre_ast_print(ast); 2002 DPRINT(("*position %d, max_pos %d\n", *position, max_pos)); 2003 #endif 2004 2005 return status; 2006 } 2007 2008 static tre_pos_and_tags_t * 2009 tre_set_empty(tre_mem_t mem) 2010 { 2011 tre_pos_and_tags_t *new_set; 2012 2013 new_set = tre_mem_calloc(mem, sizeof(*new_set)); 2014 if (new_set == NULL) 2015 return NULL; 2016 2017 new_set[0].position = -1; 2018 new_set[0].code_min = -1; 2019 new_set[0].code_max = -1; 2020 2021 return new_set; 2022 } 2023 2024 static tre_pos_and_tags_t * 2025 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max, 2026 tre_bracket_match_list_t *bracket_match_list, int backref) 2027 { 2028 tre_pos_and_tags_t *new_set; 2029 2030 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2); 2031 if (new_set == NULL) 2032 return NULL; 2033 2034 new_set[0].position = position; 2035 new_set[0].code_min = code_min; 2036 new_set[0].code_max = code_max; 2037 new_set[0].bracket_match_list = bracket_match_list; 2038 new_set[0].backref = backref; 2039 new_set[1].position = -1; 2040 new_set[1].code_min = -1; 2041 new_set[1].code_max = -1; 2042 2043 return new_set; 2044 } 2045 2046 static tre_pos_and_tags_t * 2047 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2, 2048 int *tags, int assertions, int *params) 2049 { 2050 int s1, s2, i, j; 2051 tre_pos_and_tags_t *new_set; 2052 int *new_tags; 2053 int num_tags; 2054 2055 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++); 2056 for (s1 = 0; set1[s1].position >= 0; s1++); 2057 for (s2 = 0; set2[s2].position >= 0; s2++); 2058 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1)); 2059 if (!new_set ) 2060 return NULL; 2061 2062 for (s1 = 0; set1[s1].position >= 0; s1++) 2063 { 2064 new_set[s1].position = set1[s1].position; 2065 new_set[s1].code_min = set1[s1].code_min; 2066 new_set[s1].code_max = set1[s1].code_max; 2067 new_set[s1].assertions = set1[s1].assertions | assertions; 2068 new_set[s1].bracket_match_list = set1[s1].bracket_match_list; 2069 new_set[s1].backref = set1[s1].backref; 2070 if (set1[s1].tags == NULL && tags == NULL) 2071 new_set[s1].tags = NULL; 2072 else 2073 { 2074 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++); 2075 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags) 2076 * (i + num_tags + 1))); 2077 if (new_tags == NULL) 2078 return NULL; 2079 for (j = 0; j < i; j++) 2080 new_tags[j] = set1[s1].tags[j]; 2081 for (i = 0; i < num_tags; i++) 2082 new_tags[j + i] = tags[i]; 2083 new_tags[j + i] = -1; 2084 new_set[s1].tags = new_tags; 2085 } 2086 if (set1[s1].params) 2087 new_set[s1].params = set1[s1].params; 2088 if (params) 2089 { 2090 if (!new_set[s1].params) 2091 new_set[s1].params = params; 2092 else 2093 { 2094 new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) * 2095 TRE_PARAM_LAST); 2096 if (!new_set[s1].params) 2097 return NULL; 2098 for (i = 0; i < TRE_PARAM_LAST; i++) 2099 if (params[i] != TRE_PARAM_UNSET) 2100 new_set[s1].params[i] = params[i]; 2101 } 2102 } 2103 } 2104 2105 for (s2 = 0; set2[s2].position >= 0; s2++) 2106 { 2107 new_set[s1 + s2].position = set2[s2].position; 2108 new_set[s1 + s2].code_min = set2[s2].code_min; 2109 new_set[s1 + s2].code_max = set2[s2].code_max; 2110 /* XXX - why not | assertions here as well? */ 2111 new_set[s1 + s2].assertions = set2[s2].assertions; 2112 new_set[s1 + s2].bracket_match_list = set2[s2].bracket_match_list; 2113 new_set[s1 + s2].backref = set2[s2].backref; 2114 if (set2[s2].tags == NULL) 2115 new_set[s1 + s2].tags = NULL; 2116 else 2117 { 2118 for (i = 0; set2[s2].tags[i] >= 0; i++); 2119 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1)); 2120 if (new_tags == NULL) 2121 return NULL; 2122 for (j = 0; j < i; j++) 2123 new_tags[j] = set2[s2].tags[j]; 2124 new_tags[j] = -1; 2125 new_set[s1 + s2].tags = new_tags; 2126 } 2127 if (set2[s2].params) 2128 new_set[s1 + s2].params = set2[s2].params; 2129 if (params) 2130 { 2131 if (!new_set[s1 + s2].params) 2132 new_set[s1 + s2].params = params; 2133 else 2134 { 2135 new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) * 2136 TRE_PARAM_LAST); 2137 if (!new_set[s1 + s2].params) 2138 return NULL; 2139 for (i = 0; i < TRE_PARAM_LAST; i++) 2140 if (params[i] != TRE_PARAM_UNSET) 2141 new_set[s1 + s2].params[i] = params[i]; 2142 } 2143 } 2144 } 2145 new_set[s1 + s2].position = -1; 2146 return new_set; 2147 } 2148 2149 /* Finds the empty path through `node' which is the one that should be 2150 taken according to POSIX.2 rules, and adds the tags on that path to 2151 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is 2152 set to the number of tags seen on the path. */ 2153 static reg_errcode_t 2154 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, 2155 int *assertions, int *params, int *num_tags_seen, 2156 int *params_seen) 2157 { 2158 tre_literal_t *lit; 2159 tre_union_t *uni; 2160 tre_catenation_t *cat; 2161 tre_iteration_t *iter; 2162 int i; 2163 int bottom = tre_stack_num_objects(stack); 2164 reg_errcode_t status = REG_OK; 2165 if (num_tags_seen) 2166 *num_tags_seen = 0; 2167 if (params_seen) 2168 *params_seen = 0; 2169 2170 status = tre_stack_push_voidptr(stack, node); 2171 2172 /* Walk through the tree recursively. */ 2173 while (status == REG_OK && tre_stack_num_objects(stack) > bottom) 2174 { 2175 node = tre_stack_pop_voidptr(stack); 2176 2177 switch (node->type) 2178 { 2179 case LITERAL: 2180 lit = (tre_literal_t *)node->obj; 2181 switch (lit->code_min) 2182 { 2183 case TAG: 2184 if (lit->code_max >= 0) 2185 { 2186 if (tags != NULL) 2187 { 2188 /* Add the tag to `tags'. */ 2189 for (i = 0; tags[i] >= 0; i++) 2190 if (tags[i] == lit->code_max) 2191 break; 2192 if (tags[i] < 0) 2193 { 2194 tags[i] = lit->code_max; 2195 tags[i + 1] = -1; 2196 } 2197 } 2198 if (num_tags_seen) 2199 (*num_tags_seen)++; 2200 } 2201 break; 2202 case ASSERTION: 2203 assert(lit->code_max >= 1 2204 || lit->code_max <= ASSERT_LAST); 2205 if (assertions != NULL) 2206 *assertions |= lit->code_max; 2207 break; 2208 case PARAMETER: 2209 if (params != NULL) 2210 for (i = 0; i < TRE_PARAM_LAST; i++) 2211 params[i] = lit->u.params[i]; 2212 if (params_seen != NULL) 2213 *params_seen = 1; 2214 break; 2215 case EMPTY: 2216 break; 2217 default: 2218 assert(0); 2219 break; 2220 } 2221 break; 2222 2223 case UNION: 2224 /* Subexpressions starting earlier take priority over ones 2225 starting later, so we prefer the left subexpression over the 2226 right subexpression. */ 2227 uni = (tre_union_t *)node->obj; 2228 if (uni->left->nullable) 2229 STACK_PUSHX(stack, voidptr, uni->left) 2230 else if (uni->right->nullable) 2231 STACK_PUSHX(stack, voidptr, uni->right) 2232 else 2233 assert(0); 2234 break; 2235 2236 case CATENATION: 2237 /* The path must go through both children. */ 2238 cat = (tre_catenation_t *)node->obj; 2239 assert(cat->left->nullable); 2240 assert(cat->right->nullable); 2241 STACK_PUSHX(stack, voidptr, cat->left); 2242 STACK_PUSHX(stack, voidptr, cat->right); 2243 break; 2244 2245 case ITERATION: 2246 /* A match with an empty string is preferred over no match at 2247 all, so we go through the argument if possible. */ 2248 iter = (tre_iteration_t *)node->obj; 2249 if (iter->arg->nullable) 2250 STACK_PUSHX(stack, voidptr, iter->arg); 2251 break; 2252 2253 default: 2254 assert(0); 2255 break; 2256 } 2257 } 2258 2259 return status; 2260 } 2261 2262 typedef enum { 2263 NFL_RECURSE, 2264 NFL_POST_UNION, 2265 NFL_POST_CATENATION, 2266 NFL_POST_ITERATION 2267 } tre_nfl_stack_symbol_t; 2268 2269 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for 2270 the nodes of the AST `tree'. */ 2271 static reg_errcode_t 2272 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) 2273 { 2274 int bottom = tre_stack_num_objects(stack); 2275 2276 STACK_PUSHR(stack, voidptr, tree); 2277 STACK_PUSHR(stack, int, NFL_RECURSE); 2278 2279 while (tre_stack_num_objects(stack) > bottom) 2280 { 2281 tre_nfl_stack_symbol_t symbol; 2282 tre_ast_node_t *node; 2283 2284 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack); 2285 node = tre_stack_pop_voidptr(stack); 2286 switch (symbol) 2287 { 2288 case NFL_RECURSE: 2289 switch (node->type) 2290 { 2291 case LITERAL: 2292 { 2293 tre_literal_t *lit = (tre_literal_t *)node->obj; 2294 if (IS_BACKREF(lit)) 2295 { 2296 /* Back references: nullable = false, firstpos = {i}, 2297 lastpos = {i}. */ 2298 node->nullable = 0; 2299 node->firstpos = tre_set_one(mem, lit->position, 0, 2300 TRE_CHAR_MAX, NULL, -1); 2301 if (!node->firstpos) 2302 return REG_ESPACE; 2303 node->lastpos = tre_set_one(mem, lit->position, 0, 2304 TRE_CHAR_MAX, NULL, 2305 (int)lit->code_max); 2306 if (!node->lastpos) 2307 return REG_ESPACE; 2308 } 2309 else if (lit->code_min < 0) 2310 { 2311 /* Tags, empty strings, params, and zero width assertions: 2312 nullable = true, firstpos = {}, and lastpos = {}. */ 2313 node->nullable = 1; 2314 node->firstpos = tre_set_empty(mem); 2315 if (!node->firstpos) 2316 return REG_ESPACE; 2317 node->lastpos = tre_set_empty(mem); 2318 if (!node->lastpos) 2319 return REG_ESPACE; 2320 } 2321 else 2322 { 2323 /* Literal at position i: nullable = false, firstpos = {i}, 2324 lastpos = {i}. */ 2325 node->nullable = 0; 2326 node->firstpos = 2327 tre_set_one(mem, lit->position, (int)lit->code_min, 2328 (int)lit->code_max, NULL, -1); 2329 if (!node->firstpos) 2330 return REG_ESPACE; 2331 node->lastpos = tre_set_one(mem, lit->position, 2332 (int)lit->code_min, 2333 (int)lit->code_max, 2334 lit->u.bracket_match_list, 2335 -1); 2336 if (!node->lastpos) 2337 return REG_ESPACE; 2338 } 2339 break; 2340 } 2341 2342 case UNION: 2343 /* Compute the attributes for the two subtrees, and after that 2344 for this node. */ 2345 STACK_PUSHR(stack, voidptr, node); 2346 STACK_PUSHR(stack, int, NFL_POST_UNION); 2347 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right); 2348 STACK_PUSHR(stack, int, NFL_RECURSE); 2349 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left); 2350 STACK_PUSHR(stack, int, NFL_RECURSE); 2351 break; 2352 2353 case CATENATION: 2354 /* Compute the attributes for the two subtrees, and after that 2355 for this node. */ 2356 STACK_PUSHR(stack, voidptr, node); 2357 STACK_PUSHR(stack, int, NFL_POST_CATENATION); 2358 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right); 2359 STACK_PUSHR(stack, int, NFL_RECURSE); 2360 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left); 2361 STACK_PUSHR(stack, int, NFL_RECURSE); 2362 break; 2363 2364 case ITERATION: 2365 /* Compute the attributes for the subtree, and after that for 2366 this node. */ 2367 STACK_PUSHR(stack, voidptr, node); 2368 STACK_PUSHR(stack, int, NFL_POST_ITERATION); 2369 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg); 2370 STACK_PUSHR(stack, int, NFL_RECURSE); 2371 break; 2372 } 2373 break; /* end case: NFL_RECURSE */ 2374 2375 case NFL_POST_UNION: 2376 { 2377 tre_union_t *uni = (tre_union_t *)node->obj; 2378 node->nullable = uni->left->nullable || uni->right->nullable; 2379 node->firstpos = tre_set_union(mem, uni->left->firstpos, 2380 uni->right->firstpos, NULL, 0, NULL); 2381 if (!node->firstpos) 2382 return REG_ESPACE; 2383 node->lastpos = tre_set_union(mem, uni->left->lastpos, 2384 uni->right->lastpos, NULL, 0, NULL); 2385 if (!node->lastpos) 2386 return REG_ESPACE; 2387 break; 2388 } 2389 2390 case NFL_POST_ITERATION: 2391 { 2392 int num_tags, *tags, assertions, params_seen; 2393 int *params; 2394 reg_errcode_t status; 2395 tre_iteration_t *iter = (tre_iteration_t *)node->obj; 2396 2397 /* From Ville Laurikari's original 2001 Master's thesis, the 2398 firstpos(n) and lastpos(n) of an iteration is just the 2399 corresponding values of the iteration's argument. Unfortunately, 2400 this isn't sufficient for the following BRE: 2401 2402 \(a*\)*b\(\1\) matched against ab 2403 2404 The backreference wants to force the first subexpression to 2405 be the empty string, but there is no transition for this. So 2406 we need to modify the lastpos(n) of an iteration to be the 2407 equivalent of that of catentation. Using the same notation as 2408 in the thesis, lastpos(n) is redefined as: 2409 2410 if nullable(c1) then 2411 lastpos(c1) U 2412 addtags(lastpos(c1), 2413 emptymatch(c1)) 2414 else 2415 lastpos(c1) 2416 2417 where c1 is the argument node. firstpos(n) remains the same. */ 2418 2419 /* Compute lastpos. */ 2420 if (iter->min == 0 || iter->arg->nullable) 2421 { 2422 node->nullable = 1; 2423 if (iter->arg->nullable) 2424 { 2425 /* The arg matches the empty string. Make a first pass 2426 with tre_match_empty() to get the number of tags and 2427 parameters. */ 2428 status = tre_match_empty(stack, iter->arg, 2429 NULL, NULL, NULL, &num_tags, 2430 ¶ms_seen); 2431 if (status != REG_OK) 2432 return status; 2433 /* Allocate arrays for the tags and parameters. */ 2434 tags = malloc(sizeof(int) * (num_tags + 1)); 2435 if (!tags) 2436 return REG_ESPACE; 2437 tags[0] = -1; 2438 assertions = 0; 2439 params = NULL; 2440 if (params_seen) 2441 { 2442 params = tre_mem_alloc(mem, sizeof(*params) 2443 * TRE_PARAM_LAST); 2444 if (!params) 2445 { 2446 free(tags); 2447 return REG_ESPACE; 2448 } 2449 } 2450 /* Second pass with tre_mach_empty() to get the list of 2451 tags and parameters. */ 2452 status = tre_match_empty(stack, iter->arg, tags, 2453 &assertions, params, NULL, NULL); 2454 if (status != REG_OK) 2455 { 2456 free(tags); 2457 return status; 2458 } 2459 node->lastpos = 2460 tre_set_union(mem, iter->arg->lastpos, iter->arg->lastpos, 2461 tags, assertions, params); 2462 free(tags); 2463 if (!node->lastpos) 2464 return REG_ESPACE; 2465 } 2466 else 2467 node->lastpos = iter->arg->lastpos; 2468 } 2469 else 2470 { 2471 node->nullable = 0; 2472 node->lastpos = iter->arg->lastpos; 2473 } 2474 node->firstpos = iter->arg->firstpos; 2475 break; 2476 } 2477 2478 case NFL_POST_CATENATION: 2479 { 2480 int num_tags, *tags, assertions, params_seen; 2481 int *params; 2482 reg_errcode_t status; 2483 tre_catenation_t *cat = node->obj; 2484 node->nullable = cat->left->nullable && cat->right->nullable; 2485 2486 /* Compute firstpos. */ 2487 if (cat->left->nullable) 2488 { 2489 /* The left side matches the empty string. Make a first pass 2490 with tre_match_empty() to get the number of tags and 2491 parameters. */ 2492 status = tre_match_empty(stack, cat->left, 2493 NULL, NULL, NULL, &num_tags, 2494 ¶ms_seen); 2495 if (status != REG_OK) 2496 return status; 2497 /* Allocate arrays for the tags and parameters. */ 2498 tags = malloc(sizeof(*tags) * (num_tags + 1)); 2499 if (!tags) 2500 return REG_ESPACE; 2501 tags[0] = -1; 2502 assertions = 0; 2503 params = NULL; 2504 if (params_seen) 2505 { 2506 params = tre_mem_alloc(mem, sizeof(*params) 2507 * TRE_PARAM_LAST); 2508 if (!params) 2509 { 2510 free(tags); 2511 return REG_ESPACE; 2512 } 2513 } 2514 /* Second pass with tre_mach_empty() to get the list of 2515 tags and parameters. */ 2516 status = tre_match_empty(stack, cat->left, tags, 2517 &assertions, params, NULL, NULL); 2518 if (status != REG_OK) 2519 { 2520 free(tags); 2521 return status; 2522 } 2523 node->firstpos = 2524 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos, 2525 tags, assertions, params); 2526 free(tags); 2527 if (!node->firstpos) 2528 return REG_ESPACE; 2529 } 2530 else 2531 { 2532 node->firstpos = cat->left->firstpos; 2533 } 2534 2535 /* Compute lastpos. */ 2536 if (cat->right->nullable) 2537 { 2538 /* The right side matches the empty string. Make a first pass 2539 with tre_match_empty() to get the number of tags and 2540 parameters. */ 2541 status = tre_match_empty(stack, cat->right, 2542 NULL, NULL, NULL, &num_tags, 2543 ¶ms_seen); 2544 if (status != REG_OK) 2545 return status; 2546 /* Allocate arrays for the tags and parameters. */ 2547 tags = malloc(sizeof(int) * (num_tags + 1)); 2548 if (!tags) 2549 return REG_ESPACE; 2550 tags[0] = -1; 2551 assertions = 0; 2552 params = NULL; 2553 if (params_seen) 2554 { 2555 params = tre_mem_alloc(mem, sizeof(*params) 2556 * TRE_PARAM_LAST); 2557 if (!params) 2558 { 2559 free(tags); 2560 return REG_ESPACE; 2561 } 2562 } 2563 /* Second pass with tre_mach_empty() to get the list of 2564 tags and parameters. */ 2565 status = tre_match_empty(stack, cat->right, tags, 2566 &assertions, params, NULL, NULL); 2567 if (status != REG_OK) 2568 { 2569 free(tags); 2570 return status; 2571 } 2572 node->lastpos = 2573 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos, 2574 tags, assertions, params); 2575 free(tags); 2576 if (!node->lastpos) 2577 return REG_ESPACE; 2578 } 2579 else 2580 { 2581 node->lastpos = cat->right->lastpos; 2582 } 2583 break; 2584 } 2585 2586 default: 2587 assert(0); 2588 break; 2589 } 2590 } 2591 2592 return REG_OK; 2593 } 2594 2595 /* Adds a transition from each position in `p1' to each position in `p2'. */ 2596 static reg_errcode_t 2597 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, 2598 tre_tnfa_transition_t *transitions, 2599 int *counts, int *offs) 2600 { 2601 tre_pos_and_tags_t *orig_p2 = p2; 2602 tre_tnfa_transition_t *trans; 2603 int i, j, k, l, dup, prev_p2_pos; 2604 2605 if (transitions != NULL) 2606 while (p1->position >= 0) 2607 { 2608 p2 = orig_p2; 2609 prev_p2_pos = -1; 2610 while (p2->position >= 0) 2611 { 2612 /* Optimization: if this position was already handled, skip it. */ 2613 if (p2->position == prev_p2_pos) 2614 { 2615 p2++; 2616 continue; 2617 } 2618 prev_p2_pos = p2->position; 2619 /* Set `trans' to point to the next unused transition from 2620 position `p1->position'. */ 2621 trans = transitions + offs[p1->position]; 2622 while (trans->state != NULL) 2623 { 2624 #if 0 2625 /* If we find a previous transition from `p1->position' to 2626 `p2->position', it is overwritten. This can happen only 2627 if there are nested loops in the regexp, like in "((a)*)*". 2628 In POSIX.2 repetition using the outer loop is always 2629 preferred over using the inner loop. Therefore the 2630 transition for the inner loop is useless and can be thrown 2631 away. */ 2632 /* XXX - The same position is used for all nodes in a bracket 2633 expression, so this optimization cannot be used (it will 2634 break bracket expressions) unless I figure out a way to 2635 detect it here. */ 2636 if (trans->state_id == p2->position) 2637 { 2638 DPRINT(("*")); 2639 break; 2640 } 2641 #endif 2642 trans++; 2643 } 2644 2645 if (trans->state == NULL) 2646 (trans + 1)->state = NULL; 2647 /* Use the character ranges, assertions, etc. from `p1' for 2648 the transition from `p1' to `p2'. */ 2649 trans->code_min = p1->code_min; 2650 trans->code_max = p1->code_max; 2651 trans->state = transitions + offs[p2->position]; 2652 trans->state_id = p2->position; 2653 trans->assertions = p1->assertions | p2->assertions 2654 | (p1->bracket_match_list != NULL ? ASSERT_BRACKET_MATCH : 0); 2655 if (p1->backref >= 0) 2656 { 2657 assert((trans->assertions & ASSERT_BRACKET_MATCH) == 0); 2658 assert(p2->backref < 0); 2659 trans->u.backref = p1->backref; 2660 trans->assertions |= ASSERT_BACKREF; 2661 } 2662 if (p1->bracket_match_list != NULL) 2663 { 2664 trans->u.bracket_match_list = 2665 malloc(SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list)); 2666 if (trans->u.bracket_match_list == NULL) 2667 return REG_ESPACE; 2668 memcpy(trans->u.bracket_match_list, p1->bracket_match_list, 2669 SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list)); 2670 } 2671 2672 /* Find out how many tags this transition has. */ 2673 i = 0; 2674 if (p1->tags != NULL) 2675 while(p1->tags[i] >= 0) 2676 i++; 2677 j = 0; 2678 if (p2->tags != NULL) 2679 while(p2->tags[j] >= 0) 2680 j++; 2681 2682 /* If we are overwriting a transition, free the old tag array. */ 2683 if (trans->tags != NULL) 2684 free(trans->tags); 2685 trans->tags = NULL; 2686 2687 /* If there were any tags, allocate an array and fill it. */ 2688 if (i + j > 0) 2689 { 2690 trans->tags = malloc(sizeof(*trans->tags) * (i + j + 1)); 2691 if (!trans->tags) 2692 return REG_ESPACE; 2693 i = 0; 2694 if (p1->tags != NULL) 2695 while(p1->tags[i] >= 0) 2696 { 2697 trans->tags[i] = p1->tags[i]; 2698 i++; 2699 } 2700 l = i; 2701 j = 0; 2702 if (p2->tags != NULL) 2703 while (p2->tags[j] >= 0) 2704 { 2705 /* Don't add duplicates. */ 2706 dup = 0; 2707 for (k = 0; k < i; k++) 2708 if (trans->tags[k] == p2->tags[j]) 2709 { 2710 dup = 1; 2711 break; 2712 } 2713 if (!dup) 2714 trans->tags[l++] = p2->tags[j]; 2715 j++; 2716 } 2717 trans->tags[l] = -1; 2718 } 2719 2720 /* Set the parameter array. If both `p2' and `p1' have same 2721 parameters, the values in `p2' override those in `p1'. */ 2722 if (p1->params || p2->params) 2723 { 2724 if (!trans->params) 2725 trans->params = malloc(sizeof(*trans->params) 2726 * TRE_PARAM_LAST); 2727 if (!trans->params) 2728 return REG_ESPACE; 2729 for (i = 0; i < TRE_PARAM_LAST; i++) 2730 { 2731 trans->params[i] = TRE_PARAM_UNSET; 2732 if (p1->params && p1->params[i] != TRE_PARAM_UNSET) 2733 trans->params[i] = p1->params[i]; 2734 if (p2->params && p2->params[i] != TRE_PARAM_UNSET) 2735 trans->params[i] = p2->params[i]; 2736 } 2737 } 2738 else 2739 { 2740 if (trans->params) 2741 free(trans->params); 2742 trans->params = NULL; 2743 } 2744 2745 #ifdef TRE_DEBUG 2746 { 2747 int *tags; 2748 2749 DPRINT((" %2d -> %2d on %3d", p1->position, p2->position, 2750 p1->code_min)); 2751 if (p1->code_max != p1->code_min) 2752 DPRINT(("-%3d", p1->code_max)); 2753 tags = trans->tags; 2754 if (tags) 2755 { 2756 DPRINT((", tags [")); 2757 while (*tags >= 0) 2758 { 2759 DPRINT(("%d", *tags)); 2760 tags++; 2761 if (*tags >= 0) 2762 DPRINT((",")); 2763 } 2764 DPRINT(("]")); 2765 } 2766 if (trans->assertions) 2767 DPRINT((", assert %d", trans->assertions)); 2768 if (trans->assertions & ASSERT_BACKREF) 2769 DPRINT((", backref %d", trans->u.backref)); 2770 else if (trans->assertions & ASSERT_BRACKET_MATCH) 2771 DPRINT((", bracket_match_list %p", 2772 trans->u.bracket_match_list)); 2773 if (trans->params) 2774 { 2775 DPRINT((", ")); 2776 tre_print_params(trans->params); 2777 } 2778 DPRINT(("\n")); 2779 } 2780 #endif /* TRE_DEBUG */ 2781 p2++; 2782 } 2783 p1++; 2784 } 2785 else 2786 /* Compute a maximum limit for the number of transitions leaving 2787 from each state. */ 2788 while (p1->position >= 0) 2789 { 2790 p2 = orig_p2; 2791 while (p2->position >= 0) 2792 { 2793 counts[p1->position]++; 2794 p2++; 2795 } 2796 p1++; 2797 } 2798 return REG_OK; 2799 } 2800 2801 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are 2802 labelled with one character range (there are no transitions on empty 2803 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of 2804 the regexp. */ 2805 static reg_errcode_t 2806 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, 2807 int *counts, int *offs) 2808 { 2809 tre_union_t *uni; 2810 tre_catenation_t *cat; 2811 tre_iteration_t *iter; 2812 reg_errcode_t errcode = REG_OK; 2813 2814 /* XXX - recurse using a stack!. */ 2815 switch (node->type) 2816 { 2817 case LITERAL: 2818 break; 2819 case UNION: 2820 uni = (tre_union_t *)node->obj; 2821 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs); 2822 if (errcode != REG_OK) 2823 return errcode; 2824 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs); 2825 break; 2826 2827 case CATENATION: 2828 cat = (tre_catenation_t *)node->obj; 2829 /* Add a transition from each position in cat->left->lastpos 2830 to each position in cat->right->firstpos. */ 2831 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos, 2832 transitions, counts, offs); 2833 if (errcode != REG_OK) 2834 return errcode; 2835 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs); 2836 if (errcode != REG_OK) 2837 return errcode; 2838 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs); 2839 break; 2840 2841 case ITERATION: 2842 iter = (tre_iteration_t *)node->obj; 2843 assert(iter->max == -1 || iter->max == 1); 2844 2845 if (iter->max == -1) 2846 { 2847 assert(iter->min == 0 || iter->min == 1); 2848 /* Add a transition from each last position in the iterated 2849 expression to each first position. */ 2850 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos, 2851 transitions, counts, offs); 2852 if (errcode != REG_OK) 2853 return errcode; 2854 } 2855 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs); 2856 break; 2857 } 2858 return errcode; 2859 } 2860 2861 #define ERROR_EXIT(err) \ 2862 do \ 2863 { \ 2864 errcode = err; \ 2865 if (/*CONSTCOND*/1) \ 2866 goto error_exit; \ 2867 } \ 2868 while (/*CONSTCOND*/0) 2869 2870 int 2871 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags, 2872 locale_t loc) 2873 { 2874 tre_stack_t *stack; 2875 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; 2876 tre_pos_and_tags_t *p; 2877 int *counts = NULL, *offs = NULL; 2878 int i, add = 0; 2879 tre_tnfa_transition_t *transitions, *initial; 2880 tre_tnfa_t *tnfa = NULL; 2881 tre_submatch_data_t *submatch_data = NULL; 2882 tre_tag_direction_t *tag_directions = NULL; 2883 reg_errcode_t errcode; 2884 tre_mem_t mem; 2885 2886 /* Parse context. */ 2887 tre_parse_ctx_t parse_ctx; 2888 2889 /* Allocate a stack used throughout the compilation process for various 2890 purposes. */ 2891 stack = tre_stack_new(512, 10240, 128); 2892 if (!stack) 2893 return REG_ESPACE; 2894 /* Allocate a fast memory allocator. */ 2895 mem = tre_mem_new(); 2896 if (!mem) 2897 { 2898 tre_stack_destroy(stack); 2899 return REG_ESPACE; 2900 } 2901 2902 /* Parse the regexp. */ 2903 memset(&parse_ctx, 0, sizeof(parse_ctx)); 2904 parse_ctx.mem = mem; 2905 parse_ctx.stack = stack; 2906 parse_ctx.re = regex; 2907 parse_ctx.len = n; 2908 /* Only allow REG_UNGREEDY to be set if both REG_ENHANCED and REG_EXTENDED 2909 are also set */ 2910 if ((cflags & (REG_ENHANCED | REG_EXTENDED)) != (REG_ENHANCED | REG_EXTENDED)) 2911 cflags &= ~REG_UNGREEDY; 2912 parse_ctx.cflags = cflags; 2913 parse_ctx.max_backref = -1; 2914 parse_ctx.loc = loc; 2915 parse_ctx.submatch_id_invisible = SUBMATCH_ID_INVISIBLE_START; 2916 2917 DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex)); 2918 errcode = tre_parse(&parse_ctx); 2919 if (errcode != REG_OK) 2920 ERROR_EXIT(errcode); 2921 preg->re_nsub = parse_ctx.submatch_id - 1; 2922 tree = parse_ctx.result; 2923 2924 /* Back references and approximate matching cannot currently be used 2925 in the same regexp. */ 2926 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx) 2927 ERROR_EXIT(REG_BADPAT); 2928 2929 #ifdef TRE_DEBUG 2930 tre_ast_print(tree); 2931 #endif /* TRE_DEBUG */ 2932 2933 /* Referring to nonexistent subexpressions is illegal. */ 2934 if (parse_ctx.max_backref > (int)preg->re_nsub) 2935 ERROR_EXIT(REG_ESUBREG); 2936 2937 /* Allocate the TNFA struct. */ 2938 tnfa = calloc(1, sizeof(tre_tnfa_t)); 2939 if (tnfa == NULL) 2940 ERROR_EXIT(REG_ESPACE); 2941 tnfa->have_backrefs = parse_ctx.max_backref >= 0; 2942 tnfa->have_approx = parse_ctx.have_approx; 2943 tnfa->num_submatches = parse_ctx.submatch_id; 2944 tnfa->num_submatches_invisible = parse_ctx.submatch_id_invisible 2945 - SUBMATCH_ID_INVISIBLE_START; 2946 tnfa->num_reorder_tags = parse_ctx.num_reorder_tags; 2947 tnfa->loc = duplocale(parse_ctx.loc); 2948 if (tnfa->loc == (locale_t)0) 2949 ERROR_EXIT(REG_ESPACE); 2950 2951 /* Set up tags for submatch addressing. If REG_NOSUB is set and the 2952 regexp does not have back references, this can be skipped. */ 2953 if (tnfa->num_reorder_tags > 0 || !(cflags & REG_NOSUB)) 2954 { 2955 DPRINT(("tre_compile: setting up tags\n")); 2956 2957 /* Figure out how many tags we will need. */ 2958 errcode = tre_add_tags(NULL, stack, tree, tnfa); 2959 if (errcode != REG_OK) 2960 ERROR_EXIT(errcode); 2961 #ifdef TRE_DEBUG 2962 tre_ast_print(tree); 2963 #endif /* TRE_DEBUG */ 2964 2965 if (tnfa->num_tags > 0) 2966 { 2967 tag_directions = malloc(sizeof(*tag_directions) 2968 * (tnfa->num_tags + 1)); 2969 if (tag_directions == NULL) 2970 ERROR_EXIT(REG_ESPACE); 2971 tnfa->tag_directions = tag_directions; 2972 memset(tag_directions, -1, 2973 sizeof(*tag_directions) * (tnfa->num_tags + 1)); 2974 } 2975 tnfa->minimal_tags = calloc((unsigned)tnfa->num_tags * 2 + 3, 2976 sizeof(tnfa->minimal_tags)); 2977 if (tnfa->minimal_tags == NULL) 2978 ERROR_EXIT(REG_ESPACE); 2979 2980 submatch_data = calloc((unsigned)parse_ctx.submatch_id, 2981 sizeof(*submatch_data)); 2982 if (submatch_data == NULL) 2983 ERROR_EXIT(REG_ESPACE); 2984 /* Set the eo_tag value to -1 to indicate that that corresponding 2985 * submatch has not be completed yet */ 2986 for (i = 0; i < parse_ctx.submatch_id; i++) 2987 { 2988 submatch_data[i].eo_tag = -1; 2989 } 2990 tnfa->submatch_data = submatch_data; 2991 2992 errcode = tre_add_tags(mem, stack, tree, tnfa); 2993 if (errcode != REG_OK) 2994 ERROR_EXIT(errcode); 2995 2996 #ifdef TRE_DEBUG 2997 for (i = 0; i < parse_ctx.submatch_id; i++) 2998 DPRINT(("pmatch[%d] = {t%d, t%d}\n", 2999 i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); 3000 for (i = 0; i <= tnfa->num_tags; i++) 3001 DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]])); 3002 #endif /* TRE_DEBUG */ 3003 } 3004 3005 /* Expand iteration nodes. */ 3006 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, 3007 tag_directions, &tnfa->params_depth); 3008 if (errcode != REG_OK) 3009 ERROR_EXIT(errcode); 3010 3011 /* Add a dummy node for the final state. 3012 XXX - For certain patterns this dummy node can be optimized away, 3013 for example "a*" or "ab*". Figure out a simple way to detect 3014 this possibility. */ 3015 tmp_ast_l = tree; 3016 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); 3017 if (tmp_ast_r == NULL) 3018 ERROR_EXIT(REG_ESPACE); 3019 3020 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); 3021 if (tree == NULL) 3022 ERROR_EXIT(REG_ESPACE); 3023 3024 #ifdef TRE_DEBUG 3025 tre_ast_print(tree); 3026 DPRINT(("Number of states: %d\n", parse_ctx.position)); 3027 if (submatch_data) 3028 for (i = 0; i < parse_ctx.submatch_id; i++) 3029 DPRINT(("pmatch[%d] = {t%d, t%d}\n", 3030 i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); 3031 if (tag_directions) 3032 for (i = 0; i <= tnfa->num_tags; i++) 3033 DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]])); 3034 #endif /* TRE_DEBUG */ 3035 3036 errcode = tre_compute_nfl(mem, stack, tree); 3037 if (errcode != REG_OK) 3038 ERROR_EXIT(errcode); 3039 3040 counts = malloc(sizeof(int) * parse_ctx.position); 3041 if (counts == NULL) 3042 ERROR_EXIT(REG_ESPACE); 3043 3044 offs = malloc(sizeof(int) * parse_ctx.position); 3045 if (offs == NULL) 3046 ERROR_EXIT(REG_ESPACE); 3047 3048 for (i = 0; i < parse_ctx.position; i++) 3049 counts[i] = 0; 3050 tre_ast_to_tnfa(tree, NULL, counts, NULL); 3051 3052 add = 0; 3053 for (i = 0; i < parse_ctx.position; i++) 3054 { 3055 offs[i] = add; 3056 add += counts[i] + 1; 3057 counts[i] = 0; 3058 } 3059 transitions = calloc((unsigned)add + 1, sizeof(*transitions)); 3060 if (transitions == NULL) 3061 ERROR_EXIT(REG_ESPACE); 3062 tnfa->transitions = transitions; 3063 tnfa->num_transitions = add; 3064 3065 DPRINT(("Converting to TNFA:\n")); 3066 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs); 3067 if (errcode != REG_OK) 3068 ERROR_EXIT(errcode); 3069 3070 /* Set first_char only if there is only one character that can be the 3071 first character of a match */ 3072 tnfa->first_char = -1; 3073 if (!tmp_ast_l->nullable) 3074 { 3075 int scanning = 1; 3076 for (p = tree->firstpos; scanning && p->position >= 0; p++) 3077 { 3078 tre_tnfa_transition_t *j = transitions + offs[p->position]; 3079 while (j->state != NULL) 3080 { 3081 if (j->code_min <= j->code_max) 3082 { 3083 if (j->code_max != j->code_min || j->code_min == -1 || tnfa->first_char != -1) 3084 { 3085 tnfa->first_char = -1; 3086 scanning = 0; 3087 break; 3088 } 3089 tnfa->first_char = j->code_min; 3090 } 3091 j++; 3092 } 3093 } 3094 #ifdef TRE_DEBUG 3095 if (tnfa->first_char >= 0) 3096 DPRINT(("first char must be %d\n", tnfa->first_char)); 3097 #endif /* TRE_DEBUG */ 3098 } 3099 3100 p = tree->firstpos; 3101 i = 0; 3102 while (p->position >= 0) 3103 { 3104 i++; 3105 3106 #ifdef TRE_DEBUG 3107 { 3108 int *tags; 3109 DPRINT(("initial: %d", p->position)); 3110 tags = p->tags; 3111 if (tags != NULL) 3112 { 3113 if (*tags >= 0) 3114 DPRINT(("/")); 3115 while (*tags >= 0) 3116 { 3117 DPRINT(("%d", *tags)); 3118 tags++; 3119 if (*tags >= 0) 3120 DPRINT((",")); 3121 } 3122 } 3123 DPRINT((", assert %d", p->assertions)); 3124 if (p->params) 3125 { 3126 DPRINT((", ")); 3127 tre_print_params(p->params); 3128 } 3129 DPRINT(("\n")); 3130 } 3131 #endif /* TRE_DEBUG */ 3132 3133 p++; 3134 } 3135 3136 initial = calloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t)); 3137 if (initial == NULL) 3138 ERROR_EXIT(REG_ESPACE); 3139 tnfa->initial = initial; 3140 3141 i = 0; 3142 for (p = tree->firstpos; p->position >= 0; p++) 3143 { 3144 initial[i].state = transitions + offs[p->position]; 3145 initial[i].state_id = p->position; 3146 initial[i].tags = NULL; 3147 /* Copy the arrays p->tags, and p->params, they are allocated 3148 from a tre_mem object. */ 3149 if (p->tags) 3150 { 3151 int j; 3152 for (j = 0; p->tags[j] >= 0; j++); 3153 initial[i].tags = malloc(sizeof(*p->tags) * (j + 1)); 3154 if (!initial[i].tags) 3155 ERROR_EXIT(REG_ESPACE); 3156 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1)); 3157 } 3158 initial[i].params = NULL; 3159 if (p->params) 3160 { 3161 initial[i].params = malloc(sizeof(*p->params) * TRE_PARAM_LAST); 3162 if (!initial[i].params) 3163 ERROR_EXIT(REG_ESPACE); 3164 memcpy(initial[i].params, p->params, 3165 sizeof(*p->params) * TRE_PARAM_LAST); 3166 } 3167 initial[i].assertions = p->assertions; 3168 i++; 3169 } 3170 initial[i].state = NULL; 3171 3172 tnfa->num_transitions = add; 3173 tnfa->final = transitions + offs[tree->lastpos[0].position]; 3174 tnfa->num_states = parse_ctx.position; 3175 tnfa->cflags = cflags; 3176 3177 DPRINT(("final state %d (%p)\n", tree->lastpos[0].position, 3178 (void *)tnfa->final)); 3179 3180 tre_mem_destroy(mem); 3181 tre_stack_destroy(stack); 3182 free(counts); 3183 free(offs); 3184 3185 preg->re_magic = RE_MAGIC; 3186 preg->TRE_REGEX_T_FIELD = (void *)tnfa; 3187 return REG_OK; 3188 3189 error_exit: 3190 /* Free everything that was allocated and return the error code. */ 3191 tre_mem_destroy(mem); 3192 if (stack != NULL) 3193 tre_stack_destroy(stack); 3194 if (counts != NULL) 3195 free(counts); 3196 if (offs != NULL) 3197 free(offs); 3198 3199 /* Set tnfa into preg, so that calling tre_free() will free the contents 3200 of tnfa. NULL out the loc field since we never retained the locale. */ 3201 preg->TRE_REGEX_T_FIELD = (void *)tnfa; 3202 if(tnfa) 3203 { 3204 if (tnfa->loc != (locale_t)0) 3205 freelocale(tnfa->loc); 3206 tnfa->loc = NULL; 3207 } 3208 3209 tre_free(preg); 3210 return errcode; 3211 } 3212 3213 void 3214 tre_free(regex_t *preg) 3215 { 3216 tre_tnfa_t *tnfa; 3217 unsigned int i; 3218 tre_tnfa_transition_t *trans; 3219 3220 preg->re_magic = 0; 3221 tnfa = (void *)preg->TRE_REGEX_T_FIELD; 3222 if (!tnfa) 3223 return; 3224 preg->TRE_REGEX_T_FIELD = NULL; 3225 3226 for (i = 0; i < tnfa->num_transitions; i++) 3227 if (tnfa->transitions[i].state) 3228 { 3229 if (tnfa->transitions[i].tags) 3230 free(tnfa->transitions[i].tags); 3231 if (tnfa->transitions[i].assertions & ASSERT_BRACKET_MATCH) 3232 free(tnfa->transitions[i].u.bracket_match_list); 3233 if (tnfa->transitions[i].params) 3234 free(tnfa->transitions[i].params); 3235 } 3236 if (tnfa->transitions) 3237 free(tnfa->transitions); 3238 3239 if (tnfa->initial) 3240 { 3241 for (trans = tnfa->initial; trans->state; trans++) 3242 { 3243 if (trans->tags) 3244 free(trans->tags); 3245 if (trans->params) 3246 free(trans->params); 3247 } 3248 free(tnfa->initial); 3249 } 3250 3251 if (tnfa->submatch_data) 3252 { 3253 free(tnfa->submatch_data); 3254 } 3255 3256 if (tnfa->tag_directions) 3257 free(tnfa->tag_directions); 3258 if (tnfa->minimal_tags) 3259 free(tnfa->minimal_tags); 3260 3261 if (tnfa->loc != (locale_t)0) 3262 freelocale(tnfa->loc); 3263 3264 if (tnfa->last_matched_branch) 3265 free(tnfa->last_matched_branch); 3266 3267 free(tnfa); 3268 }