1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 25 * Use is subject to license terms. 26 */ 27 28 /* 29 * Copyright (C) Lucent Technologies 1997 30 * All Rights Reserved 31 * 32 * Permission to use, copy, modify, and distribute this software and 33 * its documentation for any purpose and without fee is hereby 34 * granted, provided that the above copyright notice appear in all 35 * copies and that both that the copyright notice and this 36 * permission notice and warranty disclaimer appear in supporting 37 * documentation, and that the name Lucent Technologies or any of 38 * its entities not be used in advertising or publicity pertaining 39 * to distribution of the software without specific, written prior 40 * permission. 41 * 42 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 43 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 44 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 45 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 46 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 47 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 48 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 49 * THIS SOFTWARE. 50 */ 51 52 #define DEBUG 53 54 #include "awk.h" 55 #include "y.tab.h" 56 57 #define HAT (NCHARS+2) /* matches ^ in regular expr */ 58 /* NCHARS is 2**n */ 59 #define MAXLIN 22 60 61 #define type(v) (v)->nobj /* badly overloaded here */ 62 #define info(v) (v)->ntype /* badly overloaded here */ 63 #define left(v) (v)->narg[0] 64 #define right(v) (v)->narg[1] 65 #define parent(v) (v)->nnext 66 67 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 68 #define ELEAF case EMPTYRE: /* empty string in regexp */ 69 #define UNARY case STAR: case PLUS: case QUEST: 70 71 /* 72 * encoding in tree Nodes: 73 * leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL): 74 * left is index, right contains value or pointer to value 75 * unary (STAR, PLUS, QUEST): left is child, right is null 76 * binary (CAT, OR): left and right are children 77 * parent contains pointer to parent 78 */ 79 80 int *setvec; 81 int *tmpset; 82 int maxsetvec = 0; 83 84 int rtok; /* next token in current re */ 85 int rlxval; 86 static uchar *rlxstr; 87 static uchar *prestr; /* current position in current re */ 88 static uchar *lastre; /* origin of last re */ 89 90 static int setcnt; 91 static int poscnt; 92 93 uchar *patbeg; 94 int patlen; 95 96 #define NFA 20 /* cache this many dynamic fa's */ 97 fa *fatab[NFA]; 98 int nfatab = 0; /* entries in fatab */ 99 100 static fa *mkdfa(const uchar *, int); 101 static int makeinit(fa *, int); 102 static void penter(Node *); 103 static void freetr(Node *); 104 static void overflo(const char *); 105 static void cfoll(fa *, Node *); 106 static void follow(Node *); 107 static Node *reparse(const uchar *); 108 static int relex(void); 109 static void freefa(fa *); 110 static int cgoto(fa *, int, int); 111 static Node *regexp(void); 112 static Node *primary(void); 113 static Node *concat(Node *); 114 static Node *alt(Node *); 115 static Node *unary(Node *); 116 117 fa * 118 makedfa(const uchar *s, int anchor) /* returns dfa for reg expr s */ 119 { 120 int i, use, nuse; 121 fa *pfa; 122 static int now = 1; 123 124 if (setvec == 0) { /* first time through any RE */ 125 maxsetvec = MAXLIN; 126 setvec = (int *)malloc(maxsetvec * sizeof (int)); 127 tmpset = (int *)malloc(maxsetvec * sizeof (int)); 128 if (setvec == 0 || tmpset == 0) 129 overflo("out of space initializing makedfa"); 130 } 131 132 if (compile_time) /* a constant for sure */ 133 return (mkdfa(s, anchor)); 134 for (i = 0; i < nfatab; i++) { /* is it there already? */ 135 if (fatab[i]->anchor == anchor && 136 strcmp((const char *)fatab[i]->restr, (char *)s) == 0) { 137 fatab[i]->use = now++; 138 return (fatab[i]); 139 } 140 } 141 pfa = mkdfa(s, anchor); 142 if (nfatab < NFA) { /* room for another */ 143 fatab[nfatab] = pfa; 144 fatab[nfatab]->use = now++; 145 nfatab++; 146 return (pfa); 147 } 148 use = fatab[0]->use; /* replace least-recently used */ 149 nuse = 0; 150 for (i = 1; i < nfatab; i++) { 151 if (fatab[i]->use < use) { 152 use = fatab[i]->use; 153 nuse = i; 154 } 155 } 156 freefa(fatab[nuse]); 157 fatab[nuse] = pfa; 158 pfa->use = now++; 159 return (pfa); 160 } 161 162 fa * 163 mkdfa(const uchar *s, int anchor) 164 { /* does the real work of making a dfa */ 165 /* anchor = 1 for anchored matches, else 0 */ 166 Node *p, *p1; 167 fa *f; 168 169 p = reparse(s); 170 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 171 /* put ALL STAR in front of reg. exp. */ 172 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 173 /* put FINAL after reg. exp. */ 174 175 poscnt = 0; 176 penter(p1); /* enter parent pointers and leaf indices */ 177 if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL) 178 overflo("out of space for fa"); 179 /* penter has computed number of positions in re */ 180 f->accept = poscnt-1; 181 cfoll(f, p1); /* set up follow sets */ 182 freetr(p1); 183 if ((f->posns[0] = 184 (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) { 185 overflo("out of space in makedfa"); 186 } 187 if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL) 188 overflo("out of space in makedfa"); 189 *f->posns[1] = 0; 190 f->initstat = makeinit(f, anchor); 191 f->anchor = anchor; 192 f->restr = tostring(s); 193 return (f); 194 } 195 196 static int 197 makeinit(fa *f, int anchor) 198 { 199 int i, k; 200 201 f->curstat = 2; 202 f->out[2] = 0; 203 f->reset = 0; 204 k = *(f->re[0].lfollow); 205 xfree(f->posns[2]); 206 if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL) 207 overflo("out of space in makeinit"); 208 for (i = 0; i <= k; i++) { 209 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 210 } 211 if ((f->posns[2])[1] == f->accept) 212 f->out[2] = 1; 213 for (i = 0; i < NCHARS; i++) 214 f->gototab[2][i] = 0; 215 f->curstat = cgoto(f, 2, HAT); 216 if (anchor) { 217 *f->posns[2] = k-1; /* leave out position 0 */ 218 for (i = 0; i < k; i++) { 219 (f->posns[0])[i] = (f->posns[2])[i]; 220 } 221 222 f->out[0] = f->out[2]; 223 if (f->curstat != 2) 224 --(*f->posns[f->curstat]); 225 } 226 return (f->curstat); 227 } 228 229 void 230 penter(Node *p) /* set up parent pointers and leaf indices */ 231 { 232 switch (type(p)) { 233 ELEAF 234 LEAF 235 info(p) = poscnt; 236 poscnt++; 237 break; 238 UNARY 239 penter(left(p)); 240 parent(left(p)) = p; 241 break; 242 case CAT: 243 case OR: 244 penter(left(p)); 245 penter(right(p)); 246 parent(left(p)) = p; 247 parent(right(p)) = p; 248 break; 249 default: /* can't happen */ 250 FATAL("can't happen: unknown type %d in penter", type(p)); 251 break; 252 } 253 } 254 255 static void 256 freetr(Node *p) /* free parse tree */ 257 { 258 switch (type(p)) { 259 ELEAF 260 LEAF 261 xfree(p); 262 break; 263 UNARY 264 freetr(left(p)); 265 xfree(p); 266 break; 267 case CAT: 268 case OR: 269 freetr(left(p)); 270 freetr(right(p)); 271 xfree(p); 272 break; 273 default: /* can't happen */ 274 FATAL("can't happen: unknown type %d in freetr", type(p)); 275 break; 276 } 277 } 278 279 /* 280 * in the parsing of regular expressions, metacharacters like . have 281 * to be seen literally; \056 is not a metacharacter. 282 */ 283 int 284 hexstr(uchar **pp) /* find and eval hex string at pp, return new p */ 285 { /* only pick up one 8-bit byte (2 chars) */ 286 uchar *p; 287 int n = 0; 288 int i; 289 290 for (i = 0, p = (uchar *)*pp; i < 2 && isxdigit(*p); i++, p++) { 291 if (isdigit(*p)) 292 n = 16 * n + *p - '0'; 293 else if (*p >= 'a' && *p <= 'f') 294 n = 16 * n + *p - 'a' + 10; 295 else if (*p >= 'A' && *p <= 'F') 296 n = 16 * n + *p - 'A' + 10; 297 } 298 *pp = (uchar *)p; 299 return (n); 300 } 301 302 /* multiple use of arg */ 303 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') 304 305 int 306 quoted(uchar **pp) /* pick up next thing after a \\ */ 307 { /* and increment *pp */ 308 309 uchar *p = *pp; 310 int c; 311 312 if ((c = *p++) == 't') 313 c = '\t'; 314 else if (c == 'n') 315 c = '\n'; 316 else if (c == 'f') 317 c = '\f'; 318 else if (c == 'r') 319 c = '\r'; 320 else if (c == 'b') 321 c = '\b'; 322 else if (c == '\\') 323 c = '\\'; 324 else if (c == 'x') { /* hexadecimal goo follows */ 325 c = hexstr(&p); /* this adds a null if number is invalid */ 326 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 327 int n = c - '0'; 328 if (isoctdigit(*p)) { 329 n = 8 * n + *p++ - '0'; 330 if (isoctdigit(*p)) 331 n = 8 * n + *p++ - '0'; 332 } 333 c = n; 334 } /* else */ 335 /* c = c; */ 336 *pp = p; 337 return (c); 338 } 339 340 uchar * 341 cclenter(const uchar *argp) /* add a character class */ 342 { 343 int i, c, c2; 344 uchar *p = (uchar *)argp; 345 uchar *op, *bp; 346 static uchar *buf = NULL; 347 size_t bufsz = 100; 348 349 op = p; 350 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL) 351 FATAL("out of space for character class [%.10s...] 1", p); 352 bp = buf; 353 for (i = 0; (c = *p++) != 0; ) { 354 if (c == '\\') { 355 c = quoted(&p); 356 } else if (c == '-' && i > 0 && bp[-1] != 0) { 357 if (*p != 0) { 358 c = bp[-1]; 359 c2 = *p++; 360 if (c2 == '\\') 361 c2 = quoted(&p); 362 if (c > c2) { /* empty; ignore */ 363 bp--; 364 i--; 365 continue; 366 } 367 while (c < c2) { 368 if (adjbuf(&buf, &bufsz, 369 bp - buf + 2, 100, &bp, 370 "cclenter1") == 0) 371 FATAL( 372 "out of space for character class [%.10s...] 2", p); 373 *bp++ = ++c; 374 i++; 375 } 376 continue; 377 } 378 } 379 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2")) 380 FATAL( 381 "out of space for character class [%.10s...] 3", p); 382 *bp++ = c; 383 i++; 384 } 385 *bp = 0; 386 dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf)); 387 xfree(op); 388 return (tostring(buf)); 389 } 390 391 static void 392 overflo(const char *s) 393 { 394 FATAL("regular expression too big: %.30s...", gettext((char *)s)); 395 } 396 397 /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 398 static void 399 cfoll(fa *f, Node *v) 400 { 401 int i; 402 int *p; 403 404 switch (type(v)) { 405 ELEAF 406 LEAF 407 f->re[info(v)].ltype = type(v); 408 f->re[info(v)].lval.np = right(v); 409 while (f->accept >= maxsetvec) { /* guessing here! */ 410 maxsetvec *= 4; 411 setvec = (int *)realloc(setvec, maxsetvec * 412 sizeof (int)); 413 tmpset = (int *)realloc(tmpset, maxsetvec * 414 sizeof (int)); 415 if (setvec == 0 || tmpset == 0) 416 overflo("out of space in cfoll()"); 417 } 418 for (i = 0; i <= f->accept; i++) 419 setvec[i] = 0; 420 setcnt = 0; 421 follow(v); /* computes setvec and setcnt */ 422 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL) 423 overflo("out of space building follow set"); 424 f->re[info(v)].lfollow = p; 425 *p = setcnt; 426 for (i = f->accept; i >= 0; i--) { 427 if (setvec[i] == 1) 428 *++p = i; 429 } 430 break; 431 UNARY 432 cfoll(f, left(v)); 433 break; 434 case CAT: 435 case OR: 436 cfoll(f, left(v)); 437 cfoll(f, right(v)); 438 break; 439 default: /* can't happen */ 440 FATAL("can't happen: unknown type %d in cfoll", type(v)); 441 } 442 } 443 444 /* 445 * collects initially active leaves of p into setvec 446 * returns 0 or 1 depending on whether p matches empty string 447 */ 448 static int 449 first(Node *p) 450 { 451 int b, lp; 452 453 switch (type(p)) { 454 ELEAF 455 LEAF 456 /* look for high-water mark of subscripts */ 457 lp = info(p); 458 /* guessing here! */ 459 while (setcnt >= maxsetvec || lp >= maxsetvec) { 460 maxsetvec *= 4; 461 setvec = (int *)realloc(setvec, maxsetvec * 462 sizeof (int)); 463 tmpset = (int *)realloc(tmpset, maxsetvec * 464 sizeof (int)); 465 if (setvec == 0 || tmpset == 0) 466 overflo("out of space in first()"); 467 } 468 if (type(p) == EMPTYRE) { 469 setvec[lp] = 0; 470 return (0); 471 } 472 if (setvec[lp] != 1) { 473 setvec[lp] = 1; 474 setcnt++; 475 } 476 if (type(p) == CCL && (*(uchar *)right(p)) == '\0') 477 return (0); /* empty CCL */ 478 else 479 return (1); 480 case PLUS: 481 if (first(left(p)) == 0) 482 return (0); 483 return (1); 484 case STAR: 485 case QUEST: 486 (void) first(left(p)); 487 return (0); 488 case CAT: 489 if (first(left(p)) == 0 && first(right(p)) == 0) 490 return (0); 491 return (1); 492 case OR: 493 b = first(right(p)); 494 if (first(left(p)) == 0 || b == 0) 495 return (0); 496 return (1); 497 } 498 /* can't happen */ 499 FATAL("can't happen: unknown type %d in first", type(p)); 500 /*NOTREACHED*/ 501 return (-1); 502 } 503 504 /* collects leaves that can follow v into setvec */ 505 static void 506 follow(Node *v) 507 { 508 Node *p; 509 510 if (type(v) == FINAL) 511 return; 512 p = parent(v); 513 switch (type(p)) { 514 case STAR: 515 case PLUS: 516 (void) first(v); 517 follow(p); 518 return; 519 520 case OR: 521 case QUEST: 522 follow(p); 523 return; 524 525 case CAT: 526 if (v == left(p)) { /* v is left child of p */ 527 if (first(right(p)) == 0) { 528 follow(p); 529 return; 530 } 531 } else /* v is right child */ 532 follow(p); 533 return; 534 } 535 } 536 537 static int 538 member(int c, const uchar *sarg) /* is c in s? */ 539 { 540 uchar *s = (uchar *)sarg; 541 542 while (*s) 543 if (c == *s++) 544 return (1); 545 return (0); 546 } 547 548 549 int 550 match(fa *f, const uchar *p0) /* shortest match ? */ 551 { 552 int s, ns; 553 uchar *p = (uchar *)p0; 554 555 s = f->reset ? makeinit(f, 0) : f->initstat; 556 if (f->out[s]) 557 return (1); 558 do { 559 /* assert(*p < NCHARS); */ 560 if ((ns = f->gototab[s][*p]) != 0) 561 s = ns; 562 else 563 s = cgoto(f, s, *p); 564 if (f->out[s]) 565 return (1); 566 } while (*p++ != 0); 567 return (0); 568 } 569 570 int 571 pmatch(fa *f, const uchar *p0) /* longest match, for sub */ 572 { 573 int s, ns; 574 uchar *p = (uchar *)p0; 575 uchar *q; 576 int i, k; 577 578 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 579 if (f->reset) { 580 f->initstat = s = makeinit(f, 1); 581 } else { 582 s = f->initstat; 583 } 584 patbeg = p; 585 patlen = -1; 586 do { 587 q = p; 588 do { 589 if (f->out[s]) /* final state */ 590 patlen = q-p; 591 /* assert(*q < NCHARS); */ 592 if ((ns = f->gototab[s][*q]) != 0) 593 s = ns; 594 else 595 s = cgoto(f, s, *q); 596 if (s == 1) { /* no transition */ 597 if (patlen >= 0) { 598 patbeg = p; 599 return (1); 600 } else 601 goto nextin; /* no match */ 602 } 603 } while (*q++ != 0); 604 if (f->out[s]) 605 patlen = q - p - 1; /* don't count $ */ 606 if (patlen >= 0) { 607 patbeg = p; 608 return (1); 609 } 610 nextin: 611 s = 2; 612 if (f->reset) { 613 for (i = 2; i <= f->curstat; i++) 614 xfree(f->posns[i]); 615 k = *f->posns[0]; 616 if ((f->posns[2] = 617 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) { 618 overflo("out of space in pmatch"); 619 } 620 for (i = 0; i <= k; i++) 621 (f->posns[2])[i] = (f->posns[0])[i]; 622 f->initstat = f->curstat = 2; 623 f->out[2] = f->out[0]; 624 for (i = 0; i < NCHARS; i++) 625 f->gototab[2][i] = 0; 626 } 627 } while (*p++ != 0); 628 return (0); 629 } 630 631 int 632 nematch(fa *f, const uchar *p0) /* non-empty match, for sub */ 633 { 634 int s, ns; 635 uchar *p = (uchar *)p0; 636 uchar *q; 637 int i, k; 638 639 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 640 if (f->reset) { 641 f->initstat = s = makeinit(f, 1); 642 } else { 643 s = f->initstat; 644 } 645 patlen = -1; 646 while (*p) { 647 q = p; 648 do { 649 if (f->out[s]) /* final state */ 650 patlen = q - p; 651 /* assert(*q < NCHARS); */ 652 if ((ns = f->gototab[s][*q]) != 0) 653 s = ns; 654 else 655 s = cgoto(f, s, *q); 656 if (s == 1) { /* no transition */ 657 if (patlen > 0) { 658 patbeg = p; 659 return (1); 660 } else 661 goto nnextin; /* no nonempty match */ 662 } 663 } while (*q++ != 0); 664 if (f->out[s]) 665 patlen = q - p - 1; /* don't count $ */ 666 if (patlen > 0) { 667 patbeg = p; 668 return (1); 669 } 670 nnextin: 671 s = 2; 672 if (f->reset) { 673 for (i = 2; i <= f->curstat; i++) 674 xfree(f->posns[i]); 675 k = *f->posns[0]; 676 if ((f->posns[2] = 677 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) { 678 overflo("out of state space"); 679 } 680 for (i = 0; i <= k; i++) 681 (f->posns[2])[i] = (f->posns[0])[i]; 682 f->initstat = f->curstat = 2; 683 f->out[2] = f->out[0]; 684 for (i = 0; i < NCHARS; i++) 685 f->gototab[2][i] = 0; 686 } 687 p++; 688 } 689 return (0); 690 } 691 692 static Node * 693 reparse(const uchar *p) 694 { 695 /* parses regular expression pointed to by p */ 696 /* uses relex() to scan regular expression */ 697 Node *np; 698 699 dprintf(("reparse <%s>\n", p)); 700 /* prestr points to string to be parsed */ 701 lastre = prestr = (uchar *)p; 702 rtok = relex(); 703 /* GNU compatibility: an empty regexp matches anything */ 704 if (rtok == '\0') { 705 /* FATAL("empty regular expression"); previous */ 706 return (op2(EMPTYRE, NIL, NIL)); 707 } 708 np = regexp(); 709 if (rtok != '\0') { 710 FATAL("syntax error in regular expression %s at %s", 711 lastre, prestr); 712 } 713 return (np); 714 } 715 716 static Node * 717 regexp(void) /* top-level parse of reg expr */ 718 { 719 return (alt(concat(primary()))); 720 } 721 722 static Node * 723 primary(void) 724 { 725 Node *np; 726 727 switch (rtok) { 728 case CHAR: 729 np = op2(CHAR, NIL, itonp(rlxval)); 730 rtok = relex(); 731 return (unary(np)); 732 case ALL: 733 rtok = relex(); 734 return (unary(op2(ALL, NIL, NIL))); 735 case EMPTYRE: 736 rtok = relex(); 737 return (unary(op2(ALL, NIL, NIL))); 738 case DOT: 739 rtok = relex(); 740 return (unary(op2(DOT, NIL, NIL))); 741 case CCL: 742 /*LINTED align*/ 743 np = op2(CCL, NIL, (Node *)cclenter(rlxstr)); 744 rtok = relex(); 745 return (unary(np)); 746 case NCCL: 747 /*LINTED align*/ 748 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr)); 749 rtok = relex(); 750 return (unary(np)); 751 case '^': 752 rtok = relex(); 753 return (unary(op2(CHAR, NIL, itonp(HAT)))); 754 case '$': 755 rtok = relex(); 756 return (unary(op2(CHAR, NIL, NIL))); 757 case '(': 758 rtok = relex(); 759 if (rtok == ')') { /* special pleading for () */ 760 rtok = relex(); 761 return (unary(op2(CCL, NIL, 762 /*LINTED align*/ 763 (Node *)tostring((uchar *)"")))); 764 } 765 np = regexp(); 766 if (rtok == ')') { 767 rtok = relex(); 768 return (unary(np)); 769 } else { 770 FATAL("syntax error in regular expression %s at %s", 771 lastre, prestr); 772 } 773 default: 774 FATAL("illegal primary in regular expression %s at %s", 775 lastre, prestr); 776 } 777 /*NOTREACHED*/ 778 return (NULL); 779 } 780 781 static Node * 782 concat(Node *np) 783 { 784 switch (rtok) { 785 case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: 786 case '$': case '(': 787 return (concat(op2(CAT, np, primary()))); 788 default: 789 return (np); 790 } 791 } 792 793 static Node * 794 alt(Node *np) 795 { 796 if (rtok == OR) { 797 rtok = relex(); 798 return (alt(op2(OR, np, concat(primary())))); 799 } 800 return (np); 801 } 802 803 static Node * 804 unary(Node *np) 805 { 806 switch (rtok) { 807 case STAR: 808 rtok = relex(); 809 return (unary(op2(STAR, np, NIL))); 810 case PLUS: 811 rtok = relex(); 812 return (unary(op2(PLUS, np, NIL))); 813 case QUEST: 814 rtok = relex(); 815 return (unary(op2(QUEST, np, NIL))); 816 default: 817 return (np); 818 } 819 } 820 821 /* 822 * Character class definitions conformant to the POSIX locale as 823 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 824 * and operating character sets are both ASCII (ISO646) or supersets 825 * thereof. 826 * 827 * Note that to avoid overflowing the temporary buffer used in 828 * relex(), the expanded character class (prior to range expansion) 829 * must be less than twice the size of their full name. 830 */ 831 832 /* 833 * Because isblank doesn't show up in any of the header files on any 834 * system i use, it's defined here. if some other locale has a richer 835 * definition of "blank", define HAS_ISBLANK and provide your own 836 * version. 837 * the parentheses here are an attempt to find a path through the maze 838 * of macro definition and/or function and/or version provided. thanks 839 * to nelson beebe for the suggestion; let's see if it works everywhere. 840 */ 841 842 /* #define HAS_ISBLANK */ 843 #ifndef HAS_ISBLANK 844 845 int 846 (xisblank)(int c) 847 { 848 return (c == ' ' || c == '\t'); 849 } 850 851 #endif 852 853 struct charclass { 854 const char *cc_name; 855 int cc_namelen; 856 int (*cc_func)(int); 857 } charclasses[] = { 858 { "alnum", 5, isalnum }, 859 { "alpha", 5, isalpha }, 860 #ifndef HAS_ISBLANK 861 { "blank", 5, isspace }, /* was isblank */ 862 #else 863 { "blank", 5, isblank }, 864 #endif 865 { "cntrl", 5, iscntrl }, 866 { "digit", 5, isdigit }, 867 { "graph", 5, isgraph }, 868 { "lower", 5, islower }, 869 { "print", 5, isprint }, 870 { "punct", 5, ispunct }, 871 { "space", 5, isspace }, 872 { "upper", 5, isupper }, 873 { "xdigit", 6, isxdigit }, 874 { NULL, 0, NULL }, 875 }; 876 877 static int 878 relex(void) /* lexical analyzer for reparse */ 879 { 880 int c, n; 881 int cflag; 882 uchar *buf = 0; 883 size_t bufsz = 100; 884 uchar *bp; 885 struct charclass *cc; 886 int i; 887 888 switch (c = *prestr++) { 889 case '|': return OR; 890 case '*': return STAR; 891 case '+': return PLUS; 892 case '?': return QUEST; 893 case '.': return DOT; 894 case '\0': prestr--; return '\0'; 895 case '^': 896 case '$': 897 case '(': 898 case ')': 899 return (c); 900 case '\\': 901 rlxval = quoted(&prestr); 902 return (CHAR); 903 default: 904 rlxval = c; 905 return (CHAR); 906 case '[': 907 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL) 908 FATAL("out of space in reg expr %.10s..", lastre); 909 bp = buf; 910 if (*prestr == '^') { 911 cflag = 1; 912 prestr++; 913 } else 914 cflag = 0; 915 n = 2 * strlen((const char *) prestr)+1; 916 if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1")) 917 FATAL("out of space for reg expr %.10s...", lastre); 918 for (;;) { 919 if ((c = *prestr++) == '\\') { 920 *bp++ = '\\'; 921 if ((c = *prestr++) == '\0') { 922 FATAL( 923 "nonterminated character class %s", lastre); 924 } 925 *bp++ = c; 926 } else if (c == '[' && *prestr == ':') { 927 /* 928 * POSIX char class names, Dag-Erling 929 * Smorgrav, des@ofug.org 930 */ 931 for (cc = charclasses; cc->cc_name; cc++) 932 if (strncmp((const char *) prestr + 1, 933 (const char *) cc->cc_name, 934 cc->cc_namelen) == 0) 935 break; 936 if (cc->cc_name != NULL && 937 prestr[1 + cc->cc_namelen] == ':' && 938 prestr[2 + cc->cc_namelen] == ']') { 939 prestr += cc->cc_namelen + 3; 940 for (i = 0; i < NCHARS; i++) { 941 if (!adjbuf(&buf, &bufsz, 942 bp - buf + 1, 100, &bp, 943 "relex2")) 944 FATAL( 945 "out of space for reg expr %.10s...", lastre); 946 if (cc->cc_func(i)) { 947 *bp++ = i; 948 n++; 949 } 950 } 951 } else 952 *bp++ = c; 953 } else if (c == '\0') { 954 FATAL("nonterminated character class %.20s", 955 lastre); 956 } else if (bp == buf) { /* 1st char is special */ 957 *bp++ = c; 958 } else if (c == ']') { 959 *bp++ = 0; 960 rlxstr = tostring(buf); 961 if (cflag == 0) 962 return (CCL); 963 else 964 return (NCCL); 965 } else 966 *bp++ = c; 967 } 968 /*NOTREACHED*/ 969 } 970 return (0); 971 } 972 973 static int 974 cgoto(fa *f, int s, int c) 975 { 976 int i, j, k; 977 int *p, *q; 978 979 assert(c == HAT || c < NCHARS); 980 while (f->accept >= maxsetvec) { /* guessing here! */ 981 maxsetvec *= 4; 982 setvec = (int *)realloc(setvec, maxsetvec * sizeof (int)); 983 tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int)); 984 if (setvec == 0 || tmpset == 0) 985 overflo("out of space in cgoto()"); 986 } 987 for (i = 0; i <= f->accept; i++) 988 setvec[i] = 0; 989 setcnt = 0; 990 /* compute positions of gototab[s,c] into setvec */ 991 p = f->posns[s]; 992 for (i = 1; i <= *p; i++) { 993 if ((k = f->re[p[i]].ltype) != FINAL) { 994 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) || 995 (k == DOT && c != 0 && c != HAT) || 996 (k == ALL && c != 0) || 997 (k == EMPTYRE && c != 0) || 998 (k == CCL && 999 member(c, (uchar *)f->re[p[i]].lval.up)) || 1000 ((k == NCCL && 1001 !member(c, (uchar *)f->re[p[i]].lval.up)) && 1002 c != 0 && c != HAT)) { 1003 q = f->re[p[i]].lfollow; 1004 for (j = 1; j <= *q; j++) { 1005 if (q[j] >= maxsetvec) { 1006 maxsetvec *= 4; 1007 setvec = (int *)realloc(setvec, 1008 maxsetvec * sizeof (int)); 1009 tmpset = (int *)realloc(tmpset, 1010 maxsetvec * sizeof (int)); 1011 if (setvec == 0 || 1012 tmpset == 0) 1013 overflo( 1014 "cgoto overflow"); 1015 } 1016 if (setvec[q[j]] == 0) { 1017 setcnt++; 1018 setvec[q[j]] = 1; 1019 } 1020 } 1021 } 1022 } 1023 } 1024 /* determine if setvec is a previous state */ 1025 tmpset[0] = setcnt; 1026 j = 1; 1027 for (i = f->accept; i >= 0; i--) 1028 if (setvec[i]) { 1029 tmpset[j++] = i; 1030 } 1031 /* tmpset == previous state? */ 1032 for (i = 1; i <= f->curstat; i++) { 1033 p = f->posns[i]; 1034 if ((k = tmpset[0]) != p[0]) 1035 goto different; 1036 for (j = 1; j <= k; j++) 1037 if (tmpset[j] != p[j]) 1038 goto different; 1039 /* setvec is state i */ 1040 f->gototab[s][c] = i; 1041 return (i); 1042 different:; 1043 } 1044 1045 /* add tmpset to current set of states */ 1046 if (f->curstat >= NSTATES - 1) { 1047 f->curstat = 2; 1048 f->reset = 1; 1049 for (i = 2; i < NSTATES; i++) 1050 xfree(f->posns[i]); 1051 } else 1052 ++(f->curstat); 1053 for (i = 0; i < NCHARS; i++) 1054 f->gototab[f->curstat][i] = 0; 1055 xfree(f->posns[f->curstat]); 1056 if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL) 1057 overflo("out of space in cgoto"); 1058 1059 f->posns[f->curstat] = p; 1060 f->gototab[s][c] = f->curstat; 1061 for (i = 0; i <= setcnt; i++) 1062 p[i] = tmpset[i]; 1063 if (setvec[f->accept]) 1064 f->out[f->curstat] = 1; 1065 else 1066 f->out[f->curstat] = 0; 1067 return (f->curstat); 1068 } 1069 1070 static void 1071 freefa(fa *f) /* free a finite automaton */ 1072 { 1073 int i; 1074 1075 if (f == NULL) 1076 return; 1077 for (i = 0; i <= f->curstat; i++) 1078 xfree(f->posns[i]); 1079 for (i = 0; i <= f->accept; i++) { 1080 xfree(f->re[i].lfollow); 1081 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1082 xfree((f->re[i].lval.np)); 1083 } 1084 xfree(f->restr); 1085 xfree(f); 1086 }