Print this page
3731 Update nawk to version 20121220


   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 
  23 /*
  24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  25  * Use is subject to license terms.
  26  */
  27 
  28 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  29 /*        All Rights Reserved   */
  30 
  31 #pragma ident   "%Z%%M% %I%     %E% SMI"



















  32 
  33 #define DEBUG
  34 
  35 #include "awk.h"
  36 #include "y.tab.h"
  37 
  38 #define HAT     (NCHARS-1)      /* matches ^ in regular expr */
  39                                 /* NCHARS is 2**n */
  40 #define MAXLIN (3 * LINE_MAX)
  41 
  42 #define type(v)         (v)->nobj

  43 #define left(v)         (v)->narg[0]
  44 #define right(v)        (v)->narg[1]
  45 #define parent(v)       (v)->nnext
  46 
  47 #define LEAF    case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:

  48 #define UNARY   case STAR: case PLUS: case QUEST:
  49 
  50 /*
  51  * encoding in tree Nodes:
  52  *      leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
  53  *              left is index, right contains value or pointer to value
  54  *      unary (STAR, PLUS, QUEST): left is child, right is null
  55  *      binary (CAT, OR): left and right are children
  56  *      parent contains pointer to parent
  57  */
  58 
  59 int     setvec[MAXLIN];
  60 int     tmpset[MAXLIN];
  61 Node    *point[MAXLIN];
  62 
  63 int     rtok;           /* next token in current re */
  64 int     rlxval;
  65 uchar   *rlxstr;
  66 uchar   *prestr;        /* current position in current re */
  67 uchar   *lastre;        /* origin of last re */
  68 
  69 static  int setcnt;
  70 static  int poscnt;
  71 
  72 uchar   *patbeg;
  73 int     patlen;
  74 
  75 #define NFA     20      /* cache this many dynamic fa's */
  76 fa      *fatab[NFA];
  77 int     nfatab  = 0;    /* entries in fatab */
  78 
  79 static fa       *mkdfa(uchar *, int);
  80 static int      makeinit(fa *, int);
  81 static void     penter(Node *);
  82 static void     freetr(Node *);
  83 static void     overflo(char *);
  84 static void     cfoll(fa *, Node *);
  85 static void     follow(Node *);
  86 static Node     *reparse(uchar *);
  87 static int      relex(void);
  88 static void     freefa(fa *);
  89 static int      cgoto(fa *, int, int);





  90 
  91 fa *
  92 makedfa(uchar *s, int anchor)   /* returns dfa for reg expr s */
  93 {
  94         int i, use, nuse;
  95         fa *pfa;









  96 
  97         if (compile_time)       /* a constant for sure */
  98                 return (mkdfa(s, anchor));
  99         for (i = 0; i < nfatab; i++) {       /* is it there already? */
 100                 if (fatab[i]->anchor == anchor &&
 101                     strcmp((char *)fatab[i]->restr, (char *)s) == 0) {
 102                         fatab[i]->use++;
 103                         return (fatab[i]);
 104                 }
 105         }
 106         pfa = mkdfa(s, anchor);
 107         if (nfatab < NFA) {  /* room for another */
 108                 fatab[nfatab] = pfa;
 109                 fatab[nfatab]->use = 1;
 110                 nfatab++;
 111                 return (pfa);
 112         }
 113         use = fatab[0]->use; /* replace least-recently used */
 114         nuse = 0;
 115         for (i = 1; i < nfatab; i++)
 116                 if (fatab[i]->use < use) {
 117                         use = fatab[i]->use;
 118                         nuse = i;
 119                 }

 120         freefa(fatab[nuse]);
 121         fatab[nuse] = pfa;
 122         pfa->use = 1;
 123         return (pfa);
 124 }
 125 
 126 fa *
 127 mkdfa(uchar *s, int anchor)     /* does the real work of making a dfa */

 128         /* anchor = 1 for anchored matches, else 0 */
 129 {
 130         Node *p, *p1;
 131         fa *f;
 132 
 133         p = reparse(s);
 134         p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 135                 /* put ALL STAR in front of reg.  exp. */
 136         p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
 137                 /* put FINAL after reg.  exp. */
 138 
 139         poscnt = 0;
 140         penter(p1);     /* enter parent pointers and leaf indices */
 141         if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
 142                 overflo("no room for fa");
 143         /* penter has computed number of positions in re */
 144         f->accept = poscnt-1;
 145         cfoll(f, p1);   /* set up follow sets */
 146         freetr(p1);
 147         if ((f->posns[0] =
 148             (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
 149                         overflo("out of space in makedfa");
 150         }
 151         if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
 152                 overflo("out of space in makedfa");
 153         *f->posns[1] = 0;
 154         f->initstat = makeinit(f, anchor);
 155         f->anchor = anchor;
 156         f->restr = tostring(s);
 157         return (f);
 158 }
 159 
 160 static int
 161 makeinit(fa *f, int anchor)
 162 {
 163         register int i, k;
 164 
 165         f->curstat = 2;
 166         f->out[2] = 0;
 167         f->reset = 0;
 168         k = *(f->re[0].lfollow);
 169         xfree(f->posns[2]);
 170         if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
 171                 overflo("out of space in makeinit");
 172         for (i = 0; i <= k; i++) {
 173                 (f->posns[2])[i] = (f->re[0].lfollow)[i];
 174         }
 175         if ((f->posns[2])[1] == f->accept)
 176                 f->out[2] = 1;
 177         for (i = 0; i < NCHARS; i++)
 178                 f->gototab[2][i] = 0;
 179         f->curstat = cgoto(f, 2, HAT);
 180         if (anchor) {
 181                 *f->posns[2] = k-1;  /* leave out position 0 */
 182                 for (i = 0; i < k; i++) {
 183                         (f->posns[0])[i] = (f->posns[2])[i];
 184                 }
 185 
 186                 f->out[0] = f->out[2];
 187                 if (f->curstat != 2)
 188                         --(*f->posns[f->curstat]);
 189         }
 190         return (f->curstat);
 191 }
 192 
 193 void
 194 penter(Node *p) /* set up parent pointers and leaf indices */
 195 {
 196         switch (type(p)) {

 197         LEAF
 198                 left(p) = (Node *) poscnt;
 199                 point[poscnt++] = p;
 200                 break;
 201         UNARY
 202                 penter(left(p));
 203                 parent(left(p)) = p;
 204                 break;
 205         case CAT:
 206         case OR:
 207                 penter(left(p));
 208                 penter(right(p));
 209                 parent(left(p)) = p;
 210                 parent(right(p)) = p;
 211                 break;
 212         default:
 213                 ERROR "unknown type %d in penter", type(p) FATAL;
 214                 break;
 215         }
 216 }
 217 
 218 static void
 219 freetr(Node *p) /* free parse tree */
 220 {
 221         switch (type(p)) {

 222         LEAF
 223                 xfree(p);
 224                 break;
 225         UNARY
 226                 freetr(left(p));
 227                 xfree(p);
 228                 break;
 229         case CAT:
 230         case OR:
 231                 freetr(left(p));
 232                 freetr(right(p));
 233                 xfree(p);
 234                 break;
 235         default:
 236                 ERROR "unknown type %d in freetr", type(p) FATAL;
 237                 break;
 238         }
 239 }
 240 
 241 uchar *
 242 cclenter(uchar *p)
 243 {
 244         register int i, c;
 245         uchar *op, *chars, *ret;
 246         size_t  bsize;


























 247 
 248         init_buf(&chars, &bsize, LINE_INCR);
 249         op = p;
 250         i = 0;
 251         while ((c = *p++) != 0) {
 252                 if (c == '\\') {
 253                         if ((c = *p++) == 't')
 254                                 c = '\t';
 255                         else if (c == 'n')
 256                                 c = '\n';
 257                         else if (c == 'f')
 258                                 c = '\f';
 259                         else if (c == 'r')
 260                                 c = '\r';
 261                         else if (c == 'b')
 262                                 c = '\b';
 263                         else if (c == '\\')
 264                                 c = '\\';
 265                         else if (isdigit(c)) {


 266                                 int n = c - '0';
 267                                 if (isdigit(*p)) {
 268                                         n = 8 * n + *p++ - '0';
 269                                         if (isdigit(*p))
 270                                                 n = 8 * n + *p++ - '0';
 271                                 }
 272                                 c = n;
 273                         } /* else */
 274                                 /* c = c; */
 275                 } else if (c == '-' && i > 0 && chars[i-1] != 0) {




















 276                         if (*p != 0) {
 277                                 c = chars[i-1];
 278                                 while ((uchar)c < *p) {      /* fails if *p is \\ */
 279                                         expand_buf(&chars, &bsize, i);
 280                                         chars[i++] = ++c;













 281                                 }
 282                                 p++;
 283                                 continue;
 284                         }
 285                 }
 286                 expand_buf(&chars, &bsize, i);
 287                 chars[i++] = c;



 288         }
 289         chars[i++] = '\0';
 290         dprintf(("cclenter: in = |%s|, out = |%s|\n", op, chars));
 291         xfree(op);
 292         ret = tostring(chars);
 293         free(chars);
 294         return (ret);
 295 }
 296 
 297 static void
 298 overflo(char *s)
 299 {
 300         ERROR "regular expression too big: %s", gettext((char *)s) FATAL;
 301 }
 302 
 303 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
 304 static void
 305 cfoll(fa *f, Node *v)
 306 {
 307         register int i;
 308         register int *p;
 309 
 310         switch (type(v)) {

 311         LEAF
 312                 f->re[(int)left(v)].ltype = type(v);
 313                 f->re[(int)left(v)].lval = (int)right(v);









 314                 for (i = 0; i <= f->accept; i++)
 315                         setvec[i] = 0;
 316                 setcnt = 0;
 317                 follow(v);      /* computes setvec and setcnt */
 318                 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
 319                         overflo("follow set overflow");
 320                 f->re[(int)left(v)].lfollow = p;
 321                 *p = setcnt;
 322                 for (i = f->accept; i >= 0; i--) {
 323                         if (setvec[i] == 1)
 324                                 *++p = i;
 325                 }
 326                 break;
 327         UNARY
 328                 cfoll(f, left(v));
 329                 break;
 330         case CAT:
 331         case OR:
 332                 cfoll(f, left(v));
 333                 cfoll(f, right(v));
 334                 break;
 335         default:
 336                 ERROR "unknown type %d in cfoll", type(v) FATAL;
 337         }
 338 }
 339 
 340 /*
 341  * collects initially active leaves of p into setvec
 342  * returns 0 or 1 depending on whether p matches empty string
 343  */
 344 static int
 345 first(Node *p)
 346 {
 347         register int b;
 348 
 349         switch (type(p)) {

 350         LEAF
 351                 if (setvec[(int)left(p)] != 1) {
 352                         setvec[(int)left(p)] = 1;
















 353                         setcnt++;
 354                 }
 355                 if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
 356                         return (0);             /* empty CCL */
 357                 else
 358                         return (1);
 359         case PLUS:
 360                 if (first(left(p)) == 0)
 361                         return (0);
 362                 return (1);
 363         case STAR:
 364         case QUEST:
 365                 (void) first(left(p));
 366                 return (0);
 367         case CAT:
 368                 if (first(left(p)) == 0 && first(right(p)) == 0)
 369                         return (0);
 370                 return (1);
 371         case OR:
 372                 b = first(right(p));
 373                 if (first(left(p)) == 0 || b == 0)
 374                         return (0);
 375                 return (1);
 376         }
 377         ERROR "unknown type %d in first", type(p) FATAL;


 378         return (-1);
 379 }
 380 
 381 /* collects leaves that can follow v into setvec */
 382 static void
 383 follow(Node *v)
 384 {
 385         Node *p;
 386 
 387         if (type(v) == FINAL)
 388                 return;
 389         p = parent(v);
 390         switch (type(p)) {
 391         case STAR:
 392         case PLUS:
 393                 (void) first(v);
 394                 follow(p);
 395                 return;
 396 
 397         case OR:
 398         case QUEST:
 399                 follow(p);
 400                 return;
 401 
 402         case CAT:
 403                 if (v == left(p)) {     /* v is left child of p */
 404                         if (first(right(p)) == 0) {
 405                                 follow(p);
 406                                 return;
 407                         }
 408                 } else          /* v is right child */
 409                         follow(p);
 410                 return;
 411         default:
 412                 ERROR "unknown type %d in follow", type(p) FATAL;
 413                 break;
 414         }
 415 }
 416 
 417 static int
 418 member(uchar c, uchar *s)       /* is c in s? */
 419 {


 420         while (*s)
 421                 if (c == *s++)
 422                         return (1);
 423         return (0);
 424 }
 425 
 426 
 427 int
 428 match(fa *f, uchar *p)
 429 {
 430         register int s, ns;

 431 
 432         s = f->reset ? makeinit(f, 0) : f->initstat;
 433         if (f->out[s])
 434                 return (1);
 435         do {

 436                 if ((ns = f->gototab[s][*p]) != 0)
 437                         s = ns;
 438                 else
 439                         s = cgoto(f, s, *p);
 440                 if (f->out[s])
 441                         return (1);
 442         } while (*p++ != 0);
 443         return (0);
 444 }
 445 
 446 int
 447 pmatch(fa *f, uchar *p)
 448 {
 449         register int s, ns;
 450         register uchar *q;

 451         int i, k;
 452 

 453         if (f->reset) {
 454                 f->initstat = s = makeinit(f, 1);
 455         } else {
 456                 s = f->initstat;
 457         }
 458         patbeg = p;
 459         patlen = -1;
 460         do {
 461                 q = p;
 462                 do {
 463                         if (f->out[s])               /* final state */
 464                                 patlen = q-p;

 465                         if ((ns = f->gototab[s][*q]) != 0)
 466                                 s = ns;
 467                         else
 468                                 s = cgoto(f, s, *q);
 469                         if (s == 1) {   /* no transition */
 470                                 if (patlen >= 0) {
 471                                         patbeg = p;
 472                                         return (1);
 473                                 } else
 474                                         goto nextin;    /* no match */
 475                         }
 476                 } while (*q++ != 0);
 477                 if (f->out[s])
 478                         patlen = q - p - 1;     /* don't count $ */
 479                 if (patlen >= 0) {
 480                         patbeg = p;
 481                         return (1);
 482                 }
 483         nextin:
 484                 s = 2;
 485                 if (f->reset) {
 486                         for (i = 2; i <= f->curstat; i++)
 487                                 xfree(f->posns[i]);
 488                         k = *f->posns[0];
 489                         if ((f->posns[2] =
 490                             (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 491                                 overflo("out of space in pmatch");
 492                         }
 493                         for (i = 0; i <= k; i++)
 494                                 (f->posns[2])[i] = (f->posns[0])[i];
 495                         f->initstat = f->curstat = 2;
 496                         f->out[2] = f->out[0];
 497                         for (i = 0; i < NCHARS; i++)
 498                                 f->gototab[2][i] = 0;
 499                 }
 500         } while (*p++ != 0);
 501         return (0);
 502 }
 503 
 504 int
 505 nematch(fa *f, uchar *p)
 506 {
 507         register int s, ns;
 508         register uchar *q;

 509         int i, k;
 510 

 511         if (f->reset) {
 512                 f->initstat = s = makeinit(f, 1);
 513         } else {
 514                 s = f->initstat;
 515         }
 516         patlen = -1;
 517         while (*p) {
 518                 q = p;
 519                 do {
 520                         if (f->out[s])               /* final state */
 521                                 patlen = q-p;

 522                         if ((ns = f->gototab[s][*q]) != 0)
 523                                 s = ns;
 524                         else
 525                                 s = cgoto(f, s, *q);
 526                         if (s == 1) {   /* no transition */
 527                                 if (patlen > 0) {
 528                                         patbeg = p;
 529                                         return (1);
 530                                 } else
 531                                         goto nnextin;   /* no nonempty match */
 532                         }
 533                 } while (*q++ != 0);
 534                 if (f->out[s])
 535                         patlen = q - p - 1;     /* don't count $ */
 536                 if (patlen > 0) {
 537                         patbeg = p;
 538                         return (1);
 539                 }
 540         nnextin:
 541                 s = 2;
 542                 if (f->reset) {
 543                         for (i = 2; i <= f->curstat; i++)
 544                                 xfree(f->posns[i]);
 545                         k = *f->posns[0];
 546                         if ((f->posns[2] =
 547                             (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 548                                 overflo("out of state space");
 549                         }
 550                         for (i = 0; i <= k; i++)
 551                                 (f->posns[2])[i] = (f->posns[0])[i];
 552                         f->initstat = f->curstat = 2;
 553                         f->out[2] = f->out[0];
 554                         for (i = 0; i < NCHARS; i++)
 555                                 f->gototab[2][i] = 0;
 556                 }
 557                 p++;
 558         }
 559         return (0);
 560 }
 561 
 562 static Node *regexp(void), *primary(void), *concat(Node *);
 563 static Node *alt(Node *), *unary(Node *);
 564 
 565 static Node *
 566 reparse(uchar *p)
 567 {
 568         /* parses regular expression pointed to by p */
 569         /* uses relex() to scan regular expression */
 570         Node *np;
 571 
 572         dprintf(("reparse <%s>\n", p));
 573         lastre = prestr = p;    /* prestr points to string to be parsed */

 574         rtok = relex();
 575         if (rtok == '\0')
 576                 ERROR "empty regular expression" FATAL;
 577         np = regexp();
 578         if (rtok == '\0') {
 579                 return (np);
 580         } else {
 581                 ERROR "syntax error in regular expression %s at %s",
 582                     lastre, prestr FATAL;
 583         }
 584         /*NOTREACHED*/
 585         return (NULL);




 586 }
 587 
 588 static Node *
 589 regexp(void)
 590 {
 591         return (alt(concat(primary())));
 592 }
 593 
 594 static Node *
 595 primary(void)
 596 {
 597         Node *np;
 598 
 599         switch (rtok) {
 600         case CHAR:
 601                 np = op2(CHAR, NIL, (Node *)rlxval);
 602                 rtok = relex();
 603                 return (unary(np));
 604         case ALL:
 605                 rtok = relex();
 606                 return (unary(op2(ALL, NIL, NIL)));



 607         case DOT:
 608                 rtok = relex();
 609                 return (unary(op2(DOT, NIL, NIL)));
 610         case CCL:
 611                 /*LINTED align*/
 612                 np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
 613                 rtok = relex();
 614                 return (unary(np));
 615         case NCCL:
 616                 /*LINTED align*/
 617                 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
 618                 rtok = relex();
 619                 return (unary(np));
 620         case '^':
 621                 rtok = relex();
 622                 return (unary(op2(CHAR, NIL, (Node *)HAT)));
 623         case '$':
 624                 rtok = relex();
 625                 return (unary(op2(CHAR, NIL, NIL)));
 626         case '(':
 627                 rtok = relex();
 628                 if (rtok == ')') {      /* special pleading for () */
 629                         rtok = relex();
 630                         return (unary(op2(CCL, NIL,
 631                             /*LINTED align*/
 632                             (Node *)tostring((uchar *)""))));
 633                 }
 634                 np = regexp();
 635                 if (rtok == ')') {
 636                         rtok = relex();
 637                         return (unary(np));
 638                 } else {
 639                         ERROR "syntax error in regular expression %s at %s",
 640                             lastre, prestr FATAL;
 641                 }
 642         default:
 643                 ERROR "illegal primary in regular expression %s at %s",
 644                     lastre, prestr FATAL;
 645         }
 646         /*NOTREACHED*/
 647         return (NULL);
 648 }
 649 
 650 static Node *
 651 concat(Node *np)
 652 {
 653         switch (rtok) {
 654         case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':

 655                 return (concat(op2(CAT, np, primary())));
 656         default:
 657                 return (np);
 658         }
 659 }
 660 
 661 static Node *
 662 alt(Node *np)
 663 {
 664         if (rtok == OR) {
 665                 rtok = relex();
 666                 return (alt(op2(OR, np, concat(primary()))));
 667         }
 668         return (np);
 669 }
 670 
 671 static Node *
 672 unary(Node *np)
 673 {
 674         switch (rtok) {
 675         case STAR:
 676                 rtok = relex();
 677                 return (unary(op2(STAR, np, NIL)));
 678         case PLUS:
 679                 rtok = relex();
 680                 return (unary(op2(PLUS, np, NIL)));
 681         case QUEST:
 682                 rtok = relex();
 683                 return (unary(op2(QUEST, np, NIL)));
 684         default:
 685                 return (np);
 686         }
 687 }
 688 
























































 689 static int
 690 relex(void)             /* lexical analyzer for reparse */
 691 {
 692         register int c;
 693         uchar *cbuf;
 694         int clen, cflag;




 695 
 696         switch (c = *prestr++) {
 697         case '|': return OR;
 698         case '*': return STAR;
 699         case '+': return PLUS;
 700         case '?': return QUEST;
 701         case '.': return DOT;
 702         case '\0': prestr--; return '\0';
 703         case '^':
 704         case '$':
 705         case '(':
 706         case ')':
 707                 return (c);
 708         case '\\':
 709                 if ((c = *prestr++) == 't')
 710                         c = '\t';
 711                 else if (c == 'n')
 712                         c = '\n';
 713                 else if (c == 'f')
 714                         c = '\f';
 715                 else if (c == 'r')
 716                         c = '\r';
 717                 else if (c == 'b')
 718                         c = '\b';
 719                 else if (c == '\\')
 720                         c = '\\';
 721                 else if (isdigit(c)) {
 722                         int n = c - '0';
 723                         if (isdigit(*prestr)) {
 724                                 n = 8 * n + *prestr++ - '0';
 725                                 if (isdigit(*prestr))
 726                                         n = 8 * n + *prestr++ - '0';
 727                         }
 728                         c = n;
 729                 } /* else it's now in c */
 730                 rlxval = c;
 731                 return (CHAR);
 732         default:
 733                 rlxval = c;
 734                 return (CHAR);
 735         case '[':
 736                 clen = 0;


 737                 if (*prestr == '^') {
 738                         cflag = 1;
 739                         prestr++;
 740                 } else
 741                         cflag = 0;
 742                 init_buf(&cbuf, NULL, strlen((char *)prestr) * 2 + 1);


 743                 for (;;) {
 744                         if ((c = *prestr++) == '\\') {
 745                                 cbuf[clen++] = '\\';
 746                                 if ((c = *prestr++) == '\0') {
 747                                         ERROR
 748                         "nonterminated character class %s", lastre FATAL;
 749                                 }
 750                                 cbuf[clen++] = c;
































 751                         } else if (c == ']') {
 752                                 cbuf[clen] = 0;
 753                                 rlxstr = tostring(cbuf);
 754                                 free(cbuf);
 755                                 if (cflag == 0)
 756                                         return (CCL);
 757                                 else
 758                                         return (NCCL);
 759                         } else if (c == '\n') {
 760                                 ERROR "newline in character class %s...",
 761                                     lastre FATAL;
 762                         } else if (c == '\0') {
 763                                 ERROR "nonterminated character class %s",
 764                                     lastre FATAL;
 765                         } else
 766                                 cbuf[clen++] = c;
 767                 }
 768                 /*NOTREACHED*/
 769         }
 770         return (0);
 771 }
 772 
 773 static int
 774 cgoto(fa *f, int s, int c)
 775 {
 776         register int i, j, k;
 777         register int *p, *q;
 778 








 779         for (i = 0; i <= f->accept; i++)
 780                 setvec[i] = 0;
 781         setcnt = 0;
 782         /* compute positions of gototab[s,c] into setvec */
 783         p = f->posns[s];
 784         for (i = 1; i <= *p; i++) {
 785                 if ((k = f->re[p[i]].ltype) != FINAL) {
 786                         if (k == CHAR && c == f->re[p[i]].lval ||
 787                             k == DOT && c != 0 && c != HAT ||
 788                             k == ALL && c != 0 ||
 789                             k == CCL &&
 790                             member(c, (uchar *)f->re[p[i]].lval) ||
 791                             k == NCCL &&
 792                             !member(c, (uchar *)f->re[p[i]].lval) &&
 793                             c != 0 && c != HAT) {

 794                                 q = f->re[p[i]].lfollow;
 795                                 for (j = 1; j <= *q; j++) {











 796                                         if (setvec[q[j]] == 0) {
 797                                                 setcnt++;
 798                                                 setvec[q[j]] = 1;
 799                                         }
 800                                 }
 801                         }
 802                 }
 803         }
 804         /* determine if setvec is a previous state */
 805         tmpset[0] = setcnt;
 806         j = 1;
 807         for (i = f->accept; i >= 0; i--)
 808                 if (setvec[i]) {
 809                         tmpset[j++] = i;
 810                 }
 811         /* tmpset == previous state? */
 812         for (i = 1; i <= f->curstat; i++) {
 813                 p = f->posns[i];
 814                 if ((k = tmpset[0]) != p[0])
 815                         goto different;
 816                 for (j = 1; j <= k; j++)
 817                         if (tmpset[j] != p[j])
 818                                 goto different;
 819                 /* setvec is state i */
 820                 f->gototab[s][c] = i;
 821                 return (i);
 822         different:;
 823         }
 824 
 825         /* add tmpset to current set of states */
 826         if (f->curstat >= NSTATES-1) {
 827                 f->curstat = 2;
 828                 f->reset = 1;
 829                 for (i = 2; i < NSTATES; i++)
 830                         xfree(f->posns[i]);
 831         } else
 832                 ++(f->curstat);
 833         for (i = 0; i < NCHARS; i++)
 834                 f->gototab[f->curstat][i] = 0;
 835         xfree(f->posns[f->curstat]);
 836         if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
 837                 overflo("out of space in cgoto");
 838 
 839         f->posns[f->curstat] = p;
 840         f->gototab[s][c] = f->curstat;
 841         for (i = 0; i <= setcnt; i++)
 842                 p[i] = tmpset[i];
 843         if (setvec[f->accept])
 844                 f->out[f->curstat] = 1;
 845         else
 846                 f->out[f->curstat] = 0;
 847         return (f->curstat);
 848 }
 849 
 850 static void
 851 freefa(fa *f)
 852 {
 853 
 854         register int i;
 855 
 856         if (f == NULL)
 857                 return;
 858         for (i = 0; i <= f->curstat; i++)
 859                 xfree(f->posns[i]);
 860         for (i = 0; i <= f->accept; i++)
 861                 xfree(f->re[i].lfollow);



 862         xfree(f->restr);
 863         xfree(f);
 864 }


   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 
  23 /*
  24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  25  * Use is subject to license terms.
  26  */
  27 
  28 /*
  29  * Copyright (C) Lucent Technologies 1997
  30  * All Rights Reserved
  31  *
  32  * Permission to use, copy, modify, and distribute this software and
  33  * its documentation for any purpose and without fee is hereby
  34  * granted, provided that the above copyright notice appear in all
  35  * copies and that both that the copyright notice and this
  36  * permission notice and warranty disclaimer appear in supporting
  37  * documentation, and that the name Lucent Technologies or any of
  38  * its entities not be used in advertising or publicity pertaining
  39  * to distribution of the software without specific, written prior
  40  * permission.
  41  *
  42  * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  43  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  44  * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  45  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  46  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  47  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  48  * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  49  * THIS SOFTWARE.
  50  */
  51 
  52 #define DEBUG
  53 
  54 #include "awk.h"
  55 #include "y.tab.h"
  56 
  57 #define HAT     (NCHARS+2)      /* matches ^ in regular expr */
  58                                 /* NCHARS is 2**n */
  59 #define MAXLIN  22
  60 
  61 #define type(v)         (v)->nobj    /* badly overloaded here */
  62 #define info(v)         (v)->ntype   /* badly overloaded here */
  63 #define left(v)         (v)->narg[0]
  64 #define right(v)        (v)->narg[1]
  65 #define parent(v)       (v)->nnext
  66 
  67 #define LEAF    case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
  68 #define ELEAF   case EMPTYRE:           /* empty string in regexp */
  69 #define UNARY   case STAR: case PLUS: case QUEST:
  70 
  71 /*
  72  * encoding in tree Nodes:
  73  *      leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
  74  *              left is index, right contains value or pointer to value
  75  *      unary (STAR, PLUS, QUEST): left is child, right is null
  76  *      binary (CAT, OR): left and right are children
  77  *      parent contains pointer to parent
  78  */
  79 
  80 int     *setvec;
  81 int     *tmpset;
  82 int     maxsetvec = 0;
  83 
  84 int     rtok;           /* next token in current re */
  85 int     rlxval;
  86 static  uchar   *rlxstr;
  87 static  uchar   *prestr;        /* current position in current re */
  88 static  uchar   *lastre;        /* origin of last re */
  89 
  90 static  int setcnt;
  91 static  int poscnt;
  92 
  93 uchar   *patbeg;
  94 int     patlen;
  95 
  96 #define NFA     20      /* cache this many dynamic fa's */
  97 fa      *fatab[NFA];
  98 int     nfatab  = 0;    /* entries in fatab */
  99 
 100 static fa       *mkdfa(const uchar *, int);
 101 static int      makeinit(fa *, int);
 102 static void     penter(Node *);
 103 static void     freetr(Node *);
 104 static void     overflo(const char *);
 105 static void     cfoll(fa *, Node *);
 106 static void     follow(Node *);
 107 static Node     *reparse(const uchar *);
 108 static int      relex(void);
 109 static void     freefa(fa *);
 110 static int      cgoto(fa *, int, int);
 111 static Node     *regexp(void);
 112 static Node     *primary(void);
 113 static Node     *concat(Node *);
 114 static Node     *alt(Node *);
 115 static Node     *unary(Node *);
 116 
 117 fa *
 118 makedfa(const uchar *s, int anchor)     /* returns dfa for reg expr s */
 119 {
 120         int i, use, nuse;
 121         fa *pfa;
 122         static int now = 1;
 123 
 124         if (setvec == 0) {      /* first time through any RE */
 125                 maxsetvec = MAXLIN;
 126                 setvec = (int *)malloc(maxsetvec * sizeof (int));
 127                 tmpset = (int *)malloc(maxsetvec * sizeof (int));
 128                 if (setvec == 0 || tmpset == 0)
 129                         overflo("out of space initializing makedfa");
 130         }
 131 
 132         if (compile_time)       /* a constant for sure */
 133                 return (mkdfa(s, anchor));
 134         for (i = 0; i < nfatab; i++) {       /* is it there already? */
 135                 if (fatab[i]->anchor == anchor &&
 136                     strcmp((const char *)fatab[i]->restr, (char *)s) == 0) {
 137                         fatab[i]->use = now++;
 138                         return (fatab[i]);
 139                 }
 140         }
 141         pfa = mkdfa(s, anchor);
 142         if (nfatab < NFA) {  /* room for another */
 143                 fatab[nfatab] = pfa;
 144                 fatab[nfatab]->use = now++;
 145                 nfatab++;
 146                 return (pfa);
 147         }
 148         use = fatab[0]->use; /* replace least-recently used */
 149         nuse = 0;
 150         for (i = 1; i < nfatab; i++) {
 151                 if (fatab[i]->use < use) {
 152                         use = fatab[i]->use;
 153                         nuse = i;
 154                 }
 155         }
 156         freefa(fatab[nuse]);
 157         fatab[nuse] = pfa;
 158         pfa->use = now++;
 159         return (pfa);
 160 }
 161 
 162 fa *
 163 mkdfa(const uchar *s, int anchor)
 164 {                               /* does the real work of making a dfa */
 165                                 /* anchor = 1 for anchored matches, else 0 */

 166         Node *p, *p1;
 167         fa *f;
 168 
 169         p = reparse(s);
 170         p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 171                 /* put ALL STAR in front of reg.  exp. */
 172         p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
 173                 /* put FINAL after reg.  exp. */
 174 
 175         poscnt = 0;
 176         penter(p1);     /* enter parent pointers and leaf indices */
 177         if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
 178                 overflo("out of space for fa");
 179         /* penter has computed number of positions in re */
 180         f->accept = poscnt-1;
 181         cfoll(f, p1);   /* set up follow sets */
 182         freetr(p1);
 183         if ((f->posns[0] =
 184             (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
 185                         overflo("out of space in makedfa");
 186         }
 187         if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
 188                 overflo("out of space in makedfa");
 189         *f->posns[1] = 0;
 190         f->initstat = makeinit(f, anchor);
 191         f->anchor = anchor;
 192         f->restr = tostring(s);
 193         return (f);
 194 }
 195 
 196 static int
 197 makeinit(fa *f, int anchor)
 198 {
 199         int i, k;
 200 
 201         f->curstat = 2;
 202         f->out[2] = 0;
 203         f->reset = 0;
 204         k = *(f->re[0].lfollow);
 205         xfree(f->posns[2]);
 206         if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
 207                 overflo("out of space in makeinit");
 208         for (i = 0; i <= k; i++) {
 209                 (f->posns[2])[i] = (f->re[0].lfollow)[i];
 210         }
 211         if ((f->posns[2])[1] == f->accept)
 212                 f->out[2] = 1;
 213         for (i = 0; i < NCHARS; i++)
 214                 f->gototab[2][i] = 0;
 215         f->curstat = cgoto(f, 2, HAT);
 216         if (anchor) {
 217                 *f->posns[2] = k-1;  /* leave out position 0 */
 218                 for (i = 0; i < k; i++) {
 219                         (f->posns[0])[i] = (f->posns[2])[i];
 220                 }
 221 
 222                 f->out[0] = f->out[2];
 223                 if (f->curstat != 2)
 224                         --(*f->posns[f->curstat]);
 225         }
 226         return (f->curstat);
 227 }
 228 
 229 void
 230 penter(Node *p) /* set up parent pointers and leaf indices */
 231 {
 232         switch (type(p)) {
 233         ELEAF
 234         LEAF
 235                 info(p) = poscnt;
 236                 poscnt++;
 237                 break;
 238         UNARY
 239                 penter(left(p));
 240                 parent(left(p)) = p;
 241                 break;
 242         case CAT:
 243         case OR:
 244                 penter(left(p));
 245                 penter(right(p));
 246                 parent(left(p)) = p;
 247                 parent(right(p)) = p;
 248                 break;
 249         default:        /* can't happen */
 250                 FATAL("can't happen: unknown type %d in penter", type(p));
 251                 break;
 252         }
 253 }
 254 
 255 static void
 256 freetr(Node *p) /* free parse tree */
 257 {
 258         switch (type(p)) {
 259         ELEAF
 260         LEAF
 261                 xfree(p);
 262                 break;
 263         UNARY
 264                 freetr(left(p));
 265                 xfree(p);
 266                 break;
 267         case CAT:
 268         case OR:
 269                 freetr(left(p));
 270                 freetr(right(p));
 271                 xfree(p);
 272                 break;
 273         default:        /* can't happen */
 274                 FATAL("can't happen: unknown type %d in freetr", type(p));
 275                 break;
 276         }
 277 }
 278 
 279 /*
 280  * in the parsing of regular expressions, metacharacters like . have
 281  * to be seen literally;  \056 is not a metacharacter.
 282  */
 283 int
 284 hexstr(uchar **pp)      /* find and eval hex string at pp, return new p */
 285 {                       /* only pick up one 8-bit byte (2 chars) */
 286         uchar *p;
 287         int n = 0;
 288         int i;
 289 
 290         for (i = 0, p = (uchar *)*pp; i < 2 && isxdigit(*p); i++, p++) {
 291                 if (isdigit(*p))
 292                         n = 16 * n + *p - '0';
 293                 else if (*p >= 'a' && *p <= 'f')
 294                         n = 16 * n + *p - 'a' + 10;
 295                 else if (*p >= 'A' && *p <= 'F')
 296                         n = 16 * n + *p - 'A' + 10;
 297         }
 298         *pp = (uchar *)p;
 299         return (n);
 300 }
 301 
 302 /* multiple use of arg */
 303 #define isoctdigit(c)   ((c) >= '0' && (c) <= '7')
 304 
 305 int
 306 quoted(uchar **pp)      /* pick up next thing after a \\ */
 307 {                       /* and increment *pp */
 308 
 309         uchar *p = *pp;
 310         int c;
 311 





 312         if ((c = *p++) == 't')
 313                 c = '\t';
 314         else if (c == 'n')
 315                 c = '\n';
 316         else if (c == 'f')
 317                 c = '\f';
 318         else if (c == 'r')
 319                 c = '\r';
 320         else if (c == 'b')
 321                 c = '\b';
 322         else if (c == '\\')
 323                 c = '\\';
 324         else if (c == 'x') {    /* hexadecimal goo follows */
 325                 c = hexstr(&p);     /* this adds a null if number is invalid */
 326         } else if (isoctdigit(c)) {     /* \d \dd \ddd */
 327                 int n = c - '0';
 328                 if (isoctdigit(*p)) {
 329                         n = 8 * n + *p++ - '0';
 330                         if (isoctdigit(*p))
 331                                 n = 8 * n + *p++ - '0';
 332                 }
 333                 c = n;
 334         } /* else */
 335                 /* c = c; */
 336         *pp = p;
 337         return (c);
 338 }
 339 
 340 uchar *
 341 cclenter(const uchar *argp)             /* add a character class */
 342 {
 343         int i, c, c2;
 344         uchar  *p = (uchar *)argp;
 345         uchar *op, *bp;
 346         static uchar *buf = NULL;
 347         size_t  bufsz = 100;
 348 
 349         op = p;
 350         if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
 351                 FATAL("out of space for character class [%.10s...] 1", p);
 352         bp = buf;
 353         for (i = 0; (c = *p++) != 0; ) {
 354                 if (c == '\\') {
 355                         c = quoted(&p);
 356                 } else if (c == '-' && i > 0 && bp[-1] != 0) {
 357                         if (*p != 0) {
 358                                 c = bp[-1];
 359                                 c2 = *p++;
 360                                 if (c2 == '\\')
 361                                         c2 = quoted(&p);
 362                                 if (c > c2) {   /* empty; ignore */
 363                                         bp--;
 364                                         i--;
 365                                         continue;
 366                                 }
 367                                 while (c < c2) {
 368                                         if (adjbuf(&buf, &bufsz,
 369                                             bp - buf + 2, 100, &bp,
 370                                             "cclenter1") == 0)
 371                                                 FATAL(
 372                         "out of space for character class [%.10s...] 2", p);
 373                                         *bp++ = ++c;
 374                                         i++;
 375                                 }

 376                                 continue;
 377                         }
 378                 }
 379                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
 380                         FATAL(
 381                 "out of space for character class [%.10s...] 3", p);
 382                 *bp++ = c;
 383                 i++;
 384         }
 385         *bp = 0;
 386         dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf));
 387         xfree(op);
 388         return (tostring(buf));


 389 }
 390 
 391 static void
 392 overflo(const char *s)
 393 {
 394         FATAL("regular expression too big: %.30s...", gettext((char *)s));
 395 }
 396 
 397 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
 398 static void
 399 cfoll(fa *f, Node *v)
 400 {
 401         int i;
 402         int *p;
 403 
 404         switch (type(v)) {
 405         ELEAF
 406         LEAF
 407                 f->re[info(v)].ltype = type(v);
 408                 f->re[info(v)].lval.np = right(v);
 409                 while (f->accept >= maxsetvec) {  /* guessing here! */
 410                         maxsetvec *= 4;
 411                         setvec = (int *)realloc(setvec, maxsetvec *
 412                             sizeof (int));
 413                         tmpset = (int *)realloc(tmpset, maxsetvec *
 414                             sizeof (int));
 415                         if (setvec == 0 || tmpset == 0)
 416                                 overflo("out of space in cfoll()");
 417                 }
 418                 for (i = 0; i <= f->accept; i++)
 419                         setvec[i] = 0;
 420                 setcnt = 0;
 421                 follow(v);      /* computes setvec and setcnt */
 422                 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
 423                         overflo("out of space building follow set");
 424                 f->re[info(v)].lfollow = p;
 425                 *p = setcnt;
 426                 for (i = f->accept; i >= 0; i--) {
 427                         if (setvec[i] == 1)
 428                                 *++p = i;
 429                 }
 430                 break;
 431         UNARY
 432                 cfoll(f, left(v));
 433                 break;
 434         case CAT:
 435         case OR:
 436                 cfoll(f, left(v));
 437                 cfoll(f, right(v));
 438                 break;
 439         default:        /* can't happen */
 440                 FATAL("can't happen: unknown type %d in cfoll", type(v));
 441         }
 442 }
 443 
 444 /*
 445  * collects initially active leaves of p into setvec
 446  * returns 0 or 1 depending on whether p matches empty string
 447  */
 448 static int
 449 first(Node *p)
 450 {
 451         int b, lp;
 452 
 453         switch (type(p)) {
 454         ELEAF
 455         LEAF
 456                 /* look for high-water mark of subscripts */
 457                 lp = info(p);
 458                 /* guessing here! */
 459                 while (setcnt >= maxsetvec || lp >= maxsetvec) {
 460                         maxsetvec *= 4;
 461                         setvec = (int *)realloc(setvec, maxsetvec *
 462                             sizeof (int));
 463                         tmpset = (int *)realloc(tmpset, maxsetvec *
 464                             sizeof (int));
 465                         if (setvec == 0 || tmpset == 0)
 466                                 overflo("out of space in first()");
 467                 }
 468                 if (type(p) == EMPTYRE) {
 469                         setvec[lp] = 0;
 470                         return (0);
 471                 }
 472                 if (setvec[lp] != 1) {
 473                         setvec[lp] = 1;
 474                         setcnt++;
 475                 }
 476                 if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
 477                         return (0);             /* empty CCL */
 478                 else
 479                         return (1);
 480         case PLUS:
 481                 if (first(left(p)) == 0)
 482                         return (0);
 483                 return (1);
 484         case STAR:
 485         case QUEST:
 486                 (void) first(left(p));
 487                 return (0);
 488         case CAT:
 489                 if (first(left(p)) == 0 && first(right(p)) == 0)
 490                         return (0);
 491                 return (1);
 492         case OR:
 493                 b = first(right(p));
 494                 if (first(left(p)) == 0 || b == 0)
 495                         return (0);
 496                 return (1);
 497         }
 498         /* can't happen */
 499         FATAL("can't happen: unknown type %d in first", type(p));
 500         /*NOTREACHED*/
 501         return (-1);
 502 }
 503 
 504 /* collects leaves that can follow v into setvec */
 505 static void
 506 follow(Node *v)
 507 {
 508         Node *p;
 509 
 510         if (type(v) == FINAL)
 511                 return;
 512         p = parent(v);
 513         switch (type(p)) {
 514         case STAR:
 515         case PLUS:
 516                 (void) first(v);
 517                 follow(p);
 518                 return;
 519 
 520         case OR:
 521         case QUEST:
 522                 follow(p);
 523                 return;
 524 
 525         case CAT:
 526                 if (v == left(p)) {     /* v is left child of p */
 527                         if (first(right(p)) == 0) {
 528                                 follow(p);
 529                                 return;
 530                         }
 531                 } else          /* v is right child */
 532                         follow(p);
 533                 return;



 534         }
 535 }
 536 
 537 static int
 538 member(int c, const uchar *sarg)        /* is c in s? */
 539 {
 540         uchar *s = (uchar *)sarg;
 541 
 542         while (*s)
 543                 if (c == *s++)
 544                         return (1);
 545         return (0);
 546 }
 547 
 548 
 549 int
 550 match(fa *f, const uchar *p0)   /* shortest match ? */
 551 {
 552         int s, ns;
 553         uchar *p = (uchar *)p0;
 554 
 555         s = f->reset ? makeinit(f, 0) : f->initstat;
 556         if (f->out[s])
 557                 return (1);
 558         do {
 559                 /* assert(*p < NCHARS); */
 560                 if ((ns = f->gototab[s][*p]) != 0)
 561                         s = ns;
 562                 else
 563                         s = cgoto(f, s, *p);
 564                 if (f->out[s])
 565                         return (1);
 566         } while (*p++ != 0);
 567         return (0);
 568 }
 569 
 570 int
 571 pmatch(fa *f, const uchar *p0)          /* longest match, for sub */
 572 {
 573         int s, ns;
 574         uchar *p = (uchar *)p0;
 575         uchar *q;
 576         int i, k;
 577 
 578         /* s = f->reset ? makeinit(f,1) : f->initstat; */
 579         if (f->reset) {
 580                 f->initstat = s = makeinit(f, 1);
 581         } else {
 582                 s = f->initstat;
 583         }
 584         patbeg = p;
 585         patlen = -1;
 586         do {
 587                 q = p;
 588                 do {
 589                         if (f->out[s])               /* final state */
 590                                 patlen = q-p;
 591                         /* assert(*q < NCHARS); */
 592                         if ((ns = f->gototab[s][*q]) != 0)
 593                                 s = ns;
 594                         else
 595                                 s = cgoto(f, s, *q);
 596                         if (s == 1) {   /* no transition */
 597                                 if (patlen >= 0) {
 598                                         patbeg = p;
 599                                         return (1);
 600                                 } else
 601                                         goto nextin;    /* no match */
 602                         }
 603                 } while (*q++ != 0);
 604                 if (f->out[s])
 605                         patlen = q - p - 1;     /* don't count $ */
 606                 if (patlen >= 0) {
 607                         patbeg = p;
 608                         return (1);
 609                 }
 610 nextin:
 611                 s = 2;
 612                 if (f->reset) {
 613                         for (i = 2; i <= f->curstat; i++)
 614                                 xfree(f->posns[i]);
 615                         k = *f->posns[0];
 616                         if ((f->posns[2] =
 617                             (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 618                                 overflo("out of space in pmatch");
 619                         }
 620                         for (i = 0; i <= k; i++)
 621                                 (f->posns[2])[i] = (f->posns[0])[i];
 622                         f->initstat = f->curstat = 2;
 623                         f->out[2] = f->out[0];
 624                         for (i = 0; i < NCHARS; i++)
 625                                 f->gototab[2][i] = 0;
 626                 }
 627         } while (*p++ != 0);
 628         return (0);
 629 }
 630 
 631 int
 632 nematch(fa *f, const uchar *p0)         /* non-empty match, for sub */
 633 {
 634         int s, ns;
 635         uchar *p = (uchar *)p0;
 636         uchar *q;
 637         int i, k;
 638 
 639         /* s = f->reset ? makeinit(f,1) : f->initstat; */
 640         if (f->reset) {
 641                 f->initstat = s = makeinit(f, 1);
 642         } else {
 643                 s = f->initstat;
 644         }
 645         patlen = -1;
 646         while (*p) {
 647                 q = p;
 648                 do {
 649                         if (f->out[s])               /* final state */
 650                                 patlen = q - p;
 651                         /* assert(*q < NCHARS); */
 652                         if ((ns = f->gototab[s][*q]) != 0)
 653                                 s = ns;
 654                         else
 655                                 s = cgoto(f, s, *q);
 656                         if (s == 1) {   /* no transition */
 657                                 if (patlen > 0) {
 658                                         patbeg = p;
 659                                         return (1);
 660                                 } else
 661                                         goto nnextin;   /* no nonempty match */
 662                         }
 663                 } while (*q++ != 0);
 664                 if (f->out[s])
 665                         patlen = q - p - 1;     /* don't count $ */
 666                 if (patlen > 0) {
 667                         patbeg = p;
 668                         return (1);
 669                 }
 670 nnextin:
 671                 s = 2;
 672                 if (f->reset) {
 673                         for (i = 2; i <= f->curstat; i++)
 674                                 xfree(f->posns[i]);
 675                         k = *f->posns[0];
 676                         if ((f->posns[2] =
 677                             (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 678                                 overflo("out of state space");
 679                         }
 680                         for (i = 0; i <= k; i++)
 681                                 (f->posns[2])[i] = (f->posns[0])[i];
 682                         f->initstat = f->curstat = 2;
 683                         f->out[2] = f->out[0];
 684                         for (i = 0; i < NCHARS; i++)
 685                                 f->gototab[2][i] = 0;
 686                 }
 687                 p++;
 688         }
 689         return (0);
 690 }
 691 



 692 static Node *
 693 reparse(const uchar *p)
 694 {
 695         /* parses regular expression pointed to by p */
 696         /* uses relex() to scan regular expression */
 697         Node *np;
 698 
 699         dprintf(("reparse <%s>\n", p));
 700         /* prestr points to string to be parsed */
 701         lastre = prestr = (uchar *)p;
 702         rtok = relex();
 703         /* GNU compatibility: an empty regexp matches anything */


 704         if (rtok == '\0') {
 705                 /* FATAL("empty regular expression"); previous */
 706                 return (op2(EMPTYRE, NIL, NIL));


 707         }
 708         np = regexp();
 709         if (rtok != '\0') {
 710                 FATAL("syntax error in regular expression %s at %s",
 711                     lastre, prestr);
 712         }
 713         return (np);
 714 }
 715 
 716 static Node *
 717 regexp(void)    /* top-level parse of reg expr */
 718 {
 719         return (alt(concat(primary())));
 720 }
 721 
 722 static Node *
 723 primary(void)
 724 {
 725         Node *np;
 726 
 727         switch (rtok) {
 728         case CHAR:
 729                 np = op2(CHAR, NIL, itonp(rlxval));
 730                 rtok = relex();
 731                 return (unary(np));
 732         case ALL:
 733                 rtok = relex();
 734                 return (unary(op2(ALL, NIL, NIL)));
 735         case EMPTYRE:
 736                 rtok = relex();
 737                 return (unary(op2(ALL, NIL, NIL)));
 738         case DOT:
 739                 rtok = relex();
 740                 return (unary(op2(DOT, NIL, NIL)));
 741         case CCL:
 742                 /*LINTED align*/
 743                 np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
 744                 rtok = relex();
 745                 return (unary(np));
 746         case NCCL:
 747                 /*LINTED align*/
 748                 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
 749                 rtok = relex();
 750                 return (unary(np));
 751         case '^':
 752                 rtok = relex();
 753                 return (unary(op2(CHAR, NIL, itonp(HAT))));
 754         case '$':
 755                 rtok = relex();
 756                 return (unary(op2(CHAR, NIL, NIL)));
 757         case '(':
 758                 rtok = relex();
 759                 if (rtok == ')') {      /* special pleading for () */
 760                         rtok = relex();
 761                         return (unary(op2(CCL, NIL,
 762                             /*LINTED align*/
 763                             (Node *)tostring((uchar *)""))));
 764                 }
 765                 np = regexp();
 766                 if (rtok == ')') {
 767                         rtok = relex();
 768                         return (unary(np));
 769                 } else {
 770                         FATAL("syntax error in regular expression %s at %s",
 771                             lastre, prestr);
 772                 }
 773         default:
 774                 FATAL("illegal primary in regular expression %s at %s",
 775                     lastre, prestr);
 776         }
 777         /*NOTREACHED*/
 778         return (NULL);
 779 }
 780 
 781 static Node *
 782 concat(Node *np)
 783 {
 784         switch (rtok) {
 785         case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL:
 786         case '$': case '(':
 787                 return (concat(op2(CAT, np, primary())));
 788         default:
 789                 return (np);
 790         }
 791 }
 792 
 793 static Node *
 794 alt(Node *np)
 795 {
 796         if (rtok == OR) {
 797                 rtok = relex();
 798                 return (alt(op2(OR, np, concat(primary()))));
 799         }
 800         return (np);
 801 }
 802 
 803 static Node *
 804 unary(Node *np)
 805 {
 806         switch (rtok) {
 807         case STAR:
 808                 rtok = relex();
 809                 return (unary(op2(STAR, np, NIL)));
 810         case PLUS:
 811                 rtok = relex();
 812                 return (unary(op2(PLUS, np, NIL)));
 813         case QUEST:
 814                 rtok = relex();
 815                 return (unary(op2(QUEST, np, NIL)));
 816         default:
 817                 return (np);
 818         }
 819 }
 820 
 821 /*
 822  * Character class definitions conformant to the POSIX locale as
 823  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
 824  * and operating character sets are both ASCII (ISO646) or supersets
 825  * thereof.
 826  *
 827  * Note that to avoid overflowing the temporary buffer used in
 828  * relex(), the expanded character class (prior to range expansion)
 829  * must be less than twice the size of their full name.
 830  */
 831 
 832 /*
 833  * Because isblank doesn't show up in any of the header files on any
 834  * system i use, it's defined here.  if some other locale has a richer
 835  * definition of "blank", define HAS_ISBLANK and provide your own
 836  * version.
 837  * the parentheses here are an attempt to find a path through the maze
 838  * of macro definition and/or function and/or version provided.  thanks
 839  * to nelson beebe for the suggestion; let's see if it works everywhere.
 840  */
 841 
 842 /* #define HAS_ISBLANK */
 843 #ifndef HAS_ISBLANK
 844 
 845 int
 846 (xisblank)(int c)
 847 {
 848         return (c == ' ' || c == '\t');
 849 }
 850 
 851 #endif
 852 
 853 struct charclass {
 854         const char *cc_name;
 855         int cc_namelen;
 856         int (*cc_func)(int);
 857 } charclasses[] = {
 858         { "alnum",      5,      isalnum },
 859         { "alpha",      5,      isalpha },
 860 #ifndef HAS_ISBLANK
 861         { "blank",      5,      isspace }, /* was isblank */
 862 #else
 863         { "blank",      5,      isblank },
 864 #endif
 865         { "cntrl",      5,      iscntrl },
 866         { "digit",      5,      isdigit },
 867         { "graph",      5,      isgraph },
 868         { "lower",      5,      islower },
 869         { "print",      5,      isprint },
 870         { "punct",      5,      ispunct },
 871         { "space",      5,      isspace },
 872         { "upper",      5,      isupper },
 873         { "xdigit",     6,      isxdigit },
 874         { NULL,         0,      NULL },
 875 };
 876 
 877 static int
 878 relex(void)             /* lexical analyzer for reparse */
 879 {
 880         int c, n;
 881         int cflag;
 882         uchar *buf = 0;
 883         size_t bufsz = 100;
 884         uchar *bp;
 885         struct charclass *cc;
 886         int i;
 887 
 888         switch (c = *prestr++) {
 889         case '|': return OR;
 890         case '*': return STAR;
 891         case '+': return PLUS;
 892         case '?': return QUEST;
 893         case '.': return DOT;
 894         case '\0': prestr--; return '\0';
 895         case '^':
 896         case '$':
 897         case '(':
 898         case ')':
 899                 return (c);
 900         case '\\':
 901                 rlxval = quoted(&prestr);





















 902                 return (CHAR);
 903         default:
 904                 rlxval = c;
 905                 return (CHAR);
 906         case '[':
 907                 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
 908                         FATAL("out of space in reg expr %.10s..", lastre);
 909                 bp = buf;
 910                 if (*prestr == '^') {
 911                         cflag = 1;
 912                         prestr++;
 913                 } else
 914                         cflag = 0;
 915                 n = 2 * strlen((const char *) prestr)+1;
 916                 if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
 917                         FATAL("out of space for reg expr %.10s...", lastre);
 918                 for (;;) {
 919                         if ((c = *prestr++) == '\\') {
 920                                 *bp++ = '\\';
 921                                 if ((c = *prestr++) == '\0') {
 922                                         FATAL(
 923                         "nonterminated character class %s", lastre);
 924                                 }
 925                                 *bp++ = c;
 926                         } else if (c == '[' && *prestr == ':') {
 927                                 /*
 928                                  * POSIX char class names, Dag-Erling
 929                                  * Smorgrav, des@ofug.org
 930                                  */
 931                                 for (cc = charclasses; cc->cc_name; cc++)
 932                                         if (strncmp((const char *) prestr + 1,
 933                                             (const char *) cc->cc_name,
 934                                             cc->cc_namelen) == 0)
 935                                                 break;
 936                                 if (cc->cc_name != NULL &&
 937                                     prestr[1 + cc->cc_namelen] == ':' &&
 938                                     prestr[2 + cc->cc_namelen] == ']') {
 939                                         prestr += cc->cc_namelen + 3;
 940                                         for (i = 0; i < NCHARS; i++) {
 941                                                 if (!adjbuf(&buf, &bufsz,
 942                                                     bp - buf + 1, 100, &bp,
 943                                                     "relex2"))
 944                                                         FATAL(
 945                                 "out of space for reg expr %.10s...", lastre);
 946                                                 if (cc->cc_func(i)) {
 947                                                         *bp++ = i;
 948                                                         n++;
 949                                                 }
 950                                         }
 951                                 } else
 952                                         *bp++ = c;
 953                         } else if (c == '\0') {
 954                                 FATAL("nonterminated character class %.20s",
 955                                     lastre);
 956                         } else if (bp == buf) { /* 1st char is special */
 957                                 *bp++ = c;
 958                         } else if (c == ']') {
 959                                 *bp++ = 0;
 960                                 rlxstr = tostring(buf);

 961                                 if (cflag == 0)
 962                                         return (CCL);
 963                                 else
 964                                         return (NCCL);






 965                         } else
 966                                 *bp++ = c;
 967                 }
 968                 /*NOTREACHED*/
 969         }
 970         return (0);
 971 }
 972 
 973 static int
 974 cgoto(fa *f, int s, int c)
 975 {
 976         int i, j, k;
 977         int *p, *q;
 978 
 979         assert(c == HAT || c < NCHARS);
 980         while (f->accept >= maxsetvec) {  /* guessing here! */
 981                 maxsetvec *= 4;
 982                 setvec = (int *)realloc(setvec, maxsetvec * sizeof (int));
 983                 tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int));
 984                 if (setvec == 0 || tmpset == 0)
 985                         overflo("out of space in cgoto()");
 986         }
 987         for (i = 0; i <= f->accept; i++)
 988                 setvec[i] = 0;
 989         setcnt = 0;
 990         /* compute positions of gototab[s,c] into setvec */
 991         p = f->posns[s];
 992         for (i = 1; i <= *p; i++) {
 993                 if ((k = f->re[p[i]].ltype) != FINAL) {
 994                         if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) ||
 995                             (k == DOT && c != 0 && c != HAT) ||
 996                             (k == ALL && c != 0) ||
 997                             (k == EMPTYRE && c != 0) ||
 998                             (k == CCL &&
 999                             member(c, (uchar *)f->re[p[i]].lval.up)) ||
1000                             ((k == NCCL &&
1001                             !member(c, (uchar *)f->re[p[i]].lval.up)) &&
1002                             c != 0 && c != HAT)) {
1003                                 q = f->re[p[i]].lfollow;
1004                                 for (j = 1; j <= *q; j++) {
1005                                         if (q[j] >= maxsetvec) {
1006                                                 maxsetvec *= 4;
1007                                                 setvec = (int *)realloc(setvec,
1008                                                     maxsetvec * sizeof (int));
1009                                                 tmpset = (int *)realloc(tmpset,
1010                                                     maxsetvec * sizeof (int));
1011                                                 if (setvec == 0 ||
1012                                                     tmpset == 0)
1013                                                         overflo(
1014                                                 "cgoto overflow");
1015                                         }
1016                                         if (setvec[q[j]] == 0) {
1017                                                 setcnt++;
1018                                                 setvec[q[j]] = 1;
1019                                         }
1020                                 }
1021                         }
1022                 }
1023         }
1024         /* determine if setvec is a previous state */
1025         tmpset[0] = setcnt;
1026         j = 1;
1027         for (i = f->accept; i >= 0; i--)
1028                 if (setvec[i]) {
1029                         tmpset[j++] = i;
1030                 }
1031         /* tmpset == previous state? */
1032         for (i = 1; i <= f->curstat; i++) {
1033                 p = f->posns[i];
1034                 if ((k = tmpset[0]) != p[0])
1035                         goto different;
1036                 for (j = 1; j <= k; j++)
1037                         if (tmpset[j] != p[j])
1038                                 goto different;
1039                 /* setvec is state i */
1040                 f->gototab[s][c] = i;
1041                 return (i);
1042         different:;
1043         }
1044 
1045         /* add tmpset to current set of states */
1046         if (f->curstat >= NSTATES - 1) {
1047                 f->curstat = 2;
1048                 f->reset = 1;
1049                 for (i = 2; i < NSTATES; i++)
1050                         xfree(f->posns[i]);
1051         } else
1052                 ++(f->curstat);
1053         for (i = 0; i < NCHARS; i++)
1054                 f->gototab[f->curstat][i] = 0;
1055         xfree(f->posns[f->curstat]);
1056         if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
1057                 overflo("out of space in cgoto");
1058 
1059         f->posns[f->curstat] = p;
1060         f->gototab[s][c] = f->curstat;
1061         for (i = 0; i <= setcnt; i++)
1062                 p[i] = tmpset[i];
1063         if (setvec[f->accept])
1064                 f->out[f->curstat] = 1;
1065         else
1066                 f->out[f->curstat] = 0;
1067         return (f->curstat);
1068 }
1069 
1070 static void
1071 freefa(fa *f)   /* free a finite automaton */
1072 {
1073         int i;

1074 
1075         if (f == NULL)
1076                 return;
1077         for (i = 0; i <= f->curstat; i++)
1078                 xfree(f->posns[i]);
1079         for (i = 0; i <= f->accept; i++) {
1080                 xfree(f->re[i].lfollow);
1081                 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1082                         xfree((f->re[i].lval.np));
1083         }
1084         xfree(f->restr);
1085         xfree(f);
1086 }