1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 
  23 /*
  24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  25  * Use is subject to license terms.
  26  */
  27 
  28 /*
  29  * Copyright (C) Lucent Technologies 1997
  30  * All Rights Reserved
  31  *
  32  * Permission to use, copy, modify, and distribute this software and
  33  * its documentation for any purpose and without fee is hereby
  34  * granted, provided that the above copyright notice appear in all
  35  * copies and that both that the copyright notice and this
  36  * permission notice and warranty disclaimer appear in supporting
  37  * documentation, and that the name Lucent Technologies or any of
  38  * its entities not be used in advertising or publicity pertaining
  39  * to distribution of the software without specific, written prior
  40  * permission.
  41  *
  42  * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  43  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  44  * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  45  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  46  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  47  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  48  * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  49  * THIS SOFTWARE.
  50  */
  51 
  52 #define DEBUG
  53 
  54 #include "awk.h"
  55 #include "y.tab.h"
  56 
  57 #define HAT     (NCHARS+2)      /* matches ^ in regular expr */
  58                                 /* NCHARS is 2**n */
  59 #define MAXLIN  22
  60 
  61 #define type(v)         (v)->nobj    /* badly overloaded here */
  62 #define info(v)         (v)->ntype   /* badly overloaded here */
  63 #define left(v)         (v)->narg[0]
  64 #define right(v)        (v)->narg[1]
  65 #define parent(v)       (v)->nnext
  66 
  67 #define LEAF    case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
  68 #define ELEAF   case EMPTYRE:           /* empty string in regexp */
  69 #define UNARY   case STAR: case PLUS: case QUEST:
  70 
  71 /*
  72  * encoding in tree Nodes:
  73  *      leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
  74  *              left is index, right contains value or pointer to value
  75  *      unary (STAR, PLUS, QUEST): left is child, right is null
  76  *      binary (CAT, OR): left and right are children
  77  *      parent contains pointer to parent
  78  */
  79 
  80 int     *setvec;
  81 int     *tmpset;
  82 int     maxsetvec = 0;
  83 
  84 int     rtok;           /* next token in current re */
  85 int     rlxval;
  86 static  uchar   *rlxstr;
  87 static  uchar   *prestr;        /* current position in current re */
  88 static  uchar   *lastre;        /* origin of last re */
  89 
  90 static  int setcnt;
  91 static  int poscnt;
  92 
  93 uchar   *patbeg;
  94 int     patlen;
  95 
  96 #define NFA     20      /* cache this many dynamic fa's */
  97 fa      *fatab[NFA];
  98 int     nfatab  = 0;    /* entries in fatab */
  99 
 100 static fa       *mkdfa(const uchar *, int);
 101 static int      makeinit(fa *, int);
 102 static void     penter(Node *);
 103 static void     freetr(Node *);
 104 static void     overflo(const char *);
 105 static void     cfoll(fa *, Node *);
 106 static void     follow(Node *);
 107 static Node     *reparse(const uchar *);
 108 static int      relex(void);
 109 static void     freefa(fa *);
 110 static int      cgoto(fa *, int, int);
 111 static Node     *regexp(void);
 112 static Node     *primary(void);
 113 static Node     *concat(Node *);
 114 static Node     *alt(Node *);
 115 static Node     *unary(Node *);
 116 
 117 fa *
 118 makedfa(const uchar *s, int anchor)     /* returns dfa for reg expr s */
 119 {
 120         int i, use, nuse;
 121         fa *pfa;
 122         static int now = 1;
 123 
 124         if (setvec == 0) {      /* first time through any RE */
 125                 maxsetvec = MAXLIN;
 126                 setvec = (int *)malloc(maxsetvec * sizeof (int));
 127                 tmpset = (int *)malloc(maxsetvec * sizeof (int));
 128                 if (setvec == 0 || tmpset == 0)
 129                         overflo("out of space initializing makedfa");
 130         }
 131 
 132         if (compile_time)       /* a constant for sure */
 133                 return (mkdfa(s, anchor));
 134         for (i = 0; i < nfatab; i++) {       /* is it there already? */
 135                 if (fatab[i]->anchor == anchor &&
 136                     strcmp((const char *)fatab[i]->restr, (char *)s) == 0) {
 137                         fatab[i]->use = now++;
 138                         return (fatab[i]);
 139                 }
 140         }
 141         pfa = mkdfa(s, anchor);
 142         if (nfatab < NFA) {  /* room for another */
 143                 fatab[nfatab] = pfa;
 144                 fatab[nfatab]->use = now++;
 145                 nfatab++;
 146                 return (pfa);
 147         }
 148         use = fatab[0]->use; /* replace least-recently used */
 149         nuse = 0;
 150         for (i = 1; i < nfatab; i++) {
 151                 if (fatab[i]->use < use) {
 152                         use = fatab[i]->use;
 153                         nuse = i;
 154                 }
 155         }
 156         freefa(fatab[nuse]);
 157         fatab[nuse] = pfa;
 158         pfa->use = now++;
 159         return (pfa);
 160 }
 161 
 162 fa *
 163 mkdfa(const uchar *s, int anchor)
 164 {                               /* does the real work of making a dfa */
 165                                 /* anchor = 1 for anchored matches, else 0 */
 166         Node *p, *p1;
 167         fa *f;
 168 
 169         p = reparse(s);
 170         p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 171                 /* put ALL STAR in front of reg.  exp. */
 172         p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
 173                 /* put FINAL after reg.  exp. */
 174 
 175         poscnt = 0;
 176         penter(p1);     /* enter parent pointers and leaf indices */
 177         if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
 178                 overflo("out of space for fa");
 179         /* penter has computed number of positions in re */
 180         f->accept = poscnt-1;
 181         cfoll(f, p1);   /* set up follow sets */
 182         freetr(p1);
 183         if ((f->posns[0] =
 184             (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
 185                         overflo("out of space in makedfa");
 186         }
 187         if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
 188                 overflo("out of space in makedfa");
 189         *f->posns[1] = 0;
 190         f->initstat = makeinit(f, anchor);
 191         f->anchor = anchor;
 192         f->restr = tostring(s);
 193         return (f);
 194 }
 195 
 196 static int
 197 makeinit(fa *f, int anchor)
 198 {
 199         int i, k;
 200 
 201         f->curstat = 2;
 202         f->out[2] = 0;
 203         f->reset = 0;
 204         k = *(f->re[0].lfollow);
 205         xfree(f->posns[2]);
 206         if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
 207                 overflo("out of space in makeinit");
 208         for (i = 0; i <= k; i++) {
 209                 (f->posns[2])[i] = (f->re[0].lfollow)[i];
 210         }
 211         if ((f->posns[2])[1] == f->accept)
 212                 f->out[2] = 1;
 213         for (i = 0; i < NCHARS; i++)
 214                 f->gototab[2][i] = 0;
 215         f->curstat = cgoto(f, 2, HAT);
 216         if (anchor) {
 217                 *f->posns[2] = k-1;  /* leave out position 0 */
 218                 for (i = 0; i < k; i++) {
 219                         (f->posns[0])[i] = (f->posns[2])[i];
 220                 }
 221 
 222                 f->out[0] = f->out[2];
 223                 if (f->curstat != 2)
 224                         --(*f->posns[f->curstat]);
 225         }
 226         return (f->curstat);
 227 }
 228 
 229 void
 230 penter(Node *p) /* set up parent pointers and leaf indices */
 231 {
 232         switch (type(p)) {
 233         ELEAF
 234         LEAF
 235                 info(p) = poscnt;
 236                 poscnt++;
 237                 break;
 238         UNARY
 239                 penter(left(p));
 240                 parent(left(p)) = p;
 241                 break;
 242         case CAT:
 243         case OR:
 244                 penter(left(p));
 245                 penter(right(p));
 246                 parent(left(p)) = p;
 247                 parent(right(p)) = p;
 248                 break;
 249         default:        /* can't happen */
 250                 FATAL("can't happen: unknown type %d in penter", type(p));
 251                 break;
 252         }
 253 }
 254 
 255 static void
 256 freetr(Node *p) /* free parse tree */
 257 {
 258         switch (type(p)) {
 259         ELEAF
 260         LEAF
 261                 xfree(p);
 262                 break;
 263         UNARY
 264                 freetr(left(p));
 265                 xfree(p);
 266                 break;
 267         case CAT:
 268         case OR:
 269                 freetr(left(p));
 270                 freetr(right(p));
 271                 xfree(p);
 272                 break;
 273         default:        /* can't happen */
 274                 FATAL("can't happen: unknown type %d in freetr", type(p));
 275                 break;
 276         }
 277 }
 278 
 279 /*
 280  * in the parsing of regular expressions, metacharacters like . have
 281  * to be seen literally;  \056 is not a metacharacter.
 282  */
 283 int
 284 hexstr(uchar **pp)      /* find and eval hex string at pp, return new p */
 285 {                       /* only pick up one 8-bit byte (2 chars) */
 286         uchar *p;
 287         int n = 0;
 288         int i;
 289 
 290         for (i = 0, p = (uchar *)*pp; i < 2 && isxdigit(*p); i++, p++) {
 291                 if (isdigit(*p))
 292                         n = 16 * n + *p - '0';
 293                 else if (*p >= 'a' && *p <= 'f')
 294                         n = 16 * n + *p - 'a' + 10;
 295                 else if (*p >= 'A' && *p <= 'F')
 296                         n = 16 * n + *p - 'A' + 10;
 297         }
 298         *pp = (uchar *)p;
 299         return (n);
 300 }
 301 
 302 /* multiple use of arg */
 303 #define isoctdigit(c)   ((c) >= '0' && (c) <= '7')
 304 
 305 int
 306 quoted(uchar **pp)      /* pick up next thing after a \\ */
 307 {                       /* and increment *pp */
 308 
 309         uchar *p = *pp;
 310         int c;
 311 
 312         if ((c = *p++) == 't')
 313                 c = '\t';
 314         else if (c == 'n')
 315                 c = '\n';
 316         else if (c == 'f')
 317                 c = '\f';
 318         else if (c == 'r')
 319                 c = '\r';
 320         else if (c == 'b')
 321                 c = '\b';
 322         else if (c == '\\')
 323                 c = '\\';
 324         else if (c == 'x') {    /* hexadecimal goo follows */
 325                 c = hexstr(&p);     /* this adds a null if number is invalid */
 326         } else if (isoctdigit(c)) {     /* \d \dd \ddd */
 327                 int n = c - '0';
 328                 if (isoctdigit(*p)) {
 329                         n = 8 * n + *p++ - '0';
 330                         if (isoctdigit(*p))
 331                                 n = 8 * n + *p++ - '0';
 332                 }
 333                 c = n;
 334         } /* else */
 335                 /* c = c; */
 336         *pp = p;
 337         return (c);
 338 }
 339 
 340 uchar *
 341 cclenter(const uchar *argp)             /* add a character class */
 342 {
 343         int i, c, c2;
 344         uchar  *p = (uchar *)argp;
 345         uchar *op, *bp;
 346         static uchar *buf = NULL;
 347         size_t  bufsz = 100;
 348 
 349         op = p;
 350         if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
 351                 FATAL("out of space for character class [%.10s...] 1", p);
 352         bp = buf;
 353         for (i = 0; (c = *p++) != 0; ) {
 354                 if (c == '\\') {
 355                         c = quoted(&p);
 356                 } else if (c == '-' && i > 0 && bp[-1] != 0) {
 357                         if (*p != 0) {
 358                                 c = bp[-1];
 359                                 c2 = *p++;
 360                                 if (c2 == '\\')
 361                                         c2 = quoted(&p);
 362                                 if (c > c2) {   /* empty; ignore */
 363                                         bp--;
 364                                         i--;
 365                                         continue;
 366                                 }
 367                                 while (c < c2) {
 368                                         if (adjbuf(&buf, &bufsz,
 369                                             bp - buf + 2, 100, &bp,
 370                                             "cclenter1") == 0)
 371                                                 FATAL(
 372                         "out of space for character class [%.10s...] 2", p);
 373                                         *bp++ = ++c;
 374                                         i++;
 375                                 }
 376                                 continue;
 377                         }
 378                 }
 379                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
 380                         FATAL(
 381                 "out of space for character class [%.10s...] 3", p);
 382                 *bp++ = c;
 383                 i++;
 384         }
 385         *bp = 0;
 386         dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf));
 387         xfree(op);
 388         return (tostring(buf));
 389 }
 390 
 391 static void
 392 overflo(const char *s)
 393 {
 394         FATAL("regular expression too big: %.30s...", gettext((char *)s));
 395 }
 396 
 397 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
 398 static void
 399 cfoll(fa *f, Node *v)
 400 {
 401         int i;
 402         int *p;
 403 
 404         switch (type(v)) {
 405         ELEAF
 406         LEAF
 407                 f->re[info(v)].ltype = type(v);
 408                 f->re[info(v)].lval.np = right(v);
 409                 while (f->accept >= maxsetvec) {  /* guessing here! */
 410                         maxsetvec *= 4;
 411                         setvec = (int *)realloc(setvec, maxsetvec *
 412                             sizeof (int));
 413                         tmpset = (int *)realloc(tmpset, maxsetvec *
 414                             sizeof (int));
 415                         if (setvec == 0 || tmpset == 0)
 416                                 overflo("out of space in cfoll()");
 417                 }
 418                 for (i = 0; i <= f->accept; i++)
 419                         setvec[i] = 0;
 420                 setcnt = 0;
 421                 follow(v);      /* computes setvec and setcnt */
 422                 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
 423                         overflo("out of space building follow set");
 424                 f->re[info(v)].lfollow = p;
 425                 *p = setcnt;
 426                 for (i = f->accept; i >= 0; i--) {
 427                         if (setvec[i] == 1)
 428                                 *++p = i;
 429                 }
 430                 break;
 431         UNARY
 432                 cfoll(f, left(v));
 433                 break;
 434         case CAT:
 435         case OR:
 436                 cfoll(f, left(v));
 437                 cfoll(f, right(v));
 438                 break;
 439         default:        /* can't happen */
 440                 FATAL("can't happen: unknown type %d in cfoll", type(v));
 441         }
 442 }
 443 
 444 /*
 445  * collects initially active leaves of p into setvec
 446  * returns 0 or 1 depending on whether p matches empty string
 447  */
 448 static int
 449 first(Node *p)
 450 {
 451         int b, lp;
 452 
 453         switch (type(p)) {
 454         ELEAF
 455         LEAF
 456                 /* look for high-water mark of subscripts */
 457                 lp = info(p);
 458                 /* guessing here! */
 459                 while (setcnt >= maxsetvec || lp >= maxsetvec) {
 460                         maxsetvec *= 4;
 461                         setvec = (int *)realloc(setvec, maxsetvec *
 462                             sizeof (int));
 463                         tmpset = (int *)realloc(tmpset, maxsetvec *
 464                             sizeof (int));
 465                         if (setvec == 0 || tmpset == 0)
 466                                 overflo("out of space in first()");
 467                 }
 468                 if (type(p) == EMPTYRE) {
 469                         setvec[lp] = 0;
 470                         return (0);
 471                 }
 472                 if (setvec[lp] != 1) {
 473                         setvec[lp] = 1;
 474                         setcnt++;
 475                 }
 476                 if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
 477                         return (0);             /* empty CCL */
 478                 else
 479                         return (1);
 480         case PLUS:
 481                 if (first(left(p)) == 0)
 482                         return (0);
 483                 return (1);
 484         case STAR:
 485         case QUEST:
 486                 (void) first(left(p));
 487                 return (0);
 488         case CAT:
 489                 if (first(left(p)) == 0 && first(right(p)) == 0)
 490                         return (0);
 491                 return (1);
 492         case OR:
 493                 b = first(right(p));
 494                 if (first(left(p)) == 0 || b == 0)
 495                         return (0);
 496                 return (1);
 497         }
 498         /* can't happen */
 499         FATAL("can't happen: unknown type %d in first", type(p));
 500         /*NOTREACHED*/
 501         return (-1);
 502 }
 503 
 504 /* collects leaves that can follow v into setvec */
 505 static void
 506 follow(Node *v)
 507 {
 508         Node *p;
 509 
 510         if (type(v) == FINAL)
 511                 return;
 512         p = parent(v);
 513         switch (type(p)) {
 514         case STAR:
 515         case PLUS:
 516                 (void) first(v);
 517                 follow(p);
 518                 return;
 519 
 520         case OR:
 521         case QUEST:
 522                 follow(p);
 523                 return;
 524 
 525         case CAT:
 526                 if (v == left(p)) {     /* v is left child of p */
 527                         if (first(right(p)) == 0) {
 528                                 follow(p);
 529                                 return;
 530                         }
 531                 } else          /* v is right child */
 532                         follow(p);
 533                 return;
 534         }
 535 }
 536 
 537 static int
 538 member(int c, const uchar *sarg)        /* is c in s? */
 539 {
 540         uchar *s = (uchar *)sarg;
 541 
 542         while (*s)
 543                 if (c == *s++)
 544                         return (1);
 545         return (0);
 546 }
 547 
 548 
 549 int
 550 match(fa *f, const uchar *p0)   /* shortest match ? */
 551 {
 552         int s, ns;
 553         uchar *p = (uchar *)p0;
 554 
 555         s = f->reset ? makeinit(f, 0) : f->initstat;
 556         if (f->out[s])
 557                 return (1);
 558         do {
 559                 /* assert(*p < NCHARS); */
 560                 if ((ns = f->gototab[s][*p]) != 0)
 561                         s = ns;
 562                 else
 563                         s = cgoto(f, s, *p);
 564                 if (f->out[s])
 565                         return (1);
 566         } while (*p++ != 0);
 567         return (0);
 568 }
 569 
 570 int
 571 pmatch(fa *f, const uchar *p0)          /* longest match, for sub */
 572 {
 573         int s, ns;
 574         uchar *p = (uchar *)p0;
 575         uchar *q;
 576         int i, k;
 577 
 578         /* s = f->reset ? makeinit(f,1) : f->initstat; */
 579         if (f->reset) {
 580                 f->initstat = s = makeinit(f, 1);
 581         } else {
 582                 s = f->initstat;
 583         }
 584         patbeg = p;
 585         patlen = -1;
 586         do {
 587                 q = p;
 588                 do {
 589                         if (f->out[s])               /* final state */
 590                                 patlen = q-p;
 591                         /* assert(*q < NCHARS); */
 592                         if ((ns = f->gototab[s][*q]) != 0)
 593                                 s = ns;
 594                         else
 595                                 s = cgoto(f, s, *q);
 596                         if (s == 1) {   /* no transition */
 597                                 if (patlen >= 0) {
 598                                         patbeg = p;
 599                                         return (1);
 600                                 } else
 601                                         goto nextin;    /* no match */
 602                         }
 603                 } while (*q++ != 0);
 604                 if (f->out[s])
 605                         patlen = q - p - 1;     /* don't count $ */
 606                 if (patlen >= 0) {
 607                         patbeg = p;
 608                         return (1);
 609                 }
 610 nextin:
 611                 s = 2;
 612                 if (f->reset) {
 613                         for (i = 2; i <= f->curstat; i++)
 614                                 xfree(f->posns[i]);
 615                         k = *f->posns[0];
 616                         if ((f->posns[2] =
 617                             (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 618                                 overflo("out of space in pmatch");
 619                         }
 620                         for (i = 0; i <= k; i++)
 621                                 (f->posns[2])[i] = (f->posns[0])[i];
 622                         f->initstat = f->curstat = 2;
 623                         f->out[2] = f->out[0];
 624                         for (i = 0; i < NCHARS; i++)
 625                                 f->gototab[2][i] = 0;
 626                 }
 627         } while (*p++ != 0);
 628         return (0);
 629 }
 630 
 631 int
 632 nematch(fa *f, const uchar *p0)         /* non-empty match, for sub */
 633 {
 634         int s, ns;
 635         uchar *p = (uchar *)p0;
 636         uchar *q;
 637         int i, k;
 638 
 639         /* s = f->reset ? makeinit(f,1) : f->initstat; */
 640         if (f->reset) {
 641                 f->initstat = s = makeinit(f, 1);
 642         } else {
 643                 s = f->initstat;
 644         }
 645         patlen = -1;
 646         while (*p) {
 647                 q = p;
 648                 do {
 649                         if (f->out[s])               /* final state */
 650                                 patlen = q - p;
 651                         /* assert(*q < NCHARS); */
 652                         if ((ns = f->gototab[s][*q]) != 0)
 653                                 s = ns;
 654                         else
 655                                 s = cgoto(f, s, *q);
 656                         if (s == 1) {   /* no transition */
 657                                 if (patlen > 0) {
 658                                         patbeg = p;
 659                                         return (1);
 660                                 } else
 661                                         goto nnextin;   /* no nonempty match */
 662                         }
 663                 } while (*q++ != 0);
 664                 if (f->out[s])
 665                         patlen = q - p - 1;     /* don't count $ */
 666                 if (patlen > 0) {
 667                         patbeg = p;
 668                         return (1);
 669                 }
 670 nnextin:
 671                 s = 2;
 672                 if (f->reset) {
 673                         for (i = 2; i <= f->curstat; i++)
 674                                 xfree(f->posns[i]);
 675                         k = *f->posns[0];
 676                         if ((f->posns[2] =
 677                             (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 678                                 overflo("out of state space");
 679                         }
 680                         for (i = 0; i <= k; i++)
 681                                 (f->posns[2])[i] = (f->posns[0])[i];
 682                         f->initstat = f->curstat = 2;
 683                         f->out[2] = f->out[0];
 684                         for (i = 0; i < NCHARS; i++)
 685                                 f->gototab[2][i] = 0;
 686                 }
 687                 p++;
 688         }
 689         return (0);
 690 }
 691 
 692 static Node *
 693 reparse(const uchar *p)
 694 {
 695         /* parses regular expression pointed to by p */
 696         /* uses relex() to scan regular expression */
 697         Node *np;
 698 
 699         dprintf(("reparse <%s>\n", p));
 700         /* prestr points to string to be parsed */
 701         lastre = prestr = (uchar *)p;
 702         rtok = relex();
 703         /* GNU compatibility: an empty regexp matches anything */
 704         if (rtok == '\0') {
 705                 /* FATAL("empty regular expression"); previous */
 706                 return (op2(EMPTYRE, NIL, NIL));
 707         }
 708         np = regexp();
 709         if (rtok != '\0') {
 710                 FATAL("syntax error in regular expression %s at %s",
 711                     lastre, prestr);
 712         }
 713         return (np);
 714 }
 715 
 716 static Node *
 717 regexp(void)    /* top-level parse of reg expr */
 718 {
 719         return (alt(concat(primary())));
 720 }
 721 
 722 static Node *
 723 primary(void)
 724 {
 725         Node *np;
 726 
 727         switch (rtok) {
 728         case CHAR:
 729                 np = op2(CHAR, NIL, itonp(rlxval));
 730                 rtok = relex();
 731                 return (unary(np));
 732         case ALL:
 733                 rtok = relex();
 734                 return (unary(op2(ALL, NIL, NIL)));
 735         case EMPTYRE:
 736                 rtok = relex();
 737                 return (unary(op2(ALL, NIL, NIL)));
 738         case DOT:
 739                 rtok = relex();
 740                 return (unary(op2(DOT, NIL, NIL)));
 741         case CCL:
 742                 /*LINTED align*/
 743                 np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
 744                 rtok = relex();
 745                 return (unary(np));
 746         case NCCL:
 747                 /*LINTED align*/
 748                 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
 749                 rtok = relex();
 750                 return (unary(np));
 751         case '^':
 752                 rtok = relex();
 753                 return (unary(op2(CHAR, NIL, itonp(HAT))));
 754         case '$':
 755                 rtok = relex();
 756                 return (unary(op2(CHAR, NIL, NIL)));
 757         case '(':
 758                 rtok = relex();
 759                 if (rtok == ')') {      /* special pleading for () */
 760                         rtok = relex();
 761                         return (unary(op2(CCL, NIL,
 762                             /*LINTED align*/
 763                             (Node *)tostring((uchar *)""))));
 764                 }
 765                 np = regexp();
 766                 if (rtok == ')') {
 767                         rtok = relex();
 768                         return (unary(np));
 769                 } else {
 770                         FATAL("syntax error in regular expression %s at %s",
 771                             lastre, prestr);
 772                 }
 773         default:
 774                 FATAL("illegal primary in regular expression %s at %s",
 775                     lastre, prestr);
 776         }
 777         /*NOTREACHED*/
 778         return (NULL);
 779 }
 780 
 781 static Node *
 782 concat(Node *np)
 783 {
 784         switch (rtok) {
 785         case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL:
 786         case '$': case '(':
 787                 return (concat(op2(CAT, np, primary())));
 788         default:
 789                 return (np);
 790         }
 791 }
 792 
 793 static Node *
 794 alt(Node *np)
 795 {
 796         if (rtok == OR) {
 797                 rtok = relex();
 798                 return (alt(op2(OR, np, concat(primary()))));
 799         }
 800         return (np);
 801 }
 802 
 803 static Node *
 804 unary(Node *np)
 805 {
 806         switch (rtok) {
 807         case STAR:
 808                 rtok = relex();
 809                 return (unary(op2(STAR, np, NIL)));
 810         case PLUS:
 811                 rtok = relex();
 812                 return (unary(op2(PLUS, np, NIL)));
 813         case QUEST:
 814                 rtok = relex();
 815                 return (unary(op2(QUEST, np, NIL)));
 816         default:
 817                 return (np);
 818         }
 819 }
 820 
 821 /*
 822  * Character class definitions conformant to the POSIX locale as
 823  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
 824  * and operating character sets are both ASCII (ISO646) or supersets
 825  * thereof.
 826  *
 827  * Note that to avoid overflowing the temporary buffer used in
 828  * relex(), the expanded character class (prior to range expansion)
 829  * must be less than twice the size of their full name.
 830  */
 831 
 832 /*
 833  * Because isblank doesn't show up in any of the header files on any
 834  * system i use, it's defined here.  if some other locale has a richer
 835  * definition of "blank", define HAS_ISBLANK and provide your own
 836  * version.
 837  * the parentheses here are an attempt to find a path through the maze
 838  * of macro definition and/or function and/or version provided.  thanks
 839  * to nelson beebe for the suggestion; let's see if it works everywhere.
 840  */
 841 
 842 /* #define HAS_ISBLANK */
 843 #ifndef HAS_ISBLANK
 844 
 845 int
 846 (xisblank)(int c)
 847 {
 848         return (c == ' ' || c == '\t');
 849 }
 850 
 851 #endif
 852 
 853 struct charclass {
 854         const char *cc_name;
 855         int cc_namelen;
 856         int (*cc_func)(int);
 857 } charclasses[] = {
 858         { "alnum",      5,      isalnum },
 859         { "alpha",      5,      isalpha },
 860 #ifndef HAS_ISBLANK
 861         { "blank",      5,      isspace }, /* was isblank */
 862 #else
 863         { "blank",      5,      isblank },
 864 #endif
 865         { "cntrl",      5,      iscntrl },
 866         { "digit",      5,      isdigit },
 867         { "graph",      5,      isgraph },
 868         { "lower",      5,      islower },
 869         { "print",      5,      isprint },
 870         { "punct",      5,      ispunct },
 871         { "space",      5,      isspace },
 872         { "upper",      5,      isupper },
 873         { "xdigit",     6,      isxdigit },
 874         { NULL,         0,      NULL },
 875 };
 876 
 877 static int
 878 relex(void)             /* lexical analyzer for reparse */
 879 {
 880         int c, n;
 881         int cflag;
 882         uchar *buf = 0;
 883         size_t bufsz = 100;
 884         uchar *bp;
 885         struct charclass *cc;
 886         int i;
 887 
 888         switch (c = *prestr++) {
 889         case '|': return OR;
 890         case '*': return STAR;
 891         case '+': return PLUS;
 892         case '?': return QUEST;
 893         case '.': return DOT;
 894         case '\0': prestr--; return '\0';
 895         case '^':
 896         case '$':
 897         case '(':
 898         case ')':
 899                 return (c);
 900         case '\\':
 901                 rlxval = quoted(&prestr);
 902                 return (CHAR);
 903         default:
 904                 rlxval = c;
 905                 return (CHAR);
 906         case '[':
 907                 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
 908                         FATAL("out of space in reg expr %.10s..", lastre);
 909                 bp = buf;
 910                 if (*prestr == '^') {
 911                         cflag = 1;
 912                         prestr++;
 913                 } else
 914                         cflag = 0;
 915                 n = 2 * strlen((const char *) prestr)+1;
 916                 if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
 917                         FATAL("out of space for reg expr %.10s...", lastre);
 918                 for (;;) {
 919                         if ((c = *prestr++) == '\\') {
 920                                 *bp++ = '\\';
 921                                 if ((c = *prestr++) == '\0') {
 922                                         FATAL(
 923                         "nonterminated character class %s", lastre);
 924                                 }
 925                                 *bp++ = c;
 926                         } else if (c == '[' && *prestr == ':') {
 927                                 /*
 928                                  * POSIX char class names, Dag-Erling
 929                                  * Smorgrav, des@ofug.org
 930                                  */
 931                                 for (cc = charclasses; cc->cc_name; cc++)
 932                                         if (strncmp((const char *) prestr + 1,
 933                                             (const char *) cc->cc_name,
 934                                             cc->cc_namelen) == 0)
 935                                                 break;
 936                                 if (cc->cc_name != NULL &&
 937                                     prestr[1 + cc->cc_namelen] == ':' &&
 938                                     prestr[2 + cc->cc_namelen] == ']') {
 939                                         prestr += cc->cc_namelen + 3;
 940                                         for (i = 0; i < NCHARS; i++) {
 941                                                 if (!adjbuf(&buf, &bufsz,
 942                                                     bp - buf + 1, 100, &bp,
 943                                                     "relex2"))
 944                                                         FATAL(
 945                                 "out of space for reg expr %.10s...", lastre);
 946                                                 if (cc->cc_func(i)) {
 947                                                         *bp++ = i;
 948                                                         n++;
 949                                                 }
 950                                         }
 951                                 } else
 952                                         *bp++ = c;
 953                         } else if (c == '\0') {
 954                                 FATAL("nonterminated character class %.20s",
 955                                     lastre);
 956                         } else if (bp == buf) { /* 1st char is special */
 957                                 *bp++ = c;
 958                         } else if (c == ']') {
 959                                 *bp++ = 0;
 960                                 rlxstr = tostring(buf);
 961                                 if (cflag == 0)
 962                                         return (CCL);
 963                                 else
 964                                         return (NCCL);
 965                         } else
 966                                 *bp++ = c;
 967                 }
 968                 /*NOTREACHED*/
 969         }
 970         return (0);
 971 }
 972 
 973 static int
 974 cgoto(fa *f, int s, int c)
 975 {
 976         int i, j, k;
 977         int *p, *q;
 978 
 979         assert(c == HAT || c < NCHARS);
 980         while (f->accept >= maxsetvec) {  /* guessing here! */
 981                 maxsetvec *= 4;
 982                 setvec = (int *)realloc(setvec, maxsetvec * sizeof (int));
 983                 tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int));
 984                 if (setvec == 0 || tmpset == 0)
 985                         overflo("out of space in cgoto()");
 986         }
 987         for (i = 0; i <= f->accept; i++)
 988                 setvec[i] = 0;
 989         setcnt = 0;
 990         /* compute positions of gototab[s,c] into setvec */
 991         p = f->posns[s];
 992         for (i = 1; i <= *p; i++) {
 993                 if ((k = f->re[p[i]].ltype) != FINAL) {
 994                         if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) ||
 995                             (k == DOT && c != 0 && c != HAT) ||
 996                             (k == ALL && c != 0) ||
 997                             (k == EMPTYRE && c != 0) ||
 998                             (k == CCL &&
 999                             member(c, (uchar *)f->re[p[i]].lval.up)) ||
1000                             ((k == NCCL &&
1001                             !member(c, (uchar *)f->re[p[i]].lval.up)) &&
1002                             c != 0 && c != HAT)) {
1003                                 q = f->re[p[i]].lfollow;
1004                                 for (j = 1; j <= *q; j++) {
1005                                         if (q[j] >= maxsetvec) {
1006                                                 maxsetvec *= 4;
1007                                                 setvec = (int *)realloc(setvec,
1008                                                     maxsetvec * sizeof (int));
1009                                                 tmpset = (int *)realloc(tmpset,
1010                                                     maxsetvec * sizeof (int));
1011                                                 if (setvec == 0 ||
1012                                                     tmpset == 0)
1013                                                         overflo(
1014                                                 "cgoto overflow");
1015                                         }
1016                                         if (setvec[q[j]] == 0) {
1017                                                 setcnt++;
1018                                                 setvec[q[j]] = 1;
1019                                         }
1020                                 }
1021                         }
1022                 }
1023         }
1024         /* determine if setvec is a previous state */
1025         tmpset[0] = setcnt;
1026         j = 1;
1027         for (i = f->accept; i >= 0; i--)
1028                 if (setvec[i]) {
1029                         tmpset[j++] = i;
1030                 }
1031         /* tmpset == previous state? */
1032         for (i = 1; i <= f->curstat; i++) {
1033                 p = f->posns[i];
1034                 if ((k = tmpset[0]) != p[0])
1035                         goto different;
1036                 for (j = 1; j <= k; j++)
1037                         if (tmpset[j] != p[j])
1038                                 goto different;
1039                 /* setvec is state i */
1040                 f->gototab[s][c] = i;
1041                 return (i);
1042         different:;
1043         }
1044 
1045         /* add tmpset to current set of states */
1046         if (f->curstat >= NSTATES - 1) {
1047                 f->curstat = 2;
1048                 f->reset = 1;
1049                 for (i = 2; i < NSTATES; i++)
1050                         xfree(f->posns[i]);
1051         } else
1052                 ++(f->curstat);
1053         for (i = 0; i < NCHARS; i++)
1054                 f->gototab[f->curstat][i] = 0;
1055         xfree(f->posns[f->curstat]);
1056         if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
1057                 overflo("out of space in cgoto");
1058 
1059         f->posns[f->curstat] = p;
1060         f->gototab[s][c] = f->curstat;
1061         for (i = 0; i <= setcnt; i++)
1062                 p[i] = tmpset[i];
1063         if (setvec[f->accept])
1064                 f->out[f->curstat] = 1;
1065         else
1066                 f->out[f->curstat] = 0;
1067         return (f->curstat);
1068 }
1069 
1070 static void
1071 freefa(fa *f)   /* free a finite automaton */
1072 {
1073         int i;
1074 
1075         if (f == NULL)
1076                 return;
1077         for (i = 0; i <= f->curstat; i++)
1078                 xfree(f->posns[i]);
1079         for (i = 0; i <= f->accept; i++) {
1080                 xfree(f->re[i].lfollow);
1081                 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1082                         xfree((f->re[i].lval.np));
1083         }
1084         xfree(f->restr);
1085         xfree(f);
1086 }