Print this page
3731 Update nawk to version 20121220

Split Close
Expand all
Collapse all
          --- old/usr/src/cmd/awk/b.c
          +++ new/usr/src/cmd/awk/b.c
↓ open down ↓ 17 lines elided ↑ open up ↑
  18   18   * information: Portions Copyright [yyyy] [name of copyright owner]
  19   19   *
  20   20   * CDDL HEADER END
  21   21   */
  22   22  
  23   23  /*
  24   24   * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  25   25   * Use is subject to license terms.
  26   26   */
  27   27  
  28      -/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  29      -/*        All Rights Reserved   */
  30      -
  31      -#pragma ident   "%Z%%M% %I%     %E% SMI"
       28 +/*
       29 + * Copyright (C) Lucent Technologies 1997
       30 + * All Rights Reserved
       31 + *
       32 + * Permission to use, copy, modify, and distribute this software and
       33 + * its documentation for any purpose and without fee is hereby
       34 + * granted, provided that the above copyright notice appear in all
       35 + * copies and that both that the copyright notice and this
       36 + * permission notice and warranty disclaimer appear in supporting
       37 + * documentation, and that the name Lucent Technologies or any of
       38 + * its entities not be used in advertising or publicity pertaining
       39 + * to distribution of the software without specific, written prior
       40 + * permission.
       41 + *
       42 + * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
       43 + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
       44 + * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
       45 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
       46 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
       47 + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
       48 + * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
       49 + * THIS SOFTWARE.
       50 + */
  32   51  
  33   52  #define DEBUG
  34   53  
  35   54  #include "awk.h"
  36   55  #include "y.tab.h"
  37   56  
  38      -#define HAT     (NCHARS-1)      /* matches ^ in regular expr */
       57 +#define HAT     (NCHARS+2)      /* matches ^ in regular expr */
  39   58                                  /* NCHARS is 2**n */
  40      -#define MAXLIN (3 * LINE_MAX)
       59 +#define MAXLIN  22
  41   60  
  42      -#define type(v)         (v)->nobj
       61 +#define type(v)         (v)->nobj       /* badly overloaded here */
       62 +#define info(v)         (v)->ntype      /* badly overloaded here */
  43   63  #define left(v)         (v)->narg[0]
  44   64  #define right(v)        (v)->narg[1]
  45   65  #define parent(v)       (v)->nnext
  46   66  
  47   67  #define LEAF    case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
       68 +#define ELEAF   case EMPTYRE:           /* empty string in regexp */
  48   69  #define UNARY   case STAR: case PLUS: case QUEST:
  49   70  
  50   71  /*
  51   72   * encoding in tree Nodes:
  52   73   *      leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
  53   74   *              left is index, right contains value or pointer to value
  54   75   *      unary (STAR, PLUS, QUEST): left is child, right is null
  55   76   *      binary (CAT, OR): left and right are children
  56   77   *      parent contains pointer to parent
  57   78   */
  58   79  
  59      -int     setvec[MAXLIN];
  60      -int     tmpset[MAXLIN];
  61      -Node    *point[MAXLIN];
       80 +int     *setvec;
       81 +int     *tmpset;
       82 +int     maxsetvec = 0;
  62   83  
  63   84  int     rtok;           /* next token in current re */
  64   85  int     rlxval;
  65      -uchar   *rlxstr;
  66      -uchar   *prestr;        /* current position in current re */
  67      -uchar   *lastre;        /* origin of last re */
       86 +static  uchar   *rlxstr;
       87 +static  uchar   *prestr;        /* current position in current re */
       88 +static  uchar   *lastre;        /* origin of last re */
  68   89  
  69   90  static  int setcnt;
  70   91  static  int poscnt;
  71   92  
  72   93  uchar   *patbeg;
  73   94  int     patlen;
  74   95  
  75   96  #define NFA     20      /* cache this many dynamic fa's */
  76   97  fa      *fatab[NFA];
  77   98  int     nfatab  = 0;    /* entries in fatab */
  78   99  
  79      -static fa       *mkdfa(uchar *, int);
      100 +static fa       *mkdfa(const uchar *, int);
  80  101  static int      makeinit(fa *, int);
  81  102  static void     penter(Node *);
  82  103  static void     freetr(Node *);
  83      -static void     overflo(char *);
      104 +static void     overflo(const char *);
  84  105  static void     cfoll(fa *, Node *);
  85  106  static void     follow(Node *);
  86      -static Node     *reparse(uchar *);
      107 +static Node     *reparse(const uchar *);
  87  108  static int      relex(void);
  88  109  static void     freefa(fa *);
  89  110  static int      cgoto(fa *, int, int);
      111 +static Node     *regexp(void);
      112 +static Node     *primary(void);
      113 +static Node     *concat(Node *);
      114 +static Node     *alt(Node *);
      115 +static Node     *unary(Node *);
  90  116  
  91  117  fa *
  92      -makedfa(uchar *s, int anchor)   /* returns dfa for reg expr s */
      118 +makedfa(const uchar *s, int anchor)     /* returns dfa for reg expr s */
  93  119  {
  94  120          int i, use, nuse;
  95  121          fa *pfa;
      122 +        static int now = 1;
      123 +
      124 +        if (setvec == 0) {      /* first time through any RE */
      125 +                maxsetvec = MAXLIN;
      126 +                setvec = (int *)malloc(maxsetvec * sizeof (int));
      127 +                tmpset = (int *)malloc(maxsetvec * sizeof (int));
      128 +                if (setvec == 0 || tmpset == 0)
      129 +                        overflo("out of space initializing makedfa");
      130 +        }
  96  131  
  97  132          if (compile_time)       /* a constant for sure */
  98  133                  return (mkdfa(s, anchor));
  99  134          for (i = 0; i < nfatab; i++) {  /* is it there already? */
 100  135                  if (fatab[i]->anchor == anchor &&
 101      -                    strcmp((char *)fatab[i]->restr, (char *)s) == 0) {
 102      -                        fatab[i]->use++;
      136 +                    strcmp((const char *)fatab[i]->restr, (char *)s) == 0) {
      137 +                        fatab[i]->use = now++;
 103  138                          return (fatab[i]);
 104  139                  }
 105  140          }
 106  141          pfa = mkdfa(s, anchor);
 107  142          if (nfatab < NFA) {     /* room for another */
 108  143                  fatab[nfatab] = pfa;
 109      -                fatab[nfatab]->use = 1;
      144 +                fatab[nfatab]->use = now++;
 110  145                  nfatab++;
 111  146                  return (pfa);
 112  147          }
 113  148          use = fatab[0]->use;    /* replace least-recently used */
 114  149          nuse = 0;
 115      -        for (i = 1; i < nfatab; i++)
      150 +        for (i = 1; i < nfatab; i++) {
 116  151                  if (fatab[i]->use < use) {
 117  152                          use = fatab[i]->use;
 118  153                          nuse = i;
 119  154                  }
      155 +        }
 120  156          freefa(fatab[nuse]);
 121  157          fatab[nuse] = pfa;
 122      -        pfa->use = 1;
      158 +        pfa->use = now++;
 123  159          return (pfa);
 124  160  }
 125  161  
 126  162  fa *
 127      -mkdfa(uchar *s, int anchor)     /* does the real work of making a dfa */
 128      -        /* anchor = 1 for anchored matches, else 0 */
 129      -{
      163 +mkdfa(const uchar *s, int anchor)
      164 +{                               /* does the real work of making a dfa */
      165 +                                /* anchor = 1 for anchored matches, else 0 */
 130  166          Node *p, *p1;
 131  167          fa *f;
 132  168  
 133  169          p = reparse(s);
 134  170          p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 135  171                  /* put ALL STAR in front of reg.  exp. */
 136  172          p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
 137  173                  /* put FINAL after reg.  exp. */
 138  174  
 139  175          poscnt = 0;
 140  176          penter(p1);     /* enter parent pointers and leaf indices */
 141  177          if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
 142      -                overflo("no room for fa");
      178 +                overflo("out of space for fa");
 143  179          /* penter has computed number of positions in re */
 144  180          f->accept = poscnt-1;
 145  181          cfoll(f, p1);   /* set up follow sets */
 146  182          freetr(p1);
 147  183          if ((f->posns[0] =
 148  184              (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
 149  185                          overflo("out of space in makedfa");
 150  186          }
 151  187          if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
 152  188                  overflo("out of space in makedfa");
 153  189          *f->posns[1] = 0;
 154  190          f->initstat = makeinit(f, anchor);
 155  191          f->anchor = anchor;
 156  192          f->restr = tostring(s);
 157  193          return (f);
 158  194  }
 159  195  
 160  196  static int
 161  197  makeinit(fa *f, int anchor)
 162  198  {
 163      -        register int i, k;
      199 +        int i, k;
 164  200  
 165  201          f->curstat = 2;
 166  202          f->out[2] = 0;
 167  203          f->reset = 0;
 168  204          k = *(f->re[0].lfollow);
 169  205          xfree(f->posns[2]);
 170  206          if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
 171  207                  overflo("out of space in makeinit");
 172  208          for (i = 0; i <= k; i++) {
 173  209                  (f->posns[2])[i] = (f->re[0].lfollow)[i];
↓ open down ↓ 13 lines elided ↑ open up ↑
 187  223                  if (f->curstat != 2)
 188  224                          --(*f->posns[f->curstat]);
 189  225          }
 190  226          return (f->curstat);
 191  227  }
 192  228  
 193  229  void
 194  230  penter(Node *p) /* set up parent pointers and leaf indices */
 195  231  {
 196  232          switch (type(p)) {
      233 +        ELEAF
 197  234          LEAF
 198      -                left(p) = (Node *) poscnt;
 199      -                point[poscnt++] = p;
      235 +                info(p) = poscnt;
      236 +                poscnt++;
 200  237                  break;
 201  238          UNARY
 202  239                  penter(left(p));
 203  240                  parent(left(p)) = p;
 204  241                  break;
 205  242          case CAT:
 206  243          case OR:
 207  244                  penter(left(p));
 208  245                  penter(right(p));
 209  246                  parent(left(p)) = p;
 210  247                  parent(right(p)) = p;
 211  248                  break;
 212      -        default:
 213      -                ERROR "unknown type %d in penter", type(p) FATAL;
      249 +        default:        /* can't happen */
      250 +                FATAL("can't happen: unknown type %d in penter", type(p));
 214  251                  break;
 215  252          }
 216  253  }
 217  254  
 218  255  static void
 219  256  freetr(Node *p) /* free parse tree */
 220  257  {
 221  258          switch (type(p)) {
      259 +        ELEAF
 222  260          LEAF
 223  261                  xfree(p);
 224  262                  break;
 225  263          UNARY
 226  264                  freetr(left(p));
 227  265                  xfree(p);
 228  266                  break;
 229  267          case CAT:
 230  268          case OR:
 231  269                  freetr(left(p));
 232  270                  freetr(right(p));
 233  271                  xfree(p);
 234  272                  break;
 235      -        default:
 236      -                ERROR "unknown type %d in freetr", type(p) FATAL;
      273 +        default:        /* can't happen */
      274 +                FATAL("can't happen: unknown type %d in freetr", type(p));
 237  275                  break;
 238  276          }
 239  277  }
 240  278  
      279 +/*
      280 + * in the parsing of regular expressions, metacharacters like . have
      281 + * to be seen literally;  \056 is not a metacharacter.
      282 + */
      283 +int
      284 +hexstr(uchar **pp)      /* find and eval hex string at pp, return new p */
      285 +{                       /* only pick up one 8-bit byte (2 chars) */
      286 +        uchar *p;
      287 +        int n = 0;
      288 +        int i;
      289 +
      290 +        for (i = 0, p = (uchar *)*pp; i < 2 && isxdigit(*p); i++, p++) {
      291 +                if (isdigit(*p))
      292 +                        n = 16 * n + *p - '0';
      293 +                else if (*p >= 'a' && *p <= 'f')
      294 +                        n = 16 * n + *p - 'a' + 10;
      295 +                else if (*p >= 'A' && *p <= 'F')
      296 +                        n = 16 * n + *p - 'A' + 10;
      297 +        }
      298 +        *pp = (uchar *)p;
      299 +        return (n);
      300 +}
      301 +
      302 +/* multiple use of arg */
      303 +#define isoctdigit(c)   ((c) >= '0' && (c) <= '7')
      304 +
      305 +int
      306 +quoted(uchar **pp)      /* pick up next thing after a \\ */
      307 +{                       /* and increment *pp */
      308 +
      309 +        uchar *p = *pp;
      310 +        int c;
      311 +
      312 +        if ((c = *p++) == 't')
      313 +                c = '\t';
      314 +        else if (c == 'n')
      315 +                c = '\n';
      316 +        else if (c == 'f')
      317 +                c = '\f';
      318 +        else if (c == 'r')
      319 +                c = '\r';
      320 +        else if (c == 'b')
      321 +                c = '\b';
      322 +        else if (c == '\\')
      323 +                c = '\\';
      324 +        else if (c == 'x') {    /* hexadecimal goo follows */
      325 +                c = hexstr(&p); /* this adds a null if number is invalid */
      326 +        } else if (isoctdigit(c)) {     /* \d \dd \ddd */
      327 +                int n = c - '0';
      328 +                if (isoctdigit(*p)) {
      329 +                        n = 8 * n + *p++ - '0';
      330 +                        if (isoctdigit(*p))
      331 +                                n = 8 * n + *p++ - '0';
      332 +                }
      333 +                c = n;
      334 +        } /* else */
      335 +                /* c = c; */
      336 +        *pp = p;
      337 +        return (c);
      338 +}
      339 +
 241  340  uchar *
 242      -cclenter(uchar *p)
      341 +cclenter(const uchar *argp)             /* add a character class */
 243  342  {
 244      -        register int i, c;
 245      -        uchar *op, *chars, *ret;
 246      -        size_t  bsize;
      343 +        int i, c, c2;
      344 +        uchar  *p = (uchar *)argp;
      345 +        uchar *op, *bp;
      346 +        static uchar *buf = NULL;
      347 +        size_t  bufsz = 100;
 247  348  
 248      -        init_buf(&chars, &bsize, LINE_INCR);
 249  349          op = p;
 250      -        i = 0;
 251      -        while ((c = *p++) != 0) {
      350 +        if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
      351 +                FATAL("out of space for character class [%.10s...] 1", p);
      352 +        bp = buf;
      353 +        for (i = 0; (c = *p++) != 0; ) {
 252  354                  if (c == '\\') {
 253      -                        if ((c = *p++) == 't')
 254      -                                c = '\t';
 255      -                        else if (c == 'n')
 256      -                                c = '\n';
 257      -                        else if (c == 'f')
 258      -                                c = '\f';
 259      -                        else if (c == 'r')
 260      -                                c = '\r';
 261      -                        else if (c == 'b')
 262      -                                c = '\b';
 263      -                        else if (c == '\\')
 264      -                                c = '\\';
 265      -                        else if (isdigit(c)) {
 266      -                                int n = c - '0';
 267      -                                if (isdigit(*p)) {
 268      -                                        n = 8 * n + *p++ - '0';
 269      -                                        if (isdigit(*p))
 270      -                                                n = 8 * n + *p++ - '0';
 271      -                                }
 272      -                                c = n;
 273      -                        } /* else */
 274      -                                /* c = c; */
 275      -                } else if (c == '-' && i > 0 && chars[i-1] != 0) {
      355 +                        c = quoted(&p);
      356 +                } else if (c == '-' && i > 0 && bp[-1] != 0) {
 276  357                          if (*p != 0) {
 277      -                                c = chars[i-1];
 278      -                                while ((uchar)c < *p) { /* fails if *p is \\ */
 279      -                                        expand_buf(&chars, &bsize, i);
 280      -                                        chars[i++] = ++c;
      358 +                                c = bp[-1];
      359 +                                c2 = *p++;
      360 +                                if (c2 == '\\')
      361 +                                        c2 = quoted(&p);
      362 +                                if (c > c2) {   /* empty; ignore */
      363 +                                        bp--;
      364 +                                        i--;
      365 +                                        continue;
      366 +                                }
      367 +                                while (c < c2) {
      368 +                                        if (adjbuf(&buf, &bufsz,
      369 +                                            bp - buf + 2, 100, &bp,
      370 +                                            "cclenter1") == 0)
      371 +                                                FATAL(
      372 +                        "out of space for character class [%.10s...] 2", p);
      373 +                                        *bp++ = ++c;
      374 +                                        i++;
 281  375                                  }
 282      -                                p++;
 283  376                                  continue;
 284  377                          }
 285  378                  }
 286      -                expand_buf(&chars, &bsize, i);
 287      -                chars[i++] = c;
      379 +                if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
      380 +                        FATAL(
      381 +                "out of space for character class [%.10s...] 3", p);
      382 +                *bp++ = c;
      383 +                i++;
 288  384          }
 289      -        chars[i++] = '\0';
 290      -        dprintf(("cclenter: in = |%s|, out = |%s|\n", op, chars));
      385 +        *bp = 0;
      386 +        dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf));
 291  387          xfree(op);
 292      -        ret = tostring(chars);
 293      -        free(chars);
 294      -        return (ret);
      388 +        return (tostring(buf));
 295  389  }
 296  390  
 297  391  static void
 298      -overflo(char *s)
      392 +overflo(const char *s)
 299  393  {
 300      -        ERROR "regular expression too big: %s", gettext((char *)s) FATAL;
      394 +        FATAL("regular expression too big: %.30s...", gettext((char *)s));
 301  395  }
 302  396  
 303  397  /* enter follow set of each leaf of vertex v into lfollow[leaf] */
 304  398  static void
 305  399  cfoll(fa *f, Node *v)
 306  400  {
 307      -        register int i;
 308      -        register int *p;
      401 +        int i;
      402 +        int *p;
 309  403  
 310  404          switch (type(v)) {
      405 +        ELEAF
 311  406          LEAF
 312      -                f->re[(int)left(v)].ltype = type(v);
 313      -                f->re[(int)left(v)].lval = (int)right(v);
      407 +                f->re[info(v)].ltype = type(v);
      408 +                f->re[info(v)].lval.np = right(v);
      409 +                while (f->accept >= maxsetvec) {        /* guessing here! */
      410 +                        maxsetvec *= 4;
      411 +                        setvec = (int *)realloc(setvec, maxsetvec *
      412 +                            sizeof (int));
      413 +                        tmpset = (int *)realloc(tmpset, maxsetvec *
      414 +                            sizeof (int));
      415 +                        if (setvec == 0 || tmpset == 0)
      416 +                                overflo("out of space in cfoll()");
      417 +                }
 314  418                  for (i = 0; i <= f->accept; i++)
 315  419                          setvec[i] = 0;
 316  420                  setcnt = 0;
 317  421                  follow(v);      /* computes setvec and setcnt */
 318  422                  if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
 319      -                        overflo("follow set overflow");
 320      -                f->re[(int)left(v)].lfollow = p;
      423 +                        overflo("out of space building follow set");
      424 +                f->re[info(v)].lfollow = p;
 321  425                  *p = setcnt;
 322  426                  for (i = f->accept; i >= 0; i--) {
 323  427                          if (setvec[i] == 1)
 324  428                                  *++p = i;
 325  429                  }
 326  430                  break;
 327  431          UNARY
 328  432                  cfoll(f, left(v));
 329  433                  break;
 330  434          case CAT:
 331  435          case OR:
 332  436                  cfoll(f, left(v));
 333  437                  cfoll(f, right(v));
 334  438                  break;
 335      -        default:
 336      -                ERROR "unknown type %d in cfoll", type(v) FATAL;
      439 +        default:        /* can't happen */
      440 +                FATAL("can't happen: unknown type %d in cfoll", type(v));
 337  441          }
 338  442  }
 339  443  
 340  444  /*
 341  445   * collects initially active leaves of p into setvec
 342  446   * returns 0 or 1 depending on whether p matches empty string
 343  447   */
 344  448  static int
 345  449  first(Node *p)
 346  450  {
 347      -        register int b;
      451 +        int b, lp;
 348  452  
 349  453          switch (type(p)) {
      454 +        ELEAF
 350  455          LEAF
 351      -                if (setvec[(int)left(p)] != 1) {
 352      -                        setvec[(int)left(p)] = 1;
      456 +                /* look for high-water mark of subscripts */
      457 +                lp = info(p);
      458 +                /* guessing here! */
      459 +                while (setcnt >= maxsetvec || lp >= maxsetvec) {
      460 +                        maxsetvec *= 4;
      461 +                        setvec = (int *)realloc(setvec, maxsetvec *
      462 +                            sizeof (int));
      463 +                        tmpset = (int *)realloc(tmpset, maxsetvec *
      464 +                            sizeof (int));
      465 +                        if (setvec == 0 || tmpset == 0)
      466 +                                overflo("out of space in first()");
      467 +                }
      468 +                if (type(p) == EMPTYRE) {
      469 +                        setvec[lp] = 0;
      470 +                        return (0);
      471 +                }
      472 +                if (setvec[lp] != 1) {
      473 +                        setvec[lp] = 1;
 353  474                          setcnt++;
 354  475                  }
 355  476                  if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
 356  477                          return (0);             /* empty CCL */
 357  478                  else
 358  479                          return (1);
 359  480          case PLUS:
 360  481                  if (first(left(p)) == 0)
 361  482                          return (0);
 362  483                  return (1);
↓ open down ↓ 4 lines elided ↑ open up ↑
 367  488          case CAT:
 368  489                  if (first(left(p)) == 0 && first(right(p)) == 0)
 369  490                          return (0);
 370  491                  return (1);
 371  492          case OR:
 372  493                  b = first(right(p));
 373  494                  if (first(left(p)) == 0 || b == 0)
 374  495                          return (0);
 375  496                  return (1);
 376  497          }
 377      -        ERROR "unknown type %d in first", type(p) FATAL;
      498 +        /* can't happen */
      499 +        FATAL("can't happen: unknown type %d in first", type(p));
      500 +        /*NOTREACHED*/
 378  501          return (-1);
 379  502  }
 380  503  
 381  504  /* collects leaves that can follow v into setvec */
 382  505  static void
 383  506  follow(Node *v)
 384  507  {
 385  508          Node *p;
 386  509  
 387  510          if (type(v) == FINAL)
↓ open down ↓ 13 lines elided ↑ open up ↑
 401  524  
 402  525          case CAT:
 403  526                  if (v == left(p)) {     /* v is left child of p */
 404  527                          if (first(right(p)) == 0) {
 405  528                                  follow(p);
 406  529                                  return;
 407  530                          }
 408  531                  } else          /* v is right child */
 409  532                          follow(p);
 410  533                  return;
 411      -        default:
 412      -                ERROR "unknown type %d in follow", type(p) FATAL;
 413      -                break;
 414  534          }
 415  535  }
 416  536  
 417  537  static int
 418      -member(uchar c, uchar *s)       /* is c in s? */
      538 +member(int c, const uchar *sarg)        /* is c in s? */
 419  539  {
      540 +        uchar *s = (uchar *)sarg;
      541 +
 420  542          while (*s)
 421  543                  if (c == *s++)
 422  544                          return (1);
 423  545          return (0);
 424  546  }
 425  547  
 426  548  
 427  549  int
 428      -match(fa *f, uchar *p)
      550 +match(fa *f, const uchar *p0)   /* shortest match ? */
 429  551  {
 430      -        register int s, ns;
      552 +        int s, ns;
      553 +        uchar *p = (uchar *)p0;
 431  554  
 432  555          s = f->reset ? makeinit(f, 0) : f->initstat;
 433  556          if (f->out[s])
 434  557                  return (1);
 435  558          do {
      559 +                /* assert(*p < NCHARS); */
 436  560                  if ((ns = f->gototab[s][*p]) != 0)
 437  561                          s = ns;
 438  562                  else
 439  563                          s = cgoto(f, s, *p);
 440  564                  if (f->out[s])
 441  565                          return (1);
 442  566          } while (*p++ != 0);
 443  567          return (0);
 444  568  }
 445  569  
 446  570  int
 447      -pmatch(fa *f, uchar *p)
      571 +pmatch(fa *f, const uchar *p0)          /* longest match, for sub */
 448  572  {
 449      -        register int s, ns;
 450      -        register uchar *q;
      573 +        int s, ns;
      574 +        uchar *p = (uchar *)p0;
      575 +        uchar *q;
 451  576          int i, k;
 452  577  
      578 +        /* s = f->reset ? makeinit(f,1) : f->initstat; */
 453  579          if (f->reset) {
 454  580                  f->initstat = s = makeinit(f, 1);
 455  581          } else {
 456  582                  s = f->initstat;
 457  583          }
 458  584          patbeg = p;
 459  585          patlen = -1;
 460  586          do {
 461  587                  q = p;
 462  588                  do {
 463  589                          if (f->out[s])          /* final state */
 464  590                                  patlen = q-p;
      591 +                        /* assert(*q < NCHARS); */
 465  592                          if ((ns = f->gototab[s][*q]) != 0)
 466  593                                  s = ns;
 467  594                          else
 468  595                                  s = cgoto(f, s, *q);
 469  596                          if (s == 1) {   /* no transition */
 470  597                                  if (patlen >= 0) {
 471  598                                          patbeg = p;
 472  599                                          return (1);
 473  600                                  } else
 474  601                                          goto nextin;    /* no match */
 475  602                          }
 476  603                  } while (*q++ != 0);
 477  604                  if (f->out[s])
 478  605                          patlen = q - p - 1;     /* don't count $ */
 479  606                  if (patlen >= 0) {
 480  607                          patbeg = p;
 481  608                          return (1);
 482  609                  }
 483      -        nextin:
      610 +nextin:
 484  611                  s = 2;
 485  612                  if (f->reset) {
 486  613                          for (i = 2; i <= f->curstat; i++)
 487  614                                  xfree(f->posns[i]);
 488  615                          k = *f->posns[0];
 489  616                          if ((f->posns[2] =
 490  617                              (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 491  618                                  overflo("out of space in pmatch");
 492  619                          }
 493  620                          for (i = 0; i <= k; i++)
↓ open down ↓ 1 lines elided ↑ open up ↑
 495  622                          f->initstat = f->curstat = 2;
 496  623                          f->out[2] = f->out[0];
 497  624                          for (i = 0; i < NCHARS; i++)
 498  625                                  f->gototab[2][i] = 0;
 499  626                  }
 500  627          } while (*p++ != 0);
 501  628          return (0);
 502  629  }
 503  630  
 504  631  int
 505      -nematch(fa *f, uchar *p)
      632 +nematch(fa *f, const uchar *p0)         /* non-empty match, for sub */
 506  633  {
 507      -        register int s, ns;
 508      -        register uchar *q;
      634 +        int s, ns;
      635 +        uchar *p = (uchar *)p0;
      636 +        uchar *q;
 509  637          int i, k;
 510  638  
      639 +        /* s = f->reset ? makeinit(f,1) : f->initstat; */
 511  640          if (f->reset) {
 512  641                  f->initstat = s = makeinit(f, 1);
 513  642          } else {
 514  643                  s = f->initstat;
 515  644          }
 516  645          patlen = -1;
 517  646          while (*p) {
 518  647                  q = p;
 519  648                  do {
 520  649                          if (f->out[s])          /* final state */
 521      -                                patlen = q-p;
      650 +                                patlen = q - p;
      651 +                        /* assert(*q < NCHARS); */
 522  652                          if ((ns = f->gototab[s][*q]) != 0)
 523  653                                  s = ns;
 524  654                          else
 525  655                                  s = cgoto(f, s, *q);
 526  656                          if (s == 1) {   /* no transition */
 527  657                                  if (patlen > 0) {
 528  658                                          patbeg = p;
 529  659                                          return (1);
 530  660                                  } else
 531  661                                          goto nnextin;   /* no nonempty match */
 532  662                          }
 533  663                  } while (*q++ != 0);
 534  664                  if (f->out[s])
 535  665                          patlen = q - p - 1;     /* don't count $ */
 536  666                  if (patlen > 0) {
 537  667                          patbeg = p;
 538  668                          return (1);
 539  669                  }
 540      -        nnextin:
      670 +nnextin:
 541  671                  s = 2;
 542  672                  if (f->reset) {
 543  673                          for (i = 2; i <= f->curstat; i++)
 544  674                                  xfree(f->posns[i]);
 545  675                          k = *f->posns[0];
 546  676                          if ((f->posns[2] =
 547  677                              (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
 548  678                                  overflo("out of state space");
 549  679                          }
 550  680                          for (i = 0; i <= k; i++)
↓ open down ↓ 1 lines elided ↑ open up ↑
 552  682                          f->initstat = f->curstat = 2;
 553  683                          f->out[2] = f->out[0];
 554  684                          for (i = 0; i < NCHARS; i++)
 555  685                                  f->gototab[2][i] = 0;
 556  686                  }
 557  687                  p++;
 558  688          }
 559  689          return (0);
 560  690  }
 561  691  
 562      -static Node *regexp(void), *primary(void), *concat(Node *);
 563      -static Node *alt(Node *), *unary(Node *);
 564      -
 565  692  static Node *
 566      -reparse(uchar *p)
      693 +reparse(const uchar *p)
 567  694  {
 568  695          /* parses regular expression pointed to by p */
 569  696          /* uses relex() to scan regular expression */
 570  697          Node *np;
 571  698  
 572  699          dprintf(("reparse <%s>\n", p));
 573      -        lastre = prestr = p;    /* prestr points to string to be parsed */
      700 +        /* prestr points to string to be parsed */
      701 +        lastre = prestr = (uchar *)p;
 574  702          rtok = relex();
 575      -        if (rtok == '\0')
 576      -                ERROR "empty regular expression" FATAL;
 577      -        np = regexp();
      703 +        /* GNU compatibility: an empty regexp matches anything */
 578  704          if (rtok == '\0') {
 579      -                return (np);
 580      -        } else {
 581      -                ERROR "syntax error in regular expression %s at %s",
 582      -                    lastre, prestr FATAL;
      705 +                /* FATAL("empty regular expression"); previous */
      706 +                return (op2(EMPTYRE, NIL, NIL));
 583  707          }
 584      -        /*NOTREACHED*/
 585      -        return (NULL);
      708 +        np = regexp();
      709 +        if (rtok != '\0') {
      710 +                FATAL("syntax error in regular expression %s at %s",
      711 +                    lastre, prestr);
      712 +        }
      713 +        return (np);
 586  714  }
 587  715  
 588  716  static Node *
 589      -regexp(void)
      717 +regexp(void)    /* top-level parse of reg expr */
 590  718  {
 591  719          return (alt(concat(primary())));
 592  720  }
 593  721  
 594  722  static Node *
 595  723  primary(void)
 596  724  {
 597  725          Node *np;
 598  726  
 599  727          switch (rtok) {
 600  728          case CHAR:
 601      -                np = op2(CHAR, NIL, (Node *)rlxval);
      729 +                np = op2(CHAR, NIL, itonp(rlxval));
 602  730                  rtok = relex();
 603  731                  return (unary(np));
 604  732          case ALL:
 605  733                  rtok = relex();
 606  734                  return (unary(op2(ALL, NIL, NIL)));
      735 +        case EMPTYRE:
      736 +                rtok = relex();
      737 +                return (unary(op2(ALL, NIL, NIL)));
 607  738          case DOT:
 608  739                  rtok = relex();
 609  740                  return (unary(op2(DOT, NIL, NIL)));
 610  741          case CCL:
 611  742                  /*LINTED align*/
 612  743                  np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
 613  744                  rtok = relex();
 614  745                  return (unary(np));
 615  746          case NCCL:
 616  747                  /*LINTED align*/
 617  748                  np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
 618  749                  rtok = relex();
 619  750                  return (unary(np));
 620  751          case '^':
 621  752                  rtok = relex();
 622      -                return (unary(op2(CHAR, NIL, (Node *)HAT)));
      753 +                return (unary(op2(CHAR, NIL, itonp(HAT))));
 623  754          case '$':
 624  755                  rtok = relex();
 625  756                  return (unary(op2(CHAR, NIL, NIL)));
 626  757          case '(':
 627  758                  rtok = relex();
 628  759                  if (rtok == ')') {      /* special pleading for () */
 629  760                          rtok = relex();
 630  761                          return (unary(op2(CCL, NIL,
 631  762                              /*LINTED align*/
 632  763                              (Node *)tostring((uchar *)""))));
 633  764                  }
 634  765                  np = regexp();
 635  766                  if (rtok == ')') {
 636  767                          rtok = relex();
 637  768                          return (unary(np));
 638  769                  } else {
 639      -                        ERROR "syntax error in regular expression %s at %s",
 640      -                            lastre, prestr FATAL;
      770 +                        FATAL("syntax error in regular expression %s at %s",
      771 +                            lastre, prestr);
 641  772                  }
 642  773          default:
 643      -                ERROR "illegal primary in regular expression %s at %s",
 644      -                    lastre, prestr FATAL;
      774 +                FATAL("illegal primary in regular expression %s at %s",
      775 +                    lastre, prestr);
 645  776          }
 646  777          /*NOTREACHED*/
 647  778          return (NULL);
 648  779  }
 649  780  
 650  781  static Node *
 651  782  concat(Node *np)
 652  783  {
 653  784          switch (rtok) {
 654      -        case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
      785 +        case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL:
      786 +        case '$': case '(':
 655  787                  return (concat(op2(CAT, np, primary())));
 656  788          default:
 657  789                  return (np);
 658  790          }
 659  791  }
 660  792  
 661  793  static Node *
 662  794  alt(Node *np)
 663  795  {
 664  796          if (rtok == OR) {
↓ open down ↓ 14 lines elided ↑ open up ↑
 679  811                  rtok = relex();
 680  812                  return (unary(op2(PLUS, np, NIL)));
 681  813          case QUEST:
 682  814                  rtok = relex();
 683  815                  return (unary(op2(QUEST, np, NIL)));
 684  816          default:
 685  817                  return (np);
 686  818          }
 687  819  }
 688  820  
      821 +/*
      822 + * Character class definitions conformant to the POSIX locale as
      823 + * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
      824 + * and operating character sets are both ASCII (ISO646) or supersets
      825 + * thereof.
      826 + *
      827 + * Note that to avoid overflowing the temporary buffer used in
      828 + * relex(), the expanded character class (prior to range expansion)
      829 + * must be less than twice the size of their full name.
      830 + */
      831 +
      832 +/*
      833 + * Because isblank doesn't show up in any of the header files on any
      834 + * system i use, it's defined here.  if some other locale has a richer
      835 + * definition of "blank", define HAS_ISBLANK and provide your own
      836 + * version.
      837 + * the parentheses here are an attempt to find a path through the maze
      838 + * of macro definition and/or function and/or version provided.  thanks
      839 + * to nelson beebe for the suggestion; let's see if it works everywhere.
      840 + */
      841 +
      842 +/* #define HAS_ISBLANK */
      843 +#ifndef HAS_ISBLANK
      844 +
      845 +int
      846 +(xisblank)(int c)
      847 +{
      848 +        return (c == ' ' || c == '\t');
      849 +}
      850 +
      851 +#endif
      852 +
      853 +struct charclass {
      854 +        const char *cc_name;
      855 +        int cc_namelen;
      856 +        int (*cc_func)(int);
      857 +} charclasses[] = {
      858 +        { "alnum",      5,      isalnum },
      859 +        { "alpha",      5,      isalpha },
      860 +#ifndef HAS_ISBLANK
      861 +        { "blank",      5,      isspace }, /* was isblank */
      862 +#else
      863 +        { "blank",      5,      isblank },
      864 +#endif
      865 +        { "cntrl",      5,      iscntrl },
      866 +        { "digit",      5,      isdigit },
      867 +        { "graph",      5,      isgraph },
      868 +        { "lower",      5,      islower },
      869 +        { "print",      5,      isprint },
      870 +        { "punct",      5,      ispunct },
      871 +        { "space",      5,      isspace },
      872 +        { "upper",      5,      isupper },
      873 +        { "xdigit",     6,      isxdigit },
      874 +        { NULL,         0,      NULL },
      875 +};
      876 +
 689  877  static int
 690  878  relex(void)             /* lexical analyzer for reparse */
 691  879  {
 692      -        register int c;
 693      -        uchar *cbuf;
 694      -        int clen, cflag;
      880 +        int c, n;
      881 +        int cflag;
      882 +        uchar *buf = 0;
      883 +        size_t bufsz = 100;
      884 +        uchar *bp;
      885 +        struct charclass *cc;
      886 +        int i;
 695  887  
 696  888          switch (c = *prestr++) {
 697  889          case '|': return OR;
 698  890          case '*': return STAR;
 699  891          case '+': return PLUS;
 700  892          case '?': return QUEST;
 701  893          case '.': return DOT;
 702  894          case '\0': prestr--; return '\0';
 703  895          case '^':
 704  896          case '$':
 705  897          case '(':
 706  898          case ')':
 707  899                  return (c);
 708  900          case '\\':
 709      -                if ((c = *prestr++) == 't')
 710      -                        c = '\t';
 711      -                else if (c == 'n')
 712      -                        c = '\n';
 713      -                else if (c == 'f')
 714      -                        c = '\f';
 715      -                else if (c == 'r')
 716      -                        c = '\r';
 717      -                else if (c == 'b')
 718      -                        c = '\b';
 719      -                else if (c == '\\')
 720      -                        c = '\\';
 721      -                else if (isdigit(c)) {
 722      -                        int n = c - '0';
 723      -                        if (isdigit(*prestr)) {
 724      -                                n = 8 * n + *prestr++ - '0';
 725      -                                if (isdigit(*prestr))
 726      -                                        n = 8 * n + *prestr++ - '0';
 727      -                        }
 728      -                        c = n;
 729      -                } /* else it's now in c */
 730      -                rlxval = c;
      901 +                rlxval = quoted(&prestr);
 731  902                  return (CHAR);
 732  903          default:
 733  904                  rlxval = c;
 734  905                  return (CHAR);
 735  906          case '[':
 736      -                clen = 0;
      907 +                if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
      908 +                        FATAL("out of space in reg expr %.10s..", lastre);
      909 +                bp = buf;
 737  910                  if (*prestr == '^') {
 738  911                          cflag = 1;
 739  912                          prestr++;
 740  913                  } else
 741  914                          cflag = 0;
 742      -                init_buf(&cbuf, NULL, strlen((char *)prestr) * 2 + 1);
      915 +                n = 2 * strlen((const char *) prestr)+1;
      916 +                if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
      917 +                        FATAL("out of space for reg expr %.10s...", lastre);
 743  918                  for (;;) {
 744  919                          if ((c = *prestr++) == '\\') {
 745      -                                cbuf[clen++] = '\\';
      920 +                                *bp++ = '\\';
 746  921                                  if ((c = *prestr++) == '\0') {
 747      -                                        ERROR
 748      -                        "nonterminated character class %s", lastre FATAL;
      922 +                                        FATAL(
      923 +                        "nonterminated character class %s", lastre);
 749  924                                  }
 750      -                                cbuf[clen++] = c;
      925 +                                *bp++ = c;
      926 +                        } else if (c == '[' && *prestr == ':') {
      927 +                                /*
      928 +                                 * POSIX char class names, Dag-Erling
      929 +                                 * Smorgrav, des@ofug.org
      930 +                                 */
      931 +                                for (cc = charclasses; cc->cc_name; cc++)
      932 +                                        if (strncmp((const char *) prestr + 1,
      933 +                                            (const char *) cc->cc_name,
      934 +                                            cc->cc_namelen) == 0)
      935 +                                                break;
      936 +                                if (cc->cc_name != NULL &&
      937 +                                    prestr[1 + cc->cc_namelen] == ':' &&
      938 +                                    prestr[2 + cc->cc_namelen] == ']') {
      939 +                                        prestr += cc->cc_namelen + 3;
      940 +                                        for (i = 0; i < NCHARS; i++) {
      941 +                                                if (!adjbuf(&buf, &bufsz,
      942 +                                                    bp - buf + 1, 100, &bp,
      943 +                                                    "relex2"))
      944 +                                                        FATAL(
      945 +                                "out of space for reg expr %.10s...", lastre);
      946 +                                                if (cc->cc_func(i)) {
      947 +                                                        *bp++ = i;
      948 +                                                        n++;
      949 +                                                }
      950 +                                        }
      951 +                                } else
      952 +                                        *bp++ = c;
      953 +                        } else if (c == '\0') {
      954 +                                FATAL("nonterminated character class %.20s",
      955 +                                    lastre);
      956 +                        } else if (bp == buf) { /* 1st char is special */
      957 +                                *bp++ = c;
 751  958                          } else if (c == ']') {
 752      -                                cbuf[clen] = 0;
 753      -                                rlxstr = tostring(cbuf);
 754      -                                free(cbuf);
      959 +                                *bp++ = 0;
      960 +                                rlxstr = tostring(buf);
 755  961                                  if (cflag == 0)
 756  962                                          return (CCL);
 757  963                                  else
 758  964                                          return (NCCL);
 759      -                        } else if (c == '\n') {
 760      -                                ERROR "newline in character class %s...",
 761      -                                    lastre FATAL;
 762      -                        } else if (c == '\0') {
 763      -                                ERROR "nonterminated character class %s",
 764      -                                    lastre FATAL;
 765  965                          } else
 766      -                                cbuf[clen++] = c;
      966 +                                *bp++ = c;
 767  967                  }
 768  968                  /*NOTREACHED*/
 769  969          }
 770  970          return (0);
 771  971  }
 772  972  
 773  973  static int
 774  974  cgoto(fa *f, int s, int c)
 775  975  {
 776      -        register int i, j, k;
 777      -        register int *p, *q;
      976 +        int i, j, k;
      977 +        int *p, *q;
 778  978  
      979 +        assert(c == HAT || c < NCHARS);
      980 +        while (f->accept >= maxsetvec) {        /* guessing here! */
      981 +                maxsetvec *= 4;
      982 +                setvec = (int *)realloc(setvec, maxsetvec * sizeof (int));
      983 +                tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int));
      984 +                if (setvec == 0 || tmpset == 0)
      985 +                        overflo("out of space in cgoto()");
      986 +        }
 779  987          for (i = 0; i <= f->accept; i++)
 780  988                  setvec[i] = 0;
 781  989          setcnt = 0;
 782  990          /* compute positions of gototab[s,c] into setvec */
 783  991          p = f->posns[s];
 784  992          for (i = 1; i <= *p; i++) {
 785  993                  if ((k = f->re[p[i]].ltype) != FINAL) {
 786      -                        if (k == CHAR && c == f->re[p[i]].lval ||
 787      -                            k == DOT && c != 0 && c != HAT ||
 788      -                            k == ALL && c != 0 ||
 789      -                            k == CCL &&
 790      -                            member(c, (uchar *)f->re[p[i]].lval) ||
 791      -                            k == NCCL &&
 792      -                            !member(c, (uchar *)f->re[p[i]].lval) &&
 793      -                            c != 0 && c != HAT) {
      994 +                        if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) ||
      995 +                            (k == DOT && c != 0 && c != HAT) ||
      996 +                            (k == ALL && c != 0) ||
      997 +                            (k == EMPTYRE && c != 0) ||
      998 +                            (k == CCL &&
      999 +                            member(c, (uchar *)f->re[p[i]].lval.up)) ||
     1000 +                            ((k == NCCL &&
     1001 +                            !member(c, (uchar *)f->re[p[i]].lval.up)) &&
     1002 +                            c != 0 && c != HAT)) {
 794 1003                                  q = f->re[p[i]].lfollow;
 795 1004                                  for (j = 1; j <= *q; j++) {
     1005 +                                        if (q[j] >= maxsetvec) {
     1006 +                                                maxsetvec *= 4;
     1007 +                                                setvec = (int *)realloc(setvec,
     1008 +                                                    maxsetvec * sizeof (int));
     1009 +                                                tmpset = (int *)realloc(tmpset,
     1010 +                                                    maxsetvec * sizeof (int));
     1011 +                                                if (setvec == 0 ||
     1012 +                                                    tmpset == 0)
     1013 +                                                        overflo(
     1014 +                                                "cgoto overflow");
     1015 +                                        }
 796 1016                                          if (setvec[q[j]] == 0) {
 797 1017                                                  setcnt++;
 798 1018                                                  setvec[q[j]] = 1;
 799 1019                                          }
 800 1020                                  }
 801 1021                          }
 802 1022                  }
 803 1023          }
 804 1024          /* determine if setvec is a previous state */
 805 1025          tmpset[0] = setcnt;
↓ open down ↓ 10 lines elided ↑ open up ↑
 816 1036                  for (j = 1; j <= k; j++)
 817 1037                          if (tmpset[j] != p[j])
 818 1038                                  goto different;
 819 1039                  /* setvec is state i */
 820 1040                  f->gototab[s][c] = i;
 821 1041                  return (i);
 822 1042          different:;
 823 1043          }
 824 1044  
 825 1045          /* add tmpset to current set of states */
 826      -        if (f->curstat >= NSTATES-1) {
     1046 +        if (f->curstat >= NSTATES - 1) {
 827 1047                  f->curstat = 2;
 828 1048                  f->reset = 1;
 829 1049                  for (i = 2; i < NSTATES; i++)
 830 1050                          xfree(f->posns[i]);
 831 1051          } else
 832 1052                  ++(f->curstat);
 833 1053          for (i = 0; i < NCHARS; i++)
 834 1054                  f->gototab[f->curstat][i] = 0;
 835 1055          xfree(f->posns[f->curstat]);
 836 1056          if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
↓ open down ↓ 4 lines elided ↑ open up ↑
 841 1061          for (i = 0; i <= setcnt; i++)
 842 1062                  p[i] = tmpset[i];
 843 1063          if (setvec[f->accept])
 844 1064                  f->out[f->curstat] = 1;
 845 1065          else
 846 1066                  f->out[f->curstat] = 0;
 847 1067          return (f->curstat);
 848 1068  }
 849 1069  
 850 1070  static void
 851      -freefa(fa *f)
     1071 +freefa(fa *f)   /* free a finite automaton */
 852 1072  {
 853      -
 854      -        register int i;
     1073 +        int i;
 855 1074  
 856 1075          if (f == NULL)
 857 1076                  return;
 858 1077          for (i = 0; i <= f->curstat; i++)
 859 1078                  xfree(f->posns[i]);
 860      -        for (i = 0; i <= f->accept; i++)
     1079 +        for (i = 0; i <= f->accept; i++) {
 861 1080                  xfree(f->re[i].lfollow);
     1081 +                if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
     1082 +                        xfree((f->re[i].lval.np));
     1083 +        }
 862 1084          xfree(f->restr);
 863 1085          xfree(f);
 864 1086  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX