1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
25 * Use is subject to license terms.
26 */
27
28 /*
29 * Copyright (C) Lucent Technologies 1997
30 * All Rights Reserved
31 *
32 * Permission to use, copy, modify, and distribute this software and
33 * its documentation for any purpose and without fee is hereby
34 * granted, provided that the above copyright notice appear in all
35 * copies and that both that the copyright notice and this
36 * permission notice and warranty disclaimer appear in supporting
37 * documentation, and that the name Lucent Technologies or any of
38 * its entities not be used in advertising or publicity pertaining
39 * to distribution of the software without specific, written prior
40 * permission.
41 *
42 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
43 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
44 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
45 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
46 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
47 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
48 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
49 * THIS SOFTWARE.
50 */
51
52 #define DEBUG
53
54 #include "awk.h"
55 #include "y.tab.h"
56
57 #define HAT (NCHARS+2) /* matches ^ in regular expr */
58 /* NCHARS is 2**n */
59 #define MAXLIN 22
60
61 #define type(v) (v)->nobj /* badly overloaded here */
62 #define info(v) (v)->ntype /* badly overloaded here */
63 #define left(v) (v)->narg[0]
64 #define right(v) (v)->narg[1]
65 #define parent(v) (v)->nnext
66
67 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
68 #define ELEAF case EMPTYRE: /* empty string in regexp */
69 #define UNARY case STAR: case PLUS: case QUEST:
70
71 /*
72 * encoding in tree Nodes:
73 * leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
74 * left is index, right contains value or pointer to value
75 * unary (STAR, PLUS, QUEST): left is child, right is null
76 * binary (CAT, OR): left and right are children
77 * parent contains pointer to parent
78 */
79
80 int *setvec;
81 int *tmpset;
82 int maxsetvec = 0;
83
84 int rtok; /* next token in current re */
85 int rlxval;
86 static uchar *rlxstr;
87 static uchar *prestr; /* current position in current re */
88 static uchar *lastre; /* origin of last re */
89
90 static int setcnt;
91 static int poscnt;
92
93 uchar *patbeg;
94 int patlen;
95
96 #define NFA 20 /* cache this many dynamic fa's */
97 fa *fatab[NFA];
98 int nfatab = 0; /* entries in fatab */
99
100 static fa *mkdfa(const uchar *, int);
101 static int makeinit(fa *, int);
102 static void penter(Node *);
103 static void freetr(Node *);
104 static void overflo(const char *);
105 static void cfoll(fa *, Node *);
106 static void follow(Node *);
107 static Node *reparse(const uchar *);
108 static int relex(void);
109 static void freefa(fa *);
110 static int cgoto(fa *, int, int);
111 static Node *regexp(void);
112 static Node *primary(void);
113 static Node *concat(Node *);
114 static Node *alt(Node *);
115 static Node *unary(Node *);
116
117 fa *
118 makedfa(const uchar *s, int anchor) /* returns dfa for reg expr s */
119 {
120 int i, use, nuse;
121 fa *pfa;
122 static int now = 1;
123
124 if (setvec == 0) { /* first time through any RE */
125 maxsetvec = MAXLIN;
126 setvec = (int *)malloc(maxsetvec * sizeof (int));
127 tmpset = (int *)malloc(maxsetvec * sizeof (int));
128 if (setvec == 0 || tmpset == 0)
129 overflo("out of space initializing makedfa");
130 }
131
132 if (compile_time) /* a constant for sure */
133 return (mkdfa(s, anchor));
134 for (i = 0; i < nfatab; i++) { /* is it there already? */
135 if (fatab[i]->anchor == anchor &&
136 strcmp((const char *)fatab[i]->restr, (char *)s) == 0) {
137 fatab[i]->use = now++;
138 return (fatab[i]);
139 }
140 }
141 pfa = mkdfa(s, anchor);
142 if (nfatab < NFA) { /* room for another */
143 fatab[nfatab] = pfa;
144 fatab[nfatab]->use = now++;
145 nfatab++;
146 return (pfa);
147 }
148 use = fatab[0]->use; /* replace least-recently used */
149 nuse = 0;
150 for (i = 1; i < nfatab; i++) {
151 if (fatab[i]->use < use) {
152 use = fatab[i]->use;
153 nuse = i;
154 }
155 }
156 freefa(fatab[nuse]);
157 fatab[nuse] = pfa;
158 pfa->use = now++;
159 return (pfa);
160 }
161
162 fa *
163 mkdfa(const uchar *s, int anchor)
164 { /* does the real work of making a dfa */
165 /* anchor = 1 for anchored matches, else 0 */
166 Node *p, *p1;
167 fa *f;
168
169 p = reparse(s);
170 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
171 /* put ALL STAR in front of reg. exp. */
172 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
173 /* put FINAL after reg. exp. */
174
175 poscnt = 0;
176 penter(p1); /* enter parent pointers and leaf indices */
177 if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
178 overflo("out of space for fa");
179 /* penter has computed number of positions in re */
180 f->accept = poscnt-1;
181 cfoll(f, p1); /* set up follow sets */
182 freetr(p1);
183 if ((f->posns[0] =
184 (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
185 overflo("out of space in makedfa");
186 }
187 if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
188 overflo("out of space in makedfa");
189 *f->posns[1] = 0;
190 f->initstat = makeinit(f, anchor);
191 f->anchor = anchor;
192 f->restr = tostring(s);
193 return (f);
194 }
195
196 static int
197 makeinit(fa *f, int anchor)
198 {
199 int i, k;
200
201 f->curstat = 2;
202 f->out[2] = 0;
203 f->reset = 0;
204 k = *(f->re[0].lfollow);
205 xfree(f->posns[2]);
206 if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
207 overflo("out of space in makeinit");
208 for (i = 0; i <= k; i++) {
209 (f->posns[2])[i] = (f->re[0].lfollow)[i];
210 }
211 if ((f->posns[2])[1] == f->accept)
212 f->out[2] = 1;
213 for (i = 0; i < NCHARS; i++)
214 f->gototab[2][i] = 0;
215 f->curstat = cgoto(f, 2, HAT);
216 if (anchor) {
217 *f->posns[2] = k-1; /* leave out position 0 */
218 for (i = 0; i < k; i++) {
219 (f->posns[0])[i] = (f->posns[2])[i];
220 }
221
222 f->out[0] = f->out[2];
223 if (f->curstat != 2)
224 --(*f->posns[f->curstat]);
225 }
226 return (f->curstat);
227 }
228
229 void
230 penter(Node *p) /* set up parent pointers and leaf indices */
231 {
232 switch (type(p)) {
233 ELEAF
234 LEAF
235 info(p) = poscnt;
236 poscnt++;
237 break;
238 UNARY
239 penter(left(p));
240 parent(left(p)) = p;
241 break;
242 case CAT:
243 case OR:
244 penter(left(p));
245 penter(right(p));
246 parent(left(p)) = p;
247 parent(right(p)) = p;
248 break;
249 default: /* can't happen */
250 FATAL("can't happen: unknown type %d in penter", type(p));
251 break;
252 }
253 }
254
255 static void
256 freetr(Node *p) /* free parse tree */
257 {
258 switch (type(p)) {
259 ELEAF
260 LEAF
261 xfree(p);
262 break;
263 UNARY
264 freetr(left(p));
265 xfree(p);
266 break;
267 case CAT:
268 case OR:
269 freetr(left(p));
270 freetr(right(p));
271 xfree(p);
272 break;
273 default: /* can't happen */
274 FATAL("can't happen: unknown type %d in freetr", type(p));
275 break;
276 }
277 }
278
279 /*
280 * in the parsing of regular expressions, metacharacters like . have
281 * to be seen literally; \056 is not a metacharacter.
282 */
283 int
284 hexstr(uchar **pp) /* find and eval hex string at pp, return new p */
285 { /* only pick up one 8-bit byte (2 chars) */
286 uchar *p;
287 int n = 0;
288 int i;
289
290 for (i = 0, p = (uchar *)*pp; i < 2 && isxdigit(*p); i++, p++) {
291 if (isdigit(*p))
292 n = 16 * n + *p - '0';
293 else if (*p >= 'a' && *p <= 'f')
294 n = 16 * n + *p - 'a' + 10;
295 else if (*p >= 'A' && *p <= 'F')
296 n = 16 * n + *p - 'A' + 10;
297 }
298 *pp = (uchar *)p;
299 return (n);
300 }
301
302 /* multiple use of arg */
303 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')
304
305 int
306 quoted(uchar **pp) /* pick up next thing after a \\ */
307 { /* and increment *pp */
308
309 uchar *p = *pp;
310 int c;
311
312 if ((c = *p++) == 't')
313 c = '\t';
314 else if (c == 'n')
315 c = '\n';
316 else if (c == 'f')
317 c = '\f';
318 else if (c == 'r')
319 c = '\r';
320 else if (c == 'b')
321 c = '\b';
322 else if (c == '\\')
323 c = '\\';
324 else if (c == 'x') { /* hexadecimal goo follows */
325 c = hexstr(&p); /* this adds a null if number is invalid */
326 } else if (isoctdigit(c)) { /* \d \dd \ddd */
327 int n = c - '0';
328 if (isoctdigit(*p)) {
329 n = 8 * n + *p++ - '0';
330 if (isoctdigit(*p))
331 n = 8 * n + *p++ - '0';
332 }
333 c = n;
334 } /* else */
335 /* c = c; */
336 *pp = p;
337 return (c);
338 }
339
340 uchar *
341 cclenter(const uchar *argp) /* add a character class */
342 {
343 int i, c, c2;
344 uchar *p = (uchar *)argp;
345 uchar *op, *bp;
346 static uchar *buf = NULL;
347 size_t bufsz = 100;
348
349 op = p;
350 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
351 FATAL("out of space for character class [%.10s...] 1", p);
352 bp = buf;
353 for (i = 0; (c = *p++) != 0; ) {
354 if (c == '\\') {
355 c = quoted(&p);
356 } else if (c == '-' && i > 0 && bp[-1] != 0) {
357 if (*p != 0) {
358 c = bp[-1];
359 c2 = *p++;
360 if (c2 == '\\')
361 c2 = quoted(&p);
362 if (c > c2) { /* empty; ignore */
363 bp--;
364 i--;
365 continue;
366 }
367 while (c < c2) {
368 if (adjbuf(&buf, &bufsz,
369 bp - buf + 2, 100, &bp,
370 "cclenter1") == 0)
371 FATAL(
372 "out of space for character class [%.10s...] 2", p);
373 *bp++ = ++c;
374 i++;
375 }
376 continue;
377 }
378 }
379 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
380 FATAL(
381 "out of space for character class [%.10s...] 3", p);
382 *bp++ = c;
383 i++;
384 }
385 *bp = 0;
386 dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf));
387 xfree(op);
388 return (tostring(buf));
389 }
390
391 static void
392 overflo(const char *s)
393 {
394 FATAL("regular expression too big: %.30s...", gettext((char *)s));
395 }
396
397 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
398 static void
399 cfoll(fa *f, Node *v)
400 {
401 int i;
402 int *p;
403
404 switch (type(v)) {
405 ELEAF
406 LEAF
407 f->re[info(v)].ltype = type(v);
408 f->re[info(v)].lval.np = right(v);
409 while (f->accept >= maxsetvec) { /* guessing here! */
410 maxsetvec *= 4;
411 setvec = (int *)realloc(setvec, maxsetvec *
412 sizeof (int));
413 tmpset = (int *)realloc(tmpset, maxsetvec *
414 sizeof (int));
415 if (setvec == 0 || tmpset == 0)
416 overflo("out of space in cfoll()");
417 }
418 for (i = 0; i <= f->accept; i++)
419 setvec[i] = 0;
420 setcnt = 0;
421 follow(v); /* computes setvec and setcnt */
422 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
423 overflo("out of space building follow set");
424 f->re[info(v)].lfollow = p;
425 *p = setcnt;
426 for (i = f->accept; i >= 0; i--) {
427 if (setvec[i] == 1)
428 *++p = i;
429 }
430 break;
431 UNARY
432 cfoll(f, left(v));
433 break;
434 case CAT:
435 case OR:
436 cfoll(f, left(v));
437 cfoll(f, right(v));
438 break;
439 default: /* can't happen */
440 FATAL("can't happen: unknown type %d in cfoll", type(v));
441 }
442 }
443
444 /*
445 * collects initially active leaves of p into setvec
446 * returns 0 or 1 depending on whether p matches empty string
447 */
448 static int
449 first(Node *p)
450 {
451 int b, lp;
452
453 switch (type(p)) {
454 ELEAF
455 LEAF
456 /* look for high-water mark of subscripts */
457 lp = info(p);
458 /* guessing here! */
459 while (setcnt >= maxsetvec || lp >= maxsetvec) {
460 maxsetvec *= 4;
461 setvec = (int *)realloc(setvec, maxsetvec *
462 sizeof (int));
463 tmpset = (int *)realloc(tmpset, maxsetvec *
464 sizeof (int));
465 if (setvec == 0 || tmpset == 0)
466 overflo("out of space in first()");
467 }
468 if (type(p) == EMPTYRE) {
469 setvec[lp] = 0;
470 return (0);
471 }
472 if (setvec[lp] != 1) {
473 setvec[lp] = 1;
474 setcnt++;
475 }
476 if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
477 return (0); /* empty CCL */
478 else
479 return (1);
480 case PLUS:
481 if (first(left(p)) == 0)
482 return (0);
483 return (1);
484 case STAR:
485 case QUEST:
486 (void) first(left(p));
487 return (0);
488 case CAT:
489 if (first(left(p)) == 0 && first(right(p)) == 0)
490 return (0);
491 return (1);
492 case OR:
493 b = first(right(p));
494 if (first(left(p)) == 0 || b == 0)
495 return (0);
496 return (1);
497 }
498 /* can't happen */
499 FATAL("can't happen: unknown type %d in first", type(p));
500 /*NOTREACHED*/
501 return (-1);
502 }
503
504 /* collects leaves that can follow v into setvec */
505 static void
506 follow(Node *v)
507 {
508 Node *p;
509
510 if (type(v) == FINAL)
511 return;
512 p = parent(v);
513 switch (type(p)) {
514 case STAR:
515 case PLUS:
516 (void) first(v);
517 follow(p);
518 return;
519
520 case OR:
521 case QUEST:
522 follow(p);
523 return;
524
525 case CAT:
526 if (v == left(p)) { /* v is left child of p */
527 if (first(right(p)) == 0) {
528 follow(p);
529 return;
530 }
531 } else /* v is right child */
532 follow(p);
533 return;
534 }
535 }
536
537 static int
538 member(int c, const uchar *sarg) /* is c in s? */
539 {
540 uchar *s = (uchar *)sarg;
541
542 while (*s)
543 if (c == *s++)
544 return (1);
545 return (0);
546 }
547
548
549 int
550 match(fa *f, const uchar *p0) /* shortest match ? */
551 {
552 int s, ns;
553 uchar *p = (uchar *)p0;
554
555 s = f->reset ? makeinit(f, 0) : f->initstat;
556 if (f->out[s])
557 return (1);
558 do {
559 /* assert(*p < NCHARS); */
560 if ((ns = f->gototab[s][*p]) != 0)
561 s = ns;
562 else
563 s = cgoto(f, s, *p);
564 if (f->out[s])
565 return (1);
566 } while (*p++ != 0);
567 return (0);
568 }
569
570 int
571 pmatch(fa *f, const uchar *p0) /* longest match, for sub */
572 {
573 int s, ns;
574 uchar *p = (uchar *)p0;
575 uchar *q;
576 int i, k;
577
578 /* s = f->reset ? makeinit(f,1) : f->initstat; */
579 if (f->reset) {
580 f->initstat = s = makeinit(f, 1);
581 } else {
582 s = f->initstat;
583 }
584 patbeg = p;
585 patlen = -1;
586 do {
587 q = p;
588 do {
589 if (f->out[s]) /* final state */
590 patlen = q-p;
591 /* assert(*q < NCHARS); */
592 if ((ns = f->gototab[s][*q]) != 0)
593 s = ns;
594 else
595 s = cgoto(f, s, *q);
596 if (s == 1) { /* no transition */
597 if (patlen >= 0) {
598 patbeg = p;
599 return (1);
600 } else
601 goto nextin; /* no match */
602 }
603 } while (*q++ != 0);
604 if (f->out[s])
605 patlen = q - p - 1; /* don't count $ */
606 if (patlen >= 0) {
607 patbeg = p;
608 return (1);
609 }
610 nextin:
611 s = 2;
612 if (f->reset) {
613 for (i = 2; i <= f->curstat; i++)
614 xfree(f->posns[i]);
615 k = *f->posns[0];
616 if ((f->posns[2] =
617 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
618 overflo("out of space in pmatch");
619 }
620 for (i = 0; i <= k; i++)
621 (f->posns[2])[i] = (f->posns[0])[i];
622 f->initstat = f->curstat = 2;
623 f->out[2] = f->out[0];
624 for (i = 0; i < NCHARS; i++)
625 f->gototab[2][i] = 0;
626 }
627 } while (*p++ != 0);
628 return (0);
629 }
630
631 int
632 nematch(fa *f, const uchar *p0) /* non-empty match, for sub */
633 {
634 int s, ns;
635 uchar *p = (uchar *)p0;
636 uchar *q;
637 int i, k;
638
639 /* s = f->reset ? makeinit(f,1) : f->initstat; */
640 if (f->reset) {
641 f->initstat = s = makeinit(f, 1);
642 } else {
643 s = f->initstat;
644 }
645 patlen = -1;
646 while (*p) {
647 q = p;
648 do {
649 if (f->out[s]) /* final state */
650 patlen = q - p;
651 /* assert(*q < NCHARS); */
652 if ((ns = f->gototab[s][*q]) != 0)
653 s = ns;
654 else
655 s = cgoto(f, s, *q);
656 if (s == 1) { /* no transition */
657 if (patlen > 0) {
658 patbeg = p;
659 return (1);
660 } else
661 goto nnextin; /* no nonempty match */
662 }
663 } while (*q++ != 0);
664 if (f->out[s])
665 patlen = q - p - 1; /* don't count $ */
666 if (patlen > 0) {
667 patbeg = p;
668 return (1);
669 }
670 nnextin:
671 s = 2;
672 if (f->reset) {
673 for (i = 2; i <= f->curstat; i++)
674 xfree(f->posns[i]);
675 k = *f->posns[0];
676 if ((f->posns[2] =
677 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
678 overflo("out of state space");
679 }
680 for (i = 0; i <= k; i++)
681 (f->posns[2])[i] = (f->posns[0])[i];
682 f->initstat = f->curstat = 2;
683 f->out[2] = f->out[0];
684 for (i = 0; i < NCHARS; i++)
685 f->gototab[2][i] = 0;
686 }
687 p++;
688 }
689 return (0);
690 }
691
692 static Node *
693 reparse(const uchar *p)
694 {
695 /* parses regular expression pointed to by p */
696 /* uses relex() to scan regular expression */
697 Node *np;
698
699 dprintf(("reparse <%s>\n", p));
700 /* prestr points to string to be parsed */
701 lastre = prestr = (uchar *)p;
702 rtok = relex();
703 /* GNU compatibility: an empty regexp matches anything */
704 if (rtok == '\0') {
705 /* FATAL("empty regular expression"); previous */
706 return (op2(EMPTYRE, NIL, NIL));
707 }
708 np = regexp();
709 if (rtok != '\0') {
710 FATAL("syntax error in regular expression %s at %s",
711 lastre, prestr);
712 }
713 return (np);
714 }
715
716 static Node *
717 regexp(void) /* top-level parse of reg expr */
718 {
719 return (alt(concat(primary())));
720 }
721
722 static Node *
723 primary(void)
724 {
725 Node *np;
726
727 switch (rtok) {
728 case CHAR:
729 np = op2(CHAR, NIL, itonp(rlxval));
730 rtok = relex();
731 return (unary(np));
732 case ALL:
733 rtok = relex();
734 return (unary(op2(ALL, NIL, NIL)));
735 case EMPTYRE:
736 rtok = relex();
737 return (unary(op2(ALL, NIL, NIL)));
738 case DOT:
739 rtok = relex();
740 return (unary(op2(DOT, NIL, NIL)));
741 case CCL:
742 /*LINTED align*/
743 np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
744 rtok = relex();
745 return (unary(np));
746 case NCCL:
747 /*LINTED align*/
748 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
749 rtok = relex();
750 return (unary(np));
751 case '^':
752 rtok = relex();
753 return (unary(op2(CHAR, NIL, itonp(HAT))));
754 case '$':
755 rtok = relex();
756 return (unary(op2(CHAR, NIL, NIL)));
757 case '(':
758 rtok = relex();
759 if (rtok == ')') { /* special pleading for () */
760 rtok = relex();
761 return (unary(op2(CCL, NIL,
762 /*LINTED align*/
763 (Node *)tostring((uchar *)""))));
764 }
765 np = regexp();
766 if (rtok == ')') {
767 rtok = relex();
768 return (unary(np));
769 } else {
770 FATAL("syntax error in regular expression %s at %s",
771 lastre, prestr);
772 }
773 default:
774 FATAL("illegal primary in regular expression %s at %s",
775 lastre, prestr);
776 }
777 /*NOTREACHED*/
778 return (NULL);
779 }
780
781 static Node *
782 concat(Node *np)
783 {
784 switch (rtok) {
785 case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL:
786 case '$': case '(':
787 return (concat(op2(CAT, np, primary())));
788 default:
789 return (np);
790 }
791 }
792
793 static Node *
794 alt(Node *np)
795 {
796 if (rtok == OR) {
797 rtok = relex();
798 return (alt(op2(OR, np, concat(primary()))));
799 }
800 return (np);
801 }
802
803 static Node *
804 unary(Node *np)
805 {
806 switch (rtok) {
807 case STAR:
808 rtok = relex();
809 return (unary(op2(STAR, np, NIL)));
810 case PLUS:
811 rtok = relex();
812 return (unary(op2(PLUS, np, NIL)));
813 case QUEST:
814 rtok = relex();
815 return (unary(op2(QUEST, np, NIL)));
816 default:
817 return (np);
818 }
819 }
820
821 /*
822 * Character class definitions conformant to the POSIX locale as
823 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
824 * and operating character sets are both ASCII (ISO646) or supersets
825 * thereof.
826 *
827 * Note that to avoid overflowing the temporary buffer used in
828 * relex(), the expanded character class (prior to range expansion)
829 * must be less than twice the size of their full name.
830 */
831
832 /*
833 * Because isblank doesn't show up in any of the header files on any
834 * system i use, it's defined here. if some other locale has a richer
835 * definition of "blank", define HAS_ISBLANK and provide your own
836 * version.
837 * the parentheses here are an attempt to find a path through the maze
838 * of macro definition and/or function and/or version provided. thanks
839 * to nelson beebe for the suggestion; let's see if it works everywhere.
840 */
841
842 /* #define HAS_ISBLANK */
843 #ifndef HAS_ISBLANK
844
845 int
846 (xisblank)(int c)
847 {
848 return (c == ' ' || c == '\t');
849 }
850
851 #endif
852
853 struct charclass {
854 const char *cc_name;
855 int cc_namelen;
856 int (*cc_func)(int);
857 } charclasses[] = {
858 { "alnum", 5, isalnum },
859 { "alpha", 5, isalpha },
860 #ifndef HAS_ISBLANK
861 { "blank", 5, isspace }, /* was isblank */
862 #else
863 { "blank", 5, isblank },
864 #endif
865 { "cntrl", 5, iscntrl },
866 { "digit", 5, isdigit },
867 { "graph", 5, isgraph },
868 { "lower", 5, islower },
869 { "print", 5, isprint },
870 { "punct", 5, ispunct },
871 { "space", 5, isspace },
872 { "upper", 5, isupper },
873 { "xdigit", 6, isxdigit },
874 { NULL, 0, NULL },
875 };
876
877 static int
878 relex(void) /* lexical analyzer for reparse */
879 {
880 int c, n;
881 int cflag;
882 uchar *buf = 0;
883 size_t bufsz = 100;
884 uchar *bp;
885 struct charclass *cc;
886 int i;
887
888 switch (c = *prestr++) {
889 case '|': return OR;
890 case '*': return STAR;
891 case '+': return PLUS;
892 case '?': return QUEST;
893 case '.': return DOT;
894 case '\0': prestr--; return '\0';
895 case '^':
896 case '$':
897 case '(':
898 case ')':
899 return (c);
900 case '\\':
901 rlxval = quoted(&prestr);
902 return (CHAR);
903 default:
904 rlxval = c;
905 return (CHAR);
906 case '[':
907 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
908 FATAL("out of space in reg expr %.10s..", lastre);
909 bp = buf;
910 if (*prestr == '^') {
911 cflag = 1;
912 prestr++;
913 } else
914 cflag = 0;
915 n = 2 * strlen((const char *) prestr)+1;
916 if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
917 FATAL("out of space for reg expr %.10s...", lastre);
918 for (;;) {
919 if ((c = *prestr++) == '\\') {
920 *bp++ = '\\';
921 if ((c = *prestr++) == '\0') {
922 FATAL(
923 "nonterminated character class %s", lastre);
924 }
925 *bp++ = c;
926 } else if (c == '[' && *prestr == ':') {
927 /*
928 * POSIX char class names, Dag-Erling
929 * Smorgrav, des@ofug.org
930 */
931 for (cc = charclasses; cc->cc_name; cc++)
932 if (strncmp((const char *) prestr + 1,
933 (const char *) cc->cc_name,
934 cc->cc_namelen) == 0)
935 break;
936 if (cc->cc_name != NULL &&
937 prestr[1 + cc->cc_namelen] == ':' &&
938 prestr[2 + cc->cc_namelen] == ']') {
939 prestr += cc->cc_namelen + 3;
940 for (i = 0; i < NCHARS; i++) {
941 if (!adjbuf(&buf, &bufsz,
942 bp - buf + 1, 100, &bp,
943 "relex2"))
944 FATAL(
945 "out of space for reg expr %.10s...", lastre);
946 if (cc->cc_func(i)) {
947 *bp++ = i;
948 n++;
949 }
950 }
951 } else
952 *bp++ = c;
953 } else if (c == '\0') {
954 FATAL("nonterminated character class %.20s",
955 lastre);
956 } else if (bp == buf) { /* 1st char is special */
957 *bp++ = c;
958 } else if (c == ']') {
959 *bp++ = 0;
960 rlxstr = tostring(buf);
961 if (cflag == 0)
962 return (CCL);
963 else
964 return (NCCL);
965 } else
966 *bp++ = c;
967 }
968 /*NOTREACHED*/
969 }
970 return (0);
971 }
972
973 static int
974 cgoto(fa *f, int s, int c)
975 {
976 int i, j, k;
977 int *p, *q;
978
979 assert(c == HAT || c < NCHARS);
980 while (f->accept >= maxsetvec) { /* guessing here! */
981 maxsetvec *= 4;
982 setvec = (int *)realloc(setvec, maxsetvec * sizeof (int));
983 tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int));
984 if (setvec == 0 || tmpset == 0)
985 overflo("out of space in cgoto()");
986 }
987 for (i = 0; i <= f->accept; i++)
988 setvec[i] = 0;
989 setcnt = 0;
990 /* compute positions of gototab[s,c] into setvec */
991 p = f->posns[s];
992 for (i = 1; i <= *p; i++) {
993 if ((k = f->re[p[i]].ltype) != FINAL) {
994 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) ||
995 (k == DOT && c != 0 && c != HAT) ||
996 (k == ALL && c != 0) ||
997 (k == EMPTYRE && c != 0) ||
998 (k == CCL &&
999 member(c, (uchar *)f->re[p[i]].lval.up)) ||
1000 ((k == NCCL &&
1001 !member(c, (uchar *)f->re[p[i]].lval.up)) &&
1002 c != 0 && c != HAT)) {
1003 q = f->re[p[i]].lfollow;
1004 for (j = 1; j <= *q; j++) {
1005 if (q[j] >= maxsetvec) {
1006 maxsetvec *= 4;
1007 setvec = (int *)realloc(setvec,
1008 maxsetvec * sizeof (int));
1009 tmpset = (int *)realloc(tmpset,
1010 maxsetvec * sizeof (int));
1011 if (setvec == 0 ||
1012 tmpset == 0)
1013 overflo(
1014 "cgoto overflow");
1015 }
1016 if (setvec[q[j]] == 0) {
1017 setcnt++;
1018 setvec[q[j]] = 1;
1019 }
1020 }
1021 }
1022 }
1023 }
1024 /* determine if setvec is a previous state */
1025 tmpset[0] = setcnt;
1026 j = 1;
1027 for (i = f->accept; i >= 0; i--)
1028 if (setvec[i]) {
1029 tmpset[j++] = i;
1030 }
1031 /* tmpset == previous state? */
1032 for (i = 1; i <= f->curstat; i++) {
1033 p = f->posns[i];
1034 if ((k = tmpset[0]) != p[0])
1035 goto different;
1036 for (j = 1; j <= k; j++)
1037 if (tmpset[j] != p[j])
1038 goto different;
1039 /* setvec is state i */
1040 f->gototab[s][c] = i;
1041 return (i);
1042 different:;
1043 }
1044
1045 /* add tmpset to current set of states */
1046 if (f->curstat >= NSTATES - 1) {
1047 f->curstat = 2;
1048 f->reset = 1;
1049 for (i = 2; i < NSTATES; i++)
1050 xfree(f->posns[i]);
1051 } else
1052 ++(f->curstat);
1053 for (i = 0; i < NCHARS; i++)
1054 f->gototab[f->curstat][i] = 0;
1055 xfree(f->posns[f->curstat]);
1056 if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
1057 overflo("out of space in cgoto");
1058
1059 f->posns[f->curstat] = p;
1060 f->gototab[s][c] = f->curstat;
1061 for (i = 0; i <= setcnt; i++)
1062 p[i] = tmpset[i];
1063 if (setvec[f->accept])
1064 f->out[f->curstat] = 1;
1065 else
1066 f->out[f->curstat] = 0;
1067 return (f->curstat);
1068 }
1069
1070 static void
1071 freefa(fa *f) /* free a finite automaton */
1072 {
1073 int i;
1074
1075 if (f == NULL)
1076 return;
1077 for (i = 0; i <= f->curstat; i++)
1078 xfree(f->posns[i]);
1079 for (i = 0; i <= f->accept; i++) {
1080 xfree(f->re[i].lfollow);
1081 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1082 xfree((f->re[i].lval.np));
1083 }
1084 xfree(f->restr);
1085 xfree(f);
1086 }