8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
25 * Use is subject to license terms.
26 */
27
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
30
31 #pragma ident "%Z%%M% %I% %E% SMI"
32
33 #define DEBUG
34
35 #include "awk.h"
36 #include "y.tab.h"
37
38 #define HAT (NCHARS-1) /* matches ^ in regular expr */
39 /* NCHARS is 2**n */
40 #define MAXLIN (3 * LINE_MAX)
41
42 #define type(v) (v)->nobj
43 #define left(v) (v)->narg[0]
44 #define right(v) (v)->narg[1]
45 #define parent(v) (v)->nnext
46
47 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
48 #define UNARY case STAR: case PLUS: case QUEST:
49
50 /*
51 * encoding in tree Nodes:
52 * leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
53 * left is index, right contains value or pointer to value
54 * unary (STAR, PLUS, QUEST): left is child, right is null
55 * binary (CAT, OR): left and right are children
56 * parent contains pointer to parent
57 */
58
59 int setvec[MAXLIN];
60 int tmpset[MAXLIN];
61 Node *point[MAXLIN];
62
63 int rtok; /* next token in current re */
64 int rlxval;
65 uchar *rlxstr;
66 uchar *prestr; /* current position in current re */
67 uchar *lastre; /* origin of last re */
68
69 static int setcnt;
70 static int poscnt;
71
72 uchar *patbeg;
73 int patlen;
74
75 #define NFA 20 /* cache this many dynamic fa's */
76 fa *fatab[NFA];
77 int nfatab = 0; /* entries in fatab */
78
79 static fa *mkdfa(uchar *, int);
80 static int makeinit(fa *, int);
81 static void penter(Node *);
82 static void freetr(Node *);
83 static void overflo(char *);
84 static void cfoll(fa *, Node *);
85 static void follow(Node *);
86 static Node *reparse(uchar *);
87 static int relex(void);
88 static void freefa(fa *);
89 static int cgoto(fa *, int, int);
90
91 fa *
92 makedfa(uchar *s, int anchor) /* returns dfa for reg expr s */
93 {
94 int i, use, nuse;
95 fa *pfa;
96
97 if (compile_time) /* a constant for sure */
98 return (mkdfa(s, anchor));
99 for (i = 0; i < nfatab; i++) { /* is it there already? */
100 if (fatab[i]->anchor == anchor &&
101 strcmp((char *)fatab[i]->restr, (char *)s) == 0) {
102 fatab[i]->use++;
103 return (fatab[i]);
104 }
105 }
106 pfa = mkdfa(s, anchor);
107 if (nfatab < NFA) { /* room for another */
108 fatab[nfatab] = pfa;
109 fatab[nfatab]->use = 1;
110 nfatab++;
111 return (pfa);
112 }
113 use = fatab[0]->use; /* replace least-recently used */
114 nuse = 0;
115 for (i = 1; i < nfatab; i++)
116 if (fatab[i]->use < use) {
117 use = fatab[i]->use;
118 nuse = i;
119 }
120 freefa(fatab[nuse]);
121 fatab[nuse] = pfa;
122 pfa->use = 1;
123 return (pfa);
124 }
125
126 fa *
127 mkdfa(uchar *s, int anchor) /* does the real work of making a dfa */
128 /* anchor = 1 for anchored matches, else 0 */
129 {
130 Node *p, *p1;
131 fa *f;
132
133 p = reparse(s);
134 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
135 /* put ALL STAR in front of reg. exp. */
136 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
137 /* put FINAL after reg. exp. */
138
139 poscnt = 0;
140 penter(p1); /* enter parent pointers and leaf indices */
141 if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
142 overflo("no room for fa");
143 /* penter has computed number of positions in re */
144 f->accept = poscnt-1;
145 cfoll(f, p1); /* set up follow sets */
146 freetr(p1);
147 if ((f->posns[0] =
148 (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
149 overflo("out of space in makedfa");
150 }
151 if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
152 overflo("out of space in makedfa");
153 *f->posns[1] = 0;
154 f->initstat = makeinit(f, anchor);
155 f->anchor = anchor;
156 f->restr = tostring(s);
157 return (f);
158 }
159
160 static int
161 makeinit(fa *f, int anchor)
162 {
163 register int i, k;
164
165 f->curstat = 2;
166 f->out[2] = 0;
167 f->reset = 0;
168 k = *(f->re[0].lfollow);
169 xfree(f->posns[2]);
170 if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
171 overflo("out of space in makeinit");
172 for (i = 0; i <= k; i++) {
173 (f->posns[2])[i] = (f->re[0].lfollow)[i];
174 }
175 if ((f->posns[2])[1] == f->accept)
176 f->out[2] = 1;
177 for (i = 0; i < NCHARS; i++)
178 f->gototab[2][i] = 0;
179 f->curstat = cgoto(f, 2, HAT);
180 if (anchor) {
181 *f->posns[2] = k-1; /* leave out position 0 */
182 for (i = 0; i < k; i++) {
183 (f->posns[0])[i] = (f->posns[2])[i];
184 }
185
186 f->out[0] = f->out[2];
187 if (f->curstat != 2)
188 --(*f->posns[f->curstat]);
189 }
190 return (f->curstat);
191 }
192
193 void
194 penter(Node *p) /* set up parent pointers and leaf indices */
195 {
196 switch (type(p)) {
197 LEAF
198 left(p) = (Node *) poscnt;
199 point[poscnt++] = p;
200 break;
201 UNARY
202 penter(left(p));
203 parent(left(p)) = p;
204 break;
205 case CAT:
206 case OR:
207 penter(left(p));
208 penter(right(p));
209 parent(left(p)) = p;
210 parent(right(p)) = p;
211 break;
212 default:
213 ERROR "unknown type %d in penter", type(p) FATAL;
214 break;
215 }
216 }
217
218 static void
219 freetr(Node *p) /* free parse tree */
220 {
221 switch (type(p)) {
222 LEAF
223 xfree(p);
224 break;
225 UNARY
226 freetr(left(p));
227 xfree(p);
228 break;
229 case CAT:
230 case OR:
231 freetr(left(p));
232 freetr(right(p));
233 xfree(p);
234 break;
235 default:
236 ERROR "unknown type %d in freetr", type(p) FATAL;
237 break;
238 }
239 }
240
241 uchar *
242 cclenter(uchar *p)
243 {
244 register int i, c;
245 uchar *op, *chars, *ret;
246 size_t bsize;
247
248 init_buf(&chars, &bsize, LINE_INCR);
249 op = p;
250 i = 0;
251 while ((c = *p++) != 0) {
252 if (c == '\\') {
253 if ((c = *p++) == 't')
254 c = '\t';
255 else if (c == 'n')
256 c = '\n';
257 else if (c == 'f')
258 c = '\f';
259 else if (c == 'r')
260 c = '\r';
261 else if (c == 'b')
262 c = '\b';
263 else if (c == '\\')
264 c = '\\';
265 else if (isdigit(c)) {
266 int n = c - '0';
267 if (isdigit(*p)) {
268 n = 8 * n + *p++ - '0';
269 if (isdigit(*p))
270 n = 8 * n + *p++ - '0';
271 }
272 c = n;
273 } /* else */
274 /* c = c; */
275 } else if (c == '-' && i > 0 && chars[i-1] != 0) {
276 if (*p != 0) {
277 c = chars[i-1];
278 while ((uchar)c < *p) { /* fails if *p is \\ */
279 expand_buf(&chars, &bsize, i);
280 chars[i++] = ++c;
281 }
282 p++;
283 continue;
284 }
285 }
286 expand_buf(&chars, &bsize, i);
287 chars[i++] = c;
288 }
289 chars[i++] = '\0';
290 dprintf(("cclenter: in = |%s|, out = |%s|\n", op, chars));
291 xfree(op);
292 ret = tostring(chars);
293 free(chars);
294 return (ret);
295 }
296
297 static void
298 overflo(char *s)
299 {
300 ERROR "regular expression too big: %s", gettext((char *)s) FATAL;
301 }
302
303 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
304 static void
305 cfoll(fa *f, Node *v)
306 {
307 register int i;
308 register int *p;
309
310 switch (type(v)) {
311 LEAF
312 f->re[(int)left(v)].ltype = type(v);
313 f->re[(int)left(v)].lval = (int)right(v);
314 for (i = 0; i <= f->accept; i++)
315 setvec[i] = 0;
316 setcnt = 0;
317 follow(v); /* computes setvec and setcnt */
318 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
319 overflo("follow set overflow");
320 f->re[(int)left(v)].lfollow = p;
321 *p = setcnt;
322 for (i = f->accept; i >= 0; i--) {
323 if (setvec[i] == 1)
324 *++p = i;
325 }
326 break;
327 UNARY
328 cfoll(f, left(v));
329 break;
330 case CAT:
331 case OR:
332 cfoll(f, left(v));
333 cfoll(f, right(v));
334 break;
335 default:
336 ERROR "unknown type %d in cfoll", type(v) FATAL;
337 }
338 }
339
340 /*
341 * collects initially active leaves of p into setvec
342 * returns 0 or 1 depending on whether p matches empty string
343 */
344 static int
345 first(Node *p)
346 {
347 register int b;
348
349 switch (type(p)) {
350 LEAF
351 if (setvec[(int)left(p)] != 1) {
352 setvec[(int)left(p)] = 1;
353 setcnt++;
354 }
355 if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
356 return (0); /* empty CCL */
357 else
358 return (1);
359 case PLUS:
360 if (first(left(p)) == 0)
361 return (0);
362 return (1);
363 case STAR:
364 case QUEST:
365 (void) first(left(p));
366 return (0);
367 case CAT:
368 if (first(left(p)) == 0 && first(right(p)) == 0)
369 return (0);
370 return (1);
371 case OR:
372 b = first(right(p));
373 if (first(left(p)) == 0 || b == 0)
374 return (0);
375 return (1);
376 }
377 ERROR "unknown type %d in first", type(p) FATAL;
378 return (-1);
379 }
380
381 /* collects leaves that can follow v into setvec */
382 static void
383 follow(Node *v)
384 {
385 Node *p;
386
387 if (type(v) == FINAL)
388 return;
389 p = parent(v);
390 switch (type(p)) {
391 case STAR:
392 case PLUS:
393 (void) first(v);
394 follow(p);
395 return;
396
397 case OR:
398 case QUEST:
399 follow(p);
400 return;
401
402 case CAT:
403 if (v == left(p)) { /* v is left child of p */
404 if (first(right(p)) == 0) {
405 follow(p);
406 return;
407 }
408 } else /* v is right child */
409 follow(p);
410 return;
411 default:
412 ERROR "unknown type %d in follow", type(p) FATAL;
413 break;
414 }
415 }
416
417 static int
418 member(uchar c, uchar *s) /* is c in s? */
419 {
420 while (*s)
421 if (c == *s++)
422 return (1);
423 return (0);
424 }
425
426
427 int
428 match(fa *f, uchar *p)
429 {
430 register int s, ns;
431
432 s = f->reset ? makeinit(f, 0) : f->initstat;
433 if (f->out[s])
434 return (1);
435 do {
436 if ((ns = f->gototab[s][*p]) != 0)
437 s = ns;
438 else
439 s = cgoto(f, s, *p);
440 if (f->out[s])
441 return (1);
442 } while (*p++ != 0);
443 return (0);
444 }
445
446 int
447 pmatch(fa *f, uchar *p)
448 {
449 register int s, ns;
450 register uchar *q;
451 int i, k;
452
453 if (f->reset) {
454 f->initstat = s = makeinit(f, 1);
455 } else {
456 s = f->initstat;
457 }
458 patbeg = p;
459 patlen = -1;
460 do {
461 q = p;
462 do {
463 if (f->out[s]) /* final state */
464 patlen = q-p;
465 if ((ns = f->gototab[s][*q]) != 0)
466 s = ns;
467 else
468 s = cgoto(f, s, *q);
469 if (s == 1) { /* no transition */
470 if (patlen >= 0) {
471 patbeg = p;
472 return (1);
473 } else
474 goto nextin; /* no match */
475 }
476 } while (*q++ != 0);
477 if (f->out[s])
478 patlen = q - p - 1; /* don't count $ */
479 if (patlen >= 0) {
480 patbeg = p;
481 return (1);
482 }
483 nextin:
484 s = 2;
485 if (f->reset) {
486 for (i = 2; i <= f->curstat; i++)
487 xfree(f->posns[i]);
488 k = *f->posns[0];
489 if ((f->posns[2] =
490 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
491 overflo("out of space in pmatch");
492 }
493 for (i = 0; i <= k; i++)
494 (f->posns[2])[i] = (f->posns[0])[i];
495 f->initstat = f->curstat = 2;
496 f->out[2] = f->out[0];
497 for (i = 0; i < NCHARS; i++)
498 f->gototab[2][i] = 0;
499 }
500 } while (*p++ != 0);
501 return (0);
502 }
503
504 int
505 nematch(fa *f, uchar *p)
506 {
507 register int s, ns;
508 register uchar *q;
509 int i, k;
510
511 if (f->reset) {
512 f->initstat = s = makeinit(f, 1);
513 } else {
514 s = f->initstat;
515 }
516 patlen = -1;
517 while (*p) {
518 q = p;
519 do {
520 if (f->out[s]) /* final state */
521 patlen = q-p;
522 if ((ns = f->gototab[s][*q]) != 0)
523 s = ns;
524 else
525 s = cgoto(f, s, *q);
526 if (s == 1) { /* no transition */
527 if (patlen > 0) {
528 patbeg = p;
529 return (1);
530 } else
531 goto nnextin; /* no nonempty match */
532 }
533 } while (*q++ != 0);
534 if (f->out[s])
535 patlen = q - p - 1; /* don't count $ */
536 if (patlen > 0) {
537 patbeg = p;
538 return (1);
539 }
540 nnextin:
541 s = 2;
542 if (f->reset) {
543 for (i = 2; i <= f->curstat; i++)
544 xfree(f->posns[i]);
545 k = *f->posns[0];
546 if ((f->posns[2] =
547 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
548 overflo("out of state space");
549 }
550 for (i = 0; i <= k; i++)
551 (f->posns[2])[i] = (f->posns[0])[i];
552 f->initstat = f->curstat = 2;
553 f->out[2] = f->out[0];
554 for (i = 0; i < NCHARS; i++)
555 f->gototab[2][i] = 0;
556 }
557 p++;
558 }
559 return (0);
560 }
561
562 static Node *regexp(void), *primary(void), *concat(Node *);
563 static Node *alt(Node *), *unary(Node *);
564
565 static Node *
566 reparse(uchar *p)
567 {
568 /* parses regular expression pointed to by p */
569 /* uses relex() to scan regular expression */
570 Node *np;
571
572 dprintf(("reparse <%s>\n", p));
573 lastre = prestr = p; /* prestr points to string to be parsed */
574 rtok = relex();
575 if (rtok == '\0')
576 ERROR "empty regular expression" FATAL;
577 np = regexp();
578 if (rtok == '\0') {
579 return (np);
580 } else {
581 ERROR "syntax error in regular expression %s at %s",
582 lastre, prestr FATAL;
583 }
584 /*NOTREACHED*/
585 return (NULL);
586 }
587
588 static Node *
589 regexp(void)
590 {
591 return (alt(concat(primary())));
592 }
593
594 static Node *
595 primary(void)
596 {
597 Node *np;
598
599 switch (rtok) {
600 case CHAR:
601 np = op2(CHAR, NIL, (Node *)rlxval);
602 rtok = relex();
603 return (unary(np));
604 case ALL:
605 rtok = relex();
606 return (unary(op2(ALL, NIL, NIL)));
607 case DOT:
608 rtok = relex();
609 return (unary(op2(DOT, NIL, NIL)));
610 case CCL:
611 /*LINTED align*/
612 np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
613 rtok = relex();
614 return (unary(np));
615 case NCCL:
616 /*LINTED align*/
617 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
618 rtok = relex();
619 return (unary(np));
620 case '^':
621 rtok = relex();
622 return (unary(op2(CHAR, NIL, (Node *)HAT)));
623 case '$':
624 rtok = relex();
625 return (unary(op2(CHAR, NIL, NIL)));
626 case '(':
627 rtok = relex();
628 if (rtok == ')') { /* special pleading for () */
629 rtok = relex();
630 return (unary(op2(CCL, NIL,
631 /*LINTED align*/
632 (Node *)tostring((uchar *)""))));
633 }
634 np = regexp();
635 if (rtok == ')') {
636 rtok = relex();
637 return (unary(np));
638 } else {
639 ERROR "syntax error in regular expression %s at %s",
640 lastre, prestr FATAL;
641 }
642 default:
643 ERROR "illegal primary in regular expression %s at %s",
644 lastre, prestr FATAL;
645 }
646 /*NOTREACHED*/
647 return (NULL);
648 }
649
650 static Node *
651 concat(Node *np)
652 {
653 switch (rtok) {
654 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
655 return (concat(op2(CAT, np, primary())));
656 default:
657 return (np);
658 }
659 }
660
661 static Node *
662 alt(Node *np)
663 {
664 if (rtok == OR) {
665 rtok = relex();
666 return (alt(op2(OR, np, concat(primary()))));
667 }
668 return (np);
669 }
670
671 static Node *
672 unary(Node *np)
673 {
674 switch (rtok) {
675 case STAR:
676 rtok = relex();
677 return (unary(op2(STAR, np, NIL)));
678 case PLUS:
679 rtok = relex();
680 return (unary(op2(PLUS, np, NIL)));
681 case QUEST:
682 rtok = relex();
683 return (unary(op2(QUEST, np, NIL)));
684 default:
685 return (np);
686 }
687 }
688
689 static int
690 relex(void) /* lexical analyzer for reparse */
691 {
692 register int c;
693 uchar *cbuf;
694 int clen, cflag;
695
696 switch (c = *prestr++) {
697 case '|': return OR;
698 case '*': return STAR;
699 case '+': return PLUS;
700 case '?': return QUEST;
701 case '.': return DOT;
702 case '\0': prestr--; return '\0';
703 case '^':
704 case '$':
705 case '(':
706 case ')':
707 return (c);
708 case '\\':
709 if ((c = *prestr++) == 't')
710 c = '\t';
711 else if (c == 'n')
712 c = '\n';
713 else if (c == 'f')
714 c = '\f';
715 else if (c == 'r')
716 c = '\r';
717 else if (c == 'b')
718 c = '\b';
719 else if (c == '\\')
720 c = '\\';
721 else if (isdigit(c)) {
722 int n = c - '0';
723 if (isdigit(*prestr)) {
724 n = 8 * n + *prestr++ - '0';
725 if (isdigit(*prestr))
726 n = 8 * n + *prestr++ - '0';
727 }
728 c = n;
729 } /* else it's now in c */
730 rlxval = c;
731 return (CHAR);
732 default:
733 rlxval = c;
734 return (CHAR);
735 case '[':
736 clen = 0;
737 if (*prestr == '^') {
738 cflag = 1;
739 prestr++;
740 } else
741 cflag = 0;
742 init_buf(&cbuf, NULL, strlen((char *)prestr) * 2 + 1);
743 for (;;) {
744 if ((c = *prestr++) == '\\') {
745 cbuf[clen++] = '\\';
746 if ((c = *prestr++) == '\0') {
747 ERROR
748 "nonterminated character class %s", lastre FATAL;
749 }
750 cbuf[clen++] = c;
751 } else if (c == ']') {
752 cbuf[clen] = 0;
753 rlxstr = tostring(cbuf);
754 free(cbuf);
755 if (cflag == 0)
756 return (CCL);
757 else
758 return (NCCL);
759 } else if (c == '\n') {
760 ERROR "newline in character class %s...",
761 lastre FATAL;
762 } else if (c == '\0') {
763 ERROR "nonterminated character class %s",
764 lastre FATAL;
765 } else
766 cbuf[clen++] = c;
767 }
768 /*NOTREACHED*/
769 }
770 return (0);
771 }
772
773 static int
774 cgoto(fa *f, int s, int c)
775 {
776 register int i, j, k;
777 register int *p, *q;
778
779 for (i = 0; i <= f->accept; i++)
780 setvec[i] = 0;
781 setcnt = 0;
782 /* compute positions of gototab[s,c] into setvec */
783 p = f->posns[s];
784 for (i = 1; i <= *p; i++) {
785 if ((k = f->re[p[i]].ltype) != FINAL) {
786 if (k == CHAR && c == f->re[p[i]].lval ||
787 k == DOT && c != 0 && c != HAT ||
788 k == ALL && c != 0 ||
789 k == CCL &&
790 member(c, (uchar *)f->re[p[i]].lval) ||
791 k == NCCL &&
792 !member(c, (uchar *)f->re[p[i]].lval) &&
793 c != 0 && c != HAT) {
794 q = f->re[p[i]].lfollow;
795 for (j = 1; j <= *q; j++) {
796 if (setvec[q[j]] == 0) {
797 setcnt++;
798 setvec[q[j]] = 1;
799 }
800 }
801 }
802 }
803 }
804 /* determine if setvec is a previous state */
805 tmpset[0] = setcnt;
806 j = 1;
807 for (i = f->accept; i >= 0; i--)
808 if (setvec[i]) {
809 tmpset[j++] = i;
810 }
811 /* tmpset == previous state? */
812 for (i = 1; i <= f->curstat; i++) {
813 p = f->posns[i];
814 if ((k = tmpset[0]) != p[0])
815 goto different;
816 for (j = 1; j <= k; j++)
817 if (tmpset[j] != p[j])
818 goto different;
819 /* setvec is state i */
820 f->gototab[s][c] = i;
821 return (i);
822 different:;
823 }
824
825 /* add tmpset to current set of states */
826 if (f->curstat >= NSTATES-1) {
827 f->curstat = 2;
828 f->reset = 1;
829 for (i = 2; i < NSTATES; i++)
830 xfree(f->posns[i]);
831 } else
832 ++(f->curstat);
833 for (i = 0; i < NCHARS; i++)
834 f->gototab[f->curstat][i] = 0;
835 xfree(f->posns[f->curstat]);
836 if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
837 overflo("out of space in cgoto");
838
839 f->posns[f->curstat] = p;
840 f->gototab[s][c] = f->curstat;
841 for (i = 0; i <= setcnt; i++)
842 p[i] = tmpset[i];
843 if (setvec[f->accept])
844 f->out[f->curstat] = 1;
845 else
846 f->out[f->curstat] = 0;
847 return (f->curstat);
848 }
849
850 static void
851 freefa(fa *f)
852 {
853
854 register int i;
855
856 if (f == NULL)
857 return;
858 for (i = 0; i <= f->curstat; i++)
859 xfree(f->posns[i]);
860 for (i = 0; i <= f->accept; i++)
861 xfree(f->re[i].lfollow);
862 xfree(f->restr);
863 xfree(f);
864 }
|
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
25 * Use is subject to license terms.
26 */
27
28 /*
29 * Copyright (C) Lucent Technologies 1997
30 * All Rights Reserved
31 *
32 * Permission to use, copy, modify, and distribute this software and
33 * its documentation for any purpose and without fee is hereby
34 * granted, provided that the above copyright notice appear in all
35 * copies and that both that the copyright notice and this
36 * permission notice and warranty disclaimer appear in supporting
37 * documentation, and that the name Lucent Technologies or any of
38 * its entities not be used in advertising or publicity pertaining
39 * to distribution of the software without specific, written prior
40 * permission.
41 *
42 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
43 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
44 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
45 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
46 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
47 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
48 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
49 * THIS SOFTWARE.
50 */
51
52 #define DEBUG
53
54 #include "awk.h"
55 #include "y.tab.h"
56
57 #define HAT (NCHARS+2) /* matches ^ in regular expr */
58 /* NCHARS is 2**n */
59 #define MAXLIN 22
60
61 #define type(v) (v)->nobj /* badly overloaded here */
62 #define info(v) (v)->ntype /* badly overloaded here */
63 #define left(v) (v)->narg[0]
64 #define right(v) (v)->narg[1]
65 #define parent(v) (v)->nnext
66
67 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
68 #define ELEAF case EMPTYRE: /* empty string in regexp */
69 #define UNARY case STAR: case PLUS: case QUEST:
70
71 /*
72 * encoding in tree Nodes:
73 * leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
74 * left is index, right contains value or pointer to value
75 * unary (STAR, PLUS, QUEST): left is child, right is null
76 * binary (CAT, OR): left and right are children
77 * parent contains pointer to parent
78 */
79
80 int *setvec;
81 int *tmpset;
82 int maxsetvec = 0;
83
84 int rtok; /* next token in current re */
85 int rlxval;
86 static uchar *rlxstr;
87 static uchar *prestr; /* current position in current re */
88 static uchar *lastre; /* origin of last re */
89
90 static int setcnt;
91 static int poscnt;
92
93 uchar *patbeg;
94 int patlen;
95
96 #define NFA 20 /* cache this many dynamic fa's */
97 fa *fatab[NFA];
98 int nfatab = 0; /* entries in fatab */
99
100 static fa *mkdfa(const uchar *, int);
101 static int makeinit(fa *, int);
102 static void penter(Node *);
103 static void freetr(Node *);
104 static void overflo(const char *);
105 static void cfoll(fa *, Node *);
106 static void follow(Node *);
107 static Node *reparse(const uchar *);
108 static int relex(void);
109 static void freefa(fa *);
110 static int cgoto(fa *, int, int);
111 static Node *regexp(void);
112 static Node *primary(void);
113 static Node *concat(Node *);
114 static Node *alt(Node *);
115 static Node *unary(Node *);
116
117 fa *
118 makedfa(const uchar *s, int anchor) /* returns dfa for reg expr s */
119 {
120 int i, use, nuse;
121 fa *pfa;
122 static int now = 1;
123
124 if (setvec == 0) { /* first time through any RE */
125 maxsetvec = MAXLIN;
126 setvec = (int *)malloc(maxsetvec * sizeof (int));
127 tmpset = (int *)malloc(maxsetvec * sizeof (int));
128 if (setvec == 0 || tmpset == 0)
129 overflo("out of space initializing makedfa");
130 }
131
132 if (compile_time) /* a constant for sure */
133 return (mkdfa(s, anchor));
134 for (i = 0; i < nfatab; i++) { /* is it there already? */
135 if (fatab[i]->anchor == anchor &&
136 strcmp((const char *)fatab[i]->restr, (char *)s) == 0) {
137 fatab[i]->use = now++;
138 return (fatab[i]);
139 }
140 }
141 pfa = mkdfa(s, anchor);
142 if (nfatab < NFA) { /* room for another */
143 fatab[nfatab] = pfa;
144 fatab[nfatab]->use = now++;
145 nfatab++;
146 return (pfa);
147 }
148 use = fatab[0]->use; /* replace least-recently used */
149 nuse = 0;
150 for (i = 1; i < nfatab; i++) {
151 if (fatab[i]->use < use) {
152 use = fatab[i]->use;
153 nuse = i;
154 }
155 }
156 freefa(fatab[nuse]);
157 fatab[nuse] = pfa;
158 pfa->use = now++;
159 return (pfa);
160 }
161
162 fa *
163 mkdfa(const uchar *s, int anchor)
164 { /* does the real work of making a dfa */
165 /* anchor = 1 for anchored matches, else 0 */
166 Node *p, *p1;
167 fa *f;
168
169 p = reparse(s);
170 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
171 /* put ALL STAR in front of reg. exp. */
172 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
173 /* put FINAL after reg. exp. */
174
175 poscnt = 0;
176 penter(p1); /* enter parent pointers and leaf indices */
177 if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
178 overflo("out of space for fa");
179 /* penter has computed number of positions in re */
180 f->accept = poscnt-1;
181 cfoll(f, p1); /* set up follow sets */
182 freetr(p1);
183 if ((f->posns[0] =
184 (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
185 overflo("out of space in makedfa");
186 }
187 if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
188 overflo("out of space in makedfa");
189 *f->posns[1] = 0;
190 f->initstat = makeinit(f, anchor);
191 f->anchor = anchor;
192 f->restr = tostring(s);
193 return (f);
194 }
195
196 static int
197 makeinit(fa *f, int anchor)
198 {
199 int i, k;
200
201 f->curstat = 2;
202 f->out[2] = 0;
203 f->reset = 0;
204 k = *(f->re[0].lfollow);
205 xfree(f->posns[2]);
206 if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
207 overflo("out of space in makeinit");
208 for (i = 0; i <= k; i++) {
209 (f->posns[2])[i] = (f->re[0].lfollow)[i];
210 }
211 if ((f->posns[2])[1] == f->accept)
212 f->out[2] = 1;
213 for (i = 0; i < NCHARS; i++)
214 f->gototab[2][i] = 0;
215 f->curstat = cgoto(f, 2, HAT);
216 if (anchor) {
217 *f->posns[2] = k-1; /* leave out position 0 */
218 for (i = 0; i < k; i++) {
219 (f->posns[0])[i] = (f->posns[2])[i];
220 }
221
222 f->out[0] = f->out[2];
223 if (f->curstat != 2)
224 --(*f->posns[f->curstat]);
225 }
226 return (f->curstat);
227 }
228
229 void
230 penter(Node *p) /* set up parent pointers and leaf indices */
231 {
232 switch (type(p)) {
233 ELEAF
234 LEAF
235 info(p) = poscnt;
236 poscnt++;
237 break;
238 UNARY
239 penter(left(p));
240 parent(left(p)) = p;
241 break;
242 case CAT:
243 case OR:
244 penter(left(p));
245 penter(right(p));
246 parent(left(p)) = p;
247 parent(right(p)) = p;
248 break;
249 default: /* can't happen */
250 FATAL("can't happen: unknown type %d in penter", type(p));
251 break;
252 }
253 }
254
255 static void
256 freetr(Node *p) /* free parse tree */
257 {
258 switch (type(p)) {
259 ELEAF
260 LEAF
261 xfree(p);
262 break;
263 UNARY
264 freetr(left(p));
265 xfree(p);
266 break;
267 case CAT:
268 case OR:
269 freetr(left(p));
270 freetr(right(p));
271 xfree(p);
272 break;
273 default: /* can't happen */
274 FATAL("can't happen: unknown type %d in freetr", type(p));
275 break;
276 }
277 }
278
279 /*
280 * in the parsing of regular expressions, metacharacters like . have
281 * to be seen literally; \056 is not a metacharacter.
282 */
283 int
284 hexstr(uchar **pp) /* find and eval hex string at pp, return new p */
285 { /* only pick up one 8-bit byte (2 chars) */
286 uchar *p;
287 int n = 0;
288 int i;
289
290 for (i = 0, p = (uchar *)*pp; i < 2 && isxdigit(*p); i++, p++) {
291 if (isdigit(*p))
292 n = 16 * n + *p - '0';
293 else if (*p >= 'a' && *p <= 'f')
294 n = 16 * n + *p - 'a' + 10;
295 else if (*p >= 'A' && *p <= 'F')
296 n = 16 * n + *p - 'A' + 10;
297 }
298 *pp = (uchar *)p;
299 return (n);
300 }
301
302 /* multiple use of arg */
303 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')
304
305 int
306 quoted(uchar **pp) /* pick up next thing after a \\ */
307 { /* and increment *pp */
308
309 uchar *p = *pp;
310 int c;
311
312 if ((c = *p++) == 't')
313 c = '\t';
314 else if (c == 'n')
315 c = '\n';
316 else if (c == 'f')
317 c = '\f';
318 else if (c == 'r')
319 c = '\r';
320 else if (c == 'b')
321 c = '\b';
322 else if (c == '\\')
323 c = '\\';
324 else if (c == 'x') { /* hexadecimal goo follows */
325 c = hexstr(&p); /* this adds a null if number is invalid */
326 } else if (isoctdigit(c)) { /* \d \dd \ddd */
327 int n = c - '0';
328 if (isoctdigit(*p)) {
329 n = 8 * n + *p++ - '0';
330 if (isoctdigit(*p))
331 n = 8 * n + *p++ - '0';
332 }
333 c = n;
334 } /* else */
335 /* c = c; */
336 *pp = p;
337 return (c);
338 }
339
340 uchar *
341 cclenter(const uchar *argp) /* add a character class */
342 {
343 int i, c, c2;
344 uchar *p = (uchar *)argp;
345 uchar *op, *bp;
346 static uchar *buf = NULL;
347 size_t bufsz = 100;
348
349 op = p;
350 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
351 FATAL("out of space for character class [%.10s...] 1", p);
352 bp = buf;
353 for (i = 0; (c = *p++) != 0; ) {
354 if (c == '\\') {
355 c = quoted(&p);
356 } else if (c == '-' && i > 0 && bp[-1] != 0) {
357 if (*p != 0) {
358 c = bp[-1];
359 c2 = *p++;
360 if (c2 == '\\')
361 c2 = quoted(&p);
362 if (c > c2) { /* empty; ignore */
363 bp--;
364 i--;
365 continue;
366 }
367 while (c < c2) {
368 if (adjbuf(&buf, &bufsz,
369 bp - buf + 2, 100, &bp,
370 "cclenter1") == 0)
371 FATAL(
372 "out of space for character class [%.10s...] 2", p);
373 *bp++ = ++c;
374 i++;
375 }
376 continue;
377 }
378 }
379 if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
380 FATAL(
381 "out of space for character class [%.10s...] 3", p);
382 *bp++ = c;
383 i++;
384 }
385 *bp = 0;
386 dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf));
387 xfree(op);
388 return (tostring(buf));
389 }
390
391 static void
392 overflo(const char *s)
393 {
394 FATAL("regular expression too big: %.30s...", gettext((char *)s));
395 }
396
397 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
398 static void
399 cfoll(fa *f, Node *v)
400 {
401 int i;
402 int *p;
403
404 switch (type(v)) {
405 ELEAF
406 LEAF
407 f->re[info(v)].ltype = type(v);
408 f->re[info(v)].lval.np = right(v);
409 while (f->accept >= maxsetvec) { /* guessing here! */
410 maxsetvec *= 4;
411 setvec = (int *)realloc(setvec, maxsetvec *
412 sizeof (int));
413 tmpset = (int *)realloc(tmpset, maxsetvec *
414 sizeof (int));
415 if (setvec == 0 || tmpset == 0)
416 overflo("out of space in cfoll()");
417 }
418 for (i = 0; i <= f->accept; i++)
419 setvec[i] = 0;
420 setcnt = 0;
421 follow(v); /* computes setvec and setcnt */
422 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
423 overflo("out of space building follow set");
424 f->re[info(v)].lfollow = p;
425 *p = setcnt;
426 for (i = f->accept; i >= 0; i--) {
427 if (setvec[i] == 1)
428 *++p = i;
429 }
430 break;
431 UNARY
432 cfoll(f, left(v));
433 break;
434 case CAT:
435 case OR:
436 cfoll(f, left(v));
437 cfoll(f, right(v));
438 break;
439 default: /* can't happen */
440 FATAL("can't happen: unknown type %d in cfoll", type(v));
441 }
442 }
443
444 /*
445 * collects initially active leaves of p into setvec
446 * returns 0 or 1 depending on whether p matches empty string
447 */
448 static int
449 first(Node *p)
450 {
451 int b, lp;
452
453 switch (type(p)) {
454 ELEAF
455 LEAF
456 /* look for high-water mark of subscripts */
457 lp = info(p);
458 /* guessing here! */
459 while (setcnt >= maxsetvec || lp >= maxsetvec) {
460 maxsetvec *= 4;
461 setvec = (int *)realloc(setvec, maxsetvec *
462 sizeof (int));
463 tmpset = (int *)realloc(tmpset, maxsetvec *
464 sizeof (int));
465 if (setvec == 0 || tmpset == 0)
466 overflo("out of space in first()");
467 }
468 if (type(p) == EMPTYRE) {
469 setvec[lp] = 0;
470 return (0);
471 }
472 if (setvec[lp] != 1) {
473 setvec[lp] = 1;
474 setcnt++;
475 }
476 if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
477 return (0); /* empty CCL */
478 else
479 return (1);
480 case PLUS:
481 if (first(left(p)) == 0)
482 return (0);
483 return (1);
484 case STAR:
485 case QUEST:
486 (void) first(left(p));
487 return (0);
488 case CAT:
489 if (first(left(p)) == 0 && first(right(p)) == 0)
490 return (0);
491 return (1);
492 case OR:
493 b = first(right(p));
494 if (first(left(p)) == 0 || b == 0)
495 return (0);
496 return (1);
497 }
498 /* can't happen */
499 FATAL("can't happen: unknown type %d in first", type(p));
500 /*NOTREACHED*/
501 return (-1);
502 }
503
504 /* collects leaves that can follow v into setvec */
505 static void
506 follow(Node *v)
507 {
508 Node *p;
509
510 if (type(v) == FINAL)
511 return;
512 p = parent(v);
513 switch (type(p)) {
514 case STAR:
515 case PLUS:
516 (void) first(v);
517 follow(p);
518 return;
519
520 case OR:
521 case QUEST:
522 follow(p);
523 return;
524
525 case CAT:
526 if (v == left(p)) { /* v is left child of p */
527 if (first(right(p)) == 0) {
528 follow(p);
529 return;
530 }
531 } else /* v is right child */
532 follow(p);
533 return;
534 }
535 }
536
537 static int
538 member(int c, const uchar *sarg) /* is c in s? */
539 {
540 uchar *s = (uchar *)sarg;
541
542 while (*s)
543 if (c == *s++)
544 return (1);
545 return (0);
546 }
547
548
549 int
550 match(fa *f, const uchar *p0) /* shortest match ? */
551 {
552 int s, ns;
553 uchar *p = (uchar *)p0;
554
555 s = f->reset ? makeinit(f, 0) : f->initstat;
556 if (f->out[s])
557 return (1);
558 do {
559 /* assert(*p < NCHARS); */
560 if ((ns = f->gototab[s][*p]) != 0)
561 s = ns;
562 else
563 s = cgoto(f, s, *p);
564 if (f->out[s])
565 return (1);
566 } while (*p++ != 0);
567 return (0);
568 }
569
570 int
571 pmatch(fa *f, const uchar *p0) /* longest match, for sub */
572 {
573 int s, ns;
574 uchar *p = (uchar *)p0;
575 uchar *q;
576 int i, k;
577
578 /* s = f->reset ? makeinit(f,1) : f->initstat; */
579 if (f->reset) {
580 f->initstat = s = makeinit(f, 1);
581 } else {
582 s = f->initstat;
583 }
584 patbeg = p;
585 patlen = -1;
586 do {
587 q = p;
588 do {
589 if (f->out[s]) /* final state */
590 patlen = q-p;
591 /* assert(*q < NCHARS); */
592 if ((ns = f->gototab[s][*q]) != 0)
593 s = ns;
594 else
595 s = cgoto(f, s, *q);
596 if (s == 1) { /* no transition */
597 if (patlen >= 0) {
598 patbeg = p;
599 return (1);
600 } else
601 goto nextin; /* no match */
602 }
603 } while (*q++ != 0);
604 if (f->out[s])
605 patlen = q - p - 1; /* don't count $ */
606 if (patlen >= 0) {
607 patbeg = p;
608 return (1);
609 }
610 nextin:
611 s = 2;
612 if (f->reset) {
613 for (i = 2; i <= f->curstat; i++)
614 xfree(f->posns[i]);
615 k = *f->posns[0];
616 if ((f->posns[2] =
617 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
618 overflo("out of space in pmatch");
619 }
620 for (i = 0; i <= k; i++)
621 (f->posns[2])[i] = (f->posns[0])[i];
622 f->initstat = f->curstat = 2;
623 f->out[2] = f->out[0];
624 for (i = 0; i < NCHARS; i++)
625 f->gototab[2][i] = 0;
626 }
627 } while (*p++ != 0);
628 return (0);
629 }
630
631 int
632 nematch(fa *f, const uchar *p0) /* non-empty match, for sub */
633 {
634 int s, ns;
635 uchar *p = (uchar *)p0;
636 uchar *q;
637 int i, k;
638
639 /* s = f->reset ? makeinit(f,1) : f->initstat; */
640 if (f->reset) {
641 f->initstat = s = makeinit(f, 1);
642 } else {
643 s = f->initstat;
644 }
645 patlen = -1;
646 while (*p) {
647 q = p;
648 do {
649 if (f->out[s]) /* final state */
650 patlen = q - p;
651 /* assert(*q < NCHARS); */
652 if ((ns = f->gototab[s][*q]) != 0)
653 s = ns;
654 else
655 s = cgoto(f, s, *q);
656 if (s == 1) { /* no transition */
657 if (patlen > 0) {
658 patbeg = p;
659 return (1);
660 } else
661 goto nnextin; /* no nonempty match */
662 }
663 } while (*q++ != 0);
664 if (f->out[s])
665 patlen = q - p - 1; /* don't count $ */
666 if (patlen > 0) {
667 patbeg = p;
668 return (1);
669 }
670 nnextin:
671 s = 2;
672 if (f->reset) {
673 for (i = 2; i <= f->curstat; i++)
674 xfree(f->posns[i]);
675 k = *f->posns[0];
676 if ((f->posns[2] =
677 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
678 overflo("out of state space");
679 }
680 for (i = 0; i <= k; i++)
681 (f->posns[2])[i] = (f->posns[0])[i];
682 f->initstat = f->curstat = 2;
683 f->out[2] = f->out[0];
684 for (i = 0; i < NCHARS; i++)
685 f->gototab[2][i] = 0;
686 }
687 p++;
688 }
689 return (0);
690 }
691
692 static Node *
693 reparse(const uchar *p)
694 {
695 /* parses regular expression pointed to by p */
696 /* uses relex() to scan regular expression */
697 Node *np;
698
699 dprintf(("reparse <%s>\n", p));
700 /* prestr points to string to be parsed */
701 lastre = prestr = (uchar *)p;
702 rtok = relex();
703 /* GNU compatibility: an empty regexp matches anything */
704 if (rtok == '\0') {
705 /* FATAL("empty regular expression"); previous */
706 return (op2(EMPTYRE, NIL, NIL));
707 }
708 np = regexp();
709 if (rtok != '\0') {
710 FATAL("syntax error in regular expression %s at %s",
711 lastre, prestr);
712 }
713 return (np);
714 }
715
716 static Node *
717 regexp(void) /* top-level parse of reg expr */
718 {
719 return (alt(concat(primary())));
720 }
721
722 static Node *
723 primary(void)
724 {
725 Node *np;
726
727 switch (rtok) {
728 case CHAR:
729 np = op2(CHAR, NIL, itonp(rlxval));
730 rtok = relex();
731 return (unary(np));
732 case ALL:
733 rtok = relex();
734 return (unary(op2(ALL, NIL, NIL)));
735 case EMPTYRE:
736 rtok = relex();
737 return (unary(op2(ALL, NIL, NIL)));
738 case DOT:
739 rtok = relex();
740 return (unary(op2(DOT, NIL, NIL)));
741 case CCL:
742 /*LINTED align*/
743 np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
744 rtok = relex();
745 return (unary(np));
746 case NCCL:
747 /*LINTED align*/
748 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
749 rtok = relex();
750 return (unary(np));
751 case '^':
752 rtok = relex();
753 return (unary(op2(CHAR, NIL, itonp(HAT))));
754 case '$':
755 rtok = relex();
756 return (unary(op2(CHAR, NIL, NIL)));
757 case '(':
758 rtok = relex();
759 if (rtok == ')') { /* special pleading for () */
760 rtok = relex();
761 return (unary(op2(CCL, NIL,
762 /*LINTED align*/
763 (Node *)tostring((uchar *)""))));
764 }
765 np = regexp();
766 if (rtok == ')') {
767 rtok = relex();
768 return (unary(np));
769 } else {
770 FATAL("syntax error in regular expression %s at %s",
771 lastre, prestr);
772 }
773 default:
774 FATAL("illegal primary in regular expression %s at %s",
775 lastre, prestr);
776 }
777 /*NOTREACHED*/
778 return (NULL);
779 }
780
781 static Node *
782 concat(Node *np)
783 {
784 switch (rtok) {
785 case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL:
786 case '$': case '(':
787 return (concat(op2(CAT, np, primary())));
788 default:
789 return (np);
790 }
791 }
792
793 static Node *
794 alt(Node *np)
795 {
796 if (rtok == OR) {
797 rtok = relex();
798 return (alt(op2(OR, np, concat(primary()))));
799 }
800 return (np);
801 }
802
803 static Node *
804 unary(Node *np)
805 {
806 switch (rtok) {
807 case STAR:
808 rtok = relex();
809 return (unary(op2(STAR, np, NIL)));
810 case PLUS:
811 rtok = relex();
812 return (unary(op2(PLUS, np, NIL)));
813 case QUEST:
814 rtok = relex();
815 return (unary(op2(QUEST, np, NIL)));
816 default:
817 return (np);
818 }
819 }
820
821 /*
822 * Character class definitions conformant to the POSIX locale as
823 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
824 * and operating character sets are both ASCII (ISO646) or supersets
825 * thereof.
826 *
827 * Note that to avoid overflowing the temporary buffer used in
828 * relex(), the expanded character class (prior to range expansion)
829 * must be less than twice the size of their full name.
830 */
831
832 /*
833 * Because isblank doesn't show up in any of the header files on any
834 * system i use, it's defined here. if some other locale has a richer
835 * definition of "blank", define HAS_ISBLANK and provide your own
836 * version.
837 * the parentheses here are an attempt to find a path through the maze
838 * of macro definition and/or function and/or version provided. thanks
839 * to nelson beebe for the suggestion; let's see if it works everywhere.
840 */
841
842 /* #define HAS_ISBLANK */
843 #ifndef HAS_ISBLANK
844
845 int
846 (xisblank)(int c)
847 {
848 return (c == ' ' || c == '\t');
849 }
850
851 #endif
852
853 struct charclass {
854 const char *cc_name;
855 int cc_namelen;
856 int (*cc_func)(int);
857 } charclasses[] = {
858 { "alnum", 5, isalnum },
859 { "alpha", 5, isalpha },
860 #ifndef HAS_ISBLANK
861 { "blank", 5, isspace }, /* was isblank */
862 #else
863 { "blank", 5, isblank },
864 #endif
865 { "cntrl", 5, iscntrl },
866 { "digit", 5, isdigit },
867 { "graph", 5, isgraph },
868 { "lower", 5, islower },
869 { "print", 5, isprint },
870 { "punct", 5, ispunct },
871 { "space", 5, isspace },
872 { "upper", 5, isupper },
873 { "xdigit", 6, isxdigit },
874 { NULL, 0, NULL },
875 };
876
877 static int
878 relex(void) /* lexical analyzer for reparse */
879 {
880 int c, n;
881 int cflag;
882 uchar *buf = 0;
883 size_t bufsz = 100;
884 uchar *bp;
885 struct charclass *cc;
886 int i;
887
888 switch (c = *prestr++) {
889 case '|': return OR;
890 case '*': return STAR;
891 case '+': return PLUS;
892 case '?': return QUEST;
893 case '.': return DOT;
894 case '\0': prestr--; return '\0';
895 case '^':
896 case '$':
897 case '(':
898 case ')':
899 return (c);
900 case '\\':
901 rlxval = quoted(&prestr);
902 return (CHAR);
903 default:
904 rlxval = c;
905 return (CHAR);
906 case '[':
907 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
908 FATAL("out of space in reg expr %.10s..", lastre);
909 bp = buf;
910 if (*prestr == '^') {
911 cflag = 1;
912 prestr++;
913 } else
914 cflag = 0;
915 n = 2 * strlen((const char *) prestr)+1;
916 if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
917 FATAL("out of space for reg expr %.10s...", lastre);
918 for (;;) {
919 if ((c = *prestr++) == '\\') {
920 *bp++ = '\\';
921 if ((c = *prestr++) == '\0') {
922 FATAL(
923 "nonterminated character class %s", lastre);
924 }
925 *bp++ = c;
926 } else if (c == '[' && *prestr == ':') {
927 /*
928 * POSIX char class names, Dag-Erling
929 * Smorgrav, des@ofug.org
930 */
931 for (cc = charclasses; cc->cc_name; cc++)
932 if (strncmp((const char *) prestr + 1,
933 (const char *) cc->cc_name,
934 cc->cc_namelen) == 0)
935 break;
936 if (cc->cc_name != NULL &&
937 prestr[1 + cc->cc_namelen] == ':' &&
938 prestr[2 + cc->cc_namelen] == ']') {
939 prestr += cc->cc_namelen + 3;
940 for (i = 0; i < NCHARS; i++) {
941 if (!adjbuf(&buf, &bufsz,
942 bp - buf + 1, 100, &bp,
943 "relex2"))
944 FATAL(
945 "out of space for reg expr %.10s...", lastre);
946 if (cc->cc_func(i)) {
947 *bp++ = i;
948 n++;
949 }
950 }
951 } else
952 *bp++ = c;
953 } else if (c == '\0') {
954 FATAL("nonterminated character class %.20s",
955 lastre);
956 } else if (bp == buf) { /* 1st char is special */
957 *bp++ = c;
958 } else if (c == ']') {
959 *bp++ = 0;
960 rlxstr = tostring(buf);
961 if (cflag == 0)
962 return (CCL);
963 else
964 return (NCCL);
965 } else
966 *bp++ = c;
967 }
968 /*NOTREACHED*/
969 }
970 return (0);
971 }
972
973 static int
974 cgoto(fa *f, int s, int c)
975 {
976 int i, j, k;
977 int *p, *q;
978
979 assert(c == HAT || c < NCHARS);
980 while (f->accept >= maxsetvec) { /* guessing here! */
981 maxsetvec *= 4;
982 setvec = (int *)realloc(setvec, maxsetvec * sizeof (int));
983 tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int));
984 if (setvec == 0 || tmpset == 0)
985 overflo("out of space in cgoto()");
986 }
987 for (i = 0; i <= f->accept; i++)
988 setvec[i] = 0;
989 setcnt = 0;
990 /* compute positions of gototab[s,c] into setvec */
991 p = f->posns[s];
992 for (i = 1; i <= *p; i++) {
993 if ((k = f->re[p[i]].ltype) != FINAL) {
994 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) ||
995 (k == DOT && c != 0 && c != HAT) ||
996 (k == ALL && c != 0) ||
997 (k == EMPTYRE && c != 0) ||
998 (k == CCL &&
999 member(c, (uchar *)f->re[p[i]].lval.up)) ||
1000 ((k == NCCL &&
1001 !member(c, (uchar *)f->re[p[i]].lval.up)) &&
1002 c != 0 && c != HAT)) {
1003 q = f->re[p[i]].lfollow;
1004 for (j = 1; j <= *q; j++) {
1005 if (q[j] >= maxsetvec) {
1006 maxsetvec *= 4;
1007 setvec = (int *)realloc(setvec,
1008 maxsetvec * sizeof (int));
1009 tmpset = (int *)realloc(tmpset,
1010 maxsetvec * sizeof (int));
1011 if (setvec == 0 ||
1012 tmpset == 0)
1013 overflo(
1014 "cgoto overflow");
1015 }
1016 if (setvec[q[j]] == 0) {
1017 setcnt++;
1018 setvec[q[j]] = 1;
1019 }
1020 }
1021 }
1022 }
1023 }
1024 /* determine if setvec is a previous state */
1025 tmpset[0] = setcnt;
1026 j = 1;
1027 for (i = f->accept; i >= 0; i--)
1028 if (setvec[i]) {
1029 tmpset[j++] = i;
1030 }
1031 /* tmpset == previous state? */
1032 for (i = 1; i <= f->curstat; i++) {
1033 p = f->posns[i];
1034 if ((k = tmpset[0]) != p[0])
1035 goto different;
1036 for (j = 1; j <= k; j++)
1037 if (tmpset[j] != p[j])
1038 goto different;
1039 /* setvec is state i */
1040 f->gototab[s][c] = i;
1041 return (i);
1042 different:;
1043 }
1044
1045 /* add tmpset to current set of states */
1046 if (f->curstat >= NSTATES - 1) {
1047 f->curstat = 2;
1048 f->reset = 1;
1049 for (i = 2; i < NSTATES; i++)
1050 xfree(f->posns[i]);
1051 } else
1052 ++(f->curstat);
1053 for (i = 0; i < NCHARS; i++)
1054 f->gototab[f->curstat][i] = 0;
1055 xfree(f->posns[f->curstat]);
1056 if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
1057 overflo("out of space in cgoto");
1058
1059 f->posns[f->curstat] = p;
1060 f->gototab[s][c] = f->curstat;
1061 for (i = 0; i <= setcnt; i++)
1062 p[i] = tmpset[i];
1063 if (setvec[f->accept])
1064 f->out[f->curstat] = 1;
1065 else
1066 f->out[f->curstat] = 0;
1067 return (f->curstat);
1068 }
1069
1070 static void
1071 freefa(fa *f) /* free a finite automaton */
1072 {
1073 int i;
1074
1075 if (f == NULL)
1076 return;
1077 for (i = 0; i <= f->curstat; i++)
1078 xfree(f->posns[i]);
1079 for (i = 0; i <= f->accept; i++) {
1080 xfree(f->re[i].lfollow);
1081 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1082 xfree((f->re[i].lval.np));
1083 }
1084 xfree(f->restr);
1085 xfree(f);
1086 }
|