Print this page
3731 Update nawk to version 20121220
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/awk/b.c
+++ new/usr/src/cmd/awk/b.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License, Version 1.0 only
6 6 * (the "License"). You may not use this file except in compliance
7 7 * with the License.
8 8 *
9 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 10 * or http://www.opensolaris.org/os/licensing.
11 11 * See the License for the specific language governing permissions
12 12 * and limitations under the License.
13 13 *
14 14 * When distributing Covered Code, include this CDDL HEADER in each
15 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 16 * If applicable, add the following below this CDDL HEADER, with the
17 17 * fields enclosed by brackets "[]" replaced with your own identifying
↓ open down ↓ |
17 lines elided |
↑ open up ↑ |
18 18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 19 *
20 20 * CDDL HEADER END
21 21 */
22 22
23 23 /*
24 24 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
25 25 * Use is subject to license terms.
26 26 */
27 27
28 -/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 -/* All Rights Reserved */
30 -
31 -#pragma ident "%Z%%M% %I% %E% SMI"
28 +/*
29 + * Copyright (C) Lucent Technologies 1997
30 + * All Rights Reserved
31 + *
32 + * Permission to use, copy, modify, and distribute this software and
33 + * its documentation for any purpose and without fee is hereby
34 + * granted, provided that the above copyright notice appear in all
35 + * copies and that both that the copyright notice and this
36 + * permission notice and warranty disclaimer appear in supporting
37 + * documentation, and that the name Lucent Technologies or any of
38 + * its entities not be used in advertising or publicity pertaining
39 + * to distribution of the software without specific, written prior
40 + * permission.
41 + *
42 + * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
43 + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
44 + * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
45 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
46 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
47 + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
48 + * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
49 + * THIS SOFTWARE.
50 + */
32 51
33 52 #define DEBUG
34 53
35 54 #include "awk.h"
36 55 #include "y.tab.h"
37 56
38 -#define HAT (NCHARS-1) /* matches ^ in regular expr */
57 +#define HAT (NCHARS+2) /* matches ^ in regular expr */
39 58 /* NCHARS is 2**n */
40 -#define MAXLIN (3 * LINE_MAX)
59 +#define MAXLIN 22
41 60
42 -#define type(v) (v)->nobj
61 +#define type(v) (v)->nobj /* badly overloaded here */
62 +#define info(v) (v)->ntype /* badly overloaded here */
43 63 #define left(v) (v)->narg[0]
44 64 #define right(v) (v)->narg[1]
45 65 #define parent(v) (v)->nnext
46 66
47 67 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
68 +#define ELEAF case EMPTYRE: /* empty string in regexp */
48 69 #define UNARY case STAR: case PLUS: case QUEST:
49 70
50 71 /*
51 72 * encoding in tree Nodes:
52 73 * leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
53 74 * left is index, right contains value or pointer to value
54 75 * unary (STAR, PLUS, QUEST): left is child, right is null
55 76 * binary (CAT, OR): left and right are children
56 77 * parent contains pointer to parent
57 78 */
58 79
59 -int setvec[MAXLIN];
60 -int tmpset[MAXLIN];
61 -Node *point[MAXLIN];
80 +int *setvec;
81 +int *tmpset;
82 +int maxsetvec = 0;
62 83
63 84 int rtok; /* next token in current re */
64 85 int rlxval;
65 -uchar *rlxstr;
66 -uchar *prestr; /* current position in current re */
67 -uchar *lastre; /* origin of last re */
86 +static uchar *rlxstr;
87 +static uchar *prestr; /* current position in current re */
88 +static uchar *lastre; /* origin of last re */
68 89
69 90 static int setcnt;
70 91 static int poscnt;
71 92
72 93 uchar *patbeg;
73 94 int patlen;
74 95
75 96 #define NFA 20 /* cache this many dynamic fa's */
76 97 fa *fatab[NFA];
77 98 int nfatab = 0; /* entries in fatab */
78 99
79 -static fa *mkdfa(uchar *, int);
100 +static fa *mkdfa(const uchar *, int);
80 101 static int makeinit(fa *, int);
81 102 static void penter(Node *);
82 103 static void freetr(Node *);
83 -static void overflo(char *);
104 +static void overflo(const char *);
84 105 static void cfoll(fa *, Node *);
85 106 static void follow(Node *);
86 -static Node *reparse(uchar *);
107 +static Node *reparse(const uchar *);
87 108 static int relex(void);
88 109 static void freefa(fa *);
89 110 static int cgoto(fa *, int, int);
111 +static Node *regexp(void);
112 +static Node *primary(void);
113 +static Node *concat(Node *);
114 +static Node *alt(Node *);
115 +static Node *unary(Node *);
90 116
91 117 fa *
92 -makedfa(uchar *s, int anchor) /* returns dfa for reg expr s */
118 +makedfa(const uchar *s, int anchor) /* returns dfa for reg expr s */
93 119 {
94 120 int i, use, nuse;
95 121 fa *pfa;
122 + static int now = 1;
123 +
124 + if (setvec == 0) { /* first time through any RE */
125 + maxsetvec = MAXLIN;
126 + setvec = (int *)malloc(maxsetvec * sizeof (int));
127 + tmpset = (int *)malloc(maxsetvec * sizeof (int));
128 + if (setvec == 0 || tmpset == 0)
129 + overflo("out of space initializing makedfa");
130 + }
96 131
97 132 if (compile_time) /* a constant for sure */
98 133 return (mkdfa(s, anchor));
99 134 for (i = 0; i < nfatab; i++) { /* is it there already? */
100 135 if (fatab[i]->anchor == anchor &&
101 - strcmp((char *)fatab[i]->restr, (char *)s) == 0) {
102 - fatab[i]->use++;
136 + strcmp((const char *)fatab[i]->restr, (char *)s) == 0) {
137 + fatab[i]->use = now++;
103 138 return (fatab[i]);
104 139 }
105 140 }
106 141 pfa = mkdfa(s, anchor);
107 142 if (nfatab < NFA) { /* room for another */
108 143 fatab[nfatab] = pfa;
109 - fatab[nfatab]->use = 1;
144 + fatab[nfatab]->use = now++;
110 145 nfatab++;
111 146 return (pfa);
112 147 }
113 148 use = fatab[0]->use; /* replace least-recently used */
114 149 nuse = 0;
115 - for (i = 1; i < nfatab; i++)
150 + for (i = 1; i < nfatab; i++) {
116 151 if (fatab[i]->use < use) {
117 152 use = fatab[i]->use;
118 153 nuse = i;
119 154 }
155 + }
120 156 freefa(fatab[nuse]);
121 157 fatab[nuse] = pfa;
122 - pfa->use = 1;
158 + pfa->use = now++;
123 159 return (pfa);
124 160 }
125 161
126 162 fa *
127 -mkdfa(uchar *s, int anchor) /* does the real work of making a dfa */
128 - /* anchor = 1 for anchored matches, else 0 */
129 -{
163 +mkdfa(const uchar *s, int anchor)
164 +{ /* does the real work of making a dfa */
165 + /* anchor = 1 for anchored matches, else 0 */
130 166 Node *p, *p1;
131 167 fa *f;
132 168
133 169 p = reparse(s);
134 170 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
135 171 /* put ALL STAR in front of reg. exp. */
136 172 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
137 173 /* put FINAL after reg. exp. */
138 174
139 175 poscnt = 0;
140 176 penter(p1); /* enter parent pointers and leaf indices */
141 177 if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
142 - overflo("no room for fa");
178 + overflo("out of space for fa");
143 179 /* penter has computed number of positions in re */
144 180 f->accept = poscnt-1;
145 181 cfoll(f, p1); /* set up follow sets */
146 182 freetr(p1);
147 183 if ((f->posns[0] =
148 184 (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
149 185 overflo("out of space in makedfa");
150 186 }
151 187 if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
152 188 overflo("out of space in makedfa");
153 189 *f->posns[1] = 0;
154 190 f->initstat = makeinit(f, anchor);
155 191 f->anchor = anchor;
156 192 f->restr = tostring(s);
157 193 return (f);
158 194 }
159 195
160 196 static int
161 197 makeinit(fa *f, int anchor)
162 198 {
163 - register int i, k;
199 + int i, k;
164 200
165 201 f->curstat = 2;
166 202 f->out[2] = 0;
167 203 f->reset = 0;
168 204 k = *(f->re[0].lfollow);
169 205 xfree(f->posns[2]);
170 206 if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
171 207 overflo("out of space in makeinit");
172 208 for (i = 0; i <= k; i++) {
173 209 (f->posns[2])[i] = (f->re[0].lfollow)[i];
174 210 }
175 211 if ((f->posns[2])[1] == f->accept)
176 212 f->out[2] = 1;
177 213 for (i = 0; i < NCHARS; i++)
178 214 f->gototab[2][i] = 0;
179 215 f->curstat = cgoto(f, 2, HAT);
180 216 if (anchor) {
181 217 *f->posns[2] = k-1; /* leave out position 0 */
182 218 for (i = 0; i < k; i++) {
183 219 (f->posns[0])[i] = (f->posns[2])[i];
184 220 }
185 221
186 222 f->out[0] = f->out[2];
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
187 223 if (f->curstat != 2)
188 224 --(*f->posns[f->curstat]);
189 225 }
190 226 return (f->curstat);
191 227 }
192 228
193 229 void
194 230 penter(Node *p) /* set up parent pointers and leaf indices */
195 231 {
196 232 switch (type(p)) {
233 + ELEAF
197 234 LEAF
198 - left(p) = (Node *) poscnt;
199 - point[poscnt++] = p;
235 + info(p) = poscnt;
236 + poscnt++;
200 237 break;
201 238 UNARY
202 239 penter(left(p));
203 240 parent(left(p)) = p;
204 241 break;
205 242 case CAT:
206 243 case OR:
207 244 penter(left(p));
208 245 penter(right(p));
209 246 parent(left(p)) = p;
210 247 parent(right(p)) = p;
211 248 break;
212 - default:
213 - ERROR "unknown type %d in penter", type(p) FATAL;
249 + default: /* can't happen */
250 + FATAL("can't happen: unknown type %d in penter", type(p));
214 251 break;
215 252 }
216 253 }
217 254
218 255 static void
219 256 freetr(Node *p) /* free parse tree */
220 257 {
221 258 switch (type(p)) {
259 + ELEAF
222 260 LEAF
223 261 xfree(p);
224 262 break;
225 263 UNARY
226 264 freetr(left(p));
227 265 xfree(p);
228 266 break;
229 267 case CAT:
230 268 case OR:
231 269 freetr(left(p));
232 270 freetr(right(p));
233 271 xfree(p);
234 272 break;
235 - default:
236 - ERROR "unknown type %d in freetr", type(p) FATAL;
273 + default: /* can't happen */
274 + FATAL("can't happen: unknown type %d in freetr", type(p));
237 275 break;
238 276 }
239 277 }
240 278
279 +/*
280 + * in the parsing of regular expressions, metacharacters like . have
281 + * to be seen literally; \056 is not a metacharacter.
282 + */
283 +int
284 +hexstr(uchar **pp) /* find and eval hex string at pp, return new p */
285 +{ /* only pick up one 8-bit byte (2 chars) */
286 + uchar *p;
287 + int n = 0;
288 + int i;
289 +
290 + for (i = 0, p = (uchar *)*pp; i < 2 && isxdigit(*p); i++, p++) {
291 + if (isdigit(*p))
292 + n = 16 * n + *p - '0';
293 + else if (*p >= 'a' && *p <= 'f')
294 + n = 16 * n + *p - 'a' + 10;
295 + else if (*p >= 'A' && *p <= 'F')
296 + n = 16 * n + *p - 'A' + 10;
297 + }
298 + *pp = (uchar *)p;
299 + return (n);
300 +}
301 +
302 +/* multiple use of arg */
303 +#define isoctdigit(c) ((c) >= '0' && (c) <= '7')
304 +
305 +int
306 +quoted(uchar **pp) /* pick up next thing after a \\ */
307 +{ /* and increment *pp */
308 +
309 + uchar *p = *pp;
310 + int c;
311 +
312 + if ((c = *p++) == 't')
313 + c = '\t';
314 + else if (c == 'n')
315 + c = '\n';
316 + else if (c == 'f')
317 + c = '\f';
318 + else if (c == 'r')
319 + c = '\r';
320 + else if (c == 'b')
321 + c = '\b';
322 + else if (c == '\\')
323 + c = '\\';
324 + else if (c == 'x') { /* hexadecimal goo follows */
325 + c = hexstr(&p); /* this adds a null if number is invalid */
326 + } else if (isoctdigit(c)) { /* \d \dd \ddd */
327 + int n = c - '0';
328 + if (isoctdigit(*p)) {
329 + n = 8 * n + *p++ - '0';
330 + if (isoctdigit(*p))
331 + n = 8 * n + *p++ - '0';
332 + }
333 + c = n;
334 + } /* else */
335 + /* c = c; */
336 + *pp = p;
337 + return (c);
338 +}
339 +
241 340 uchar *
242 -cclenter(uchar *p)
341 +cclenter(const uchar *argp) /* add a character class */
243 342 {
244 - register int i, c;
245 - uchar *op, *chars, *ret;
246 - size_t bsize;
343 + int i, c, c2;
344 + uchar *p = (uchar *)argp;
345 + uchar *op, *bp;
346 + static uchar *buf = NULL;
347 + size_t bufsz = 100;
247 348
248 - init_buf(&chars, &bsize, LINE_INCR);
249 349 op = p;
250 - i = 0;
251 - while ((c = *p++) != 0) {
350 + if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
351 + FATAL("out of space for character class [%.10s...] 1", p);
352 + bp = buf;
353 + for (i = 0; (c = *p++) != 0; ) {
252 354 if (c == '\\') {
253 - if ((c = *p++) == 't')
254 - c = '\t';
255 - else if (c == 'n')
256 - c = '\n';
257 - else if (c == 'f')
258 - c = '\f';
259 - else if (c == 'r')
260 - c = '\r';
261 - else if (c == 'b')
262 - c = '\b';
263 - else if (c == '\\')
264 - c = '\\';
265 - else if (isdigit(c)) {
266 - int n = c - '0';
267 - if (isdigit(*p)) {
268 - n = 8 * n + *p++ - '0';
269 - if (isdigit(*p))
270 - n = 8 * n + *p++ - '0';
271 - }
272 - c = n;
273 - } /* else */
274 - /* c = c; */
275 - } else if (c == '-' && i > 0 && chars[i-1] != 0) {
355 + c = quoted(&p);
356 + } else if (c == '-' && i > 0 && bp[-1] != 0) {
276 357 if (*p != 0) {
277 - c = chars[i-1];
278 - while ((uchar)c < *p) { /* fails if *p is \\ */
279 - expand_buf(&chars, &bsize, i);
280 - chars[i++] = ++c;
358 + c = bp[-1];
359 + c2 = *p++;
360 + if (c2 == '\\')
361 + c2 = quoted(&p);
362 + if (c > c2) { /* empty; ignore */
363 + bp--;
364 + i--;
365 + continue;
366 + }
367 + while (c < c2) {
368 + if (adjbuf(&buf, &bufsz,
369 + bp - buf + 2, 100, &bp,
370 + "cclenter1") == 0)
371 + FATAL(
372 + "out of space for character class [%.10s...] 2", p);
373 + *bp++ = ++c;
374 + i++;
281 375 }
282 - p++;
283 376 continue;
284 377 }
285 378 }
286 - expand_buf(&chars, &bsize, i);
287 - chars[i++] = c;
379 + if (!adjbuf(&buf, &bufsz, bp-buf+2, 100, &bp, "cclenter2"))
380 + FATAL(
381 + "out of space for character class [%.10s...] 3", p);
382 + *bp++ = c;
383 + i++;
288 384 }
289 - chars[i++] = '\0';
290 - dprintf(("cclenter: in = |%s|, out = |%s|\n", op, chars));
385 + *bp = 0;
386 + dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf));
291 387 xfree(op);
292 - ret = tostring(chars);
293 - free(chars);
294 - return (ret);
388 + return (tostring(buf));
295 389 }
296 390
297 391 static void
298 -overflo(char *s)
392 +overflo(const char *s)
299 393 {
300 - ERROR "regular expression too big: %s", gettext((char *)s) FATAL;
394 + FATAL("regular expression too big: %.30s...", gettext((char *)s));
301 395 }
302 396
303 397 /* enter follow set of each leaf of vertex v into lfollow[leaf] */
304 398 static void
305 399 cfoll(fa *f, Node *v)
306 400 {
307 - register int i;
308 - register int *p;
401 + int i;
402 + int *p;
309 403
310 404 switch (type(v)) {
405 + ELEAF
311 406 LEAF
312 - f->re[(int)left(v)].ltype = type(v);
313 - f->re[(int)left(v)].lval = (int)right(v);
407 + f->re[info(v)].ltype = type(v);
408 + f->re[info(v)].lval.np = right(v);
409 + while (f->accept >= maxsetvec) { /* guessing here! */
410 + maxsetvec *= 4;
411 + setvec = (int *)realloc(setvec, maxsetvec *
412 + sizeof (int));
413 + tmpset = (int *)realloc(tmpset, maxsetvec *
414 + sizeof (int));
415 + if (setvec == 0 || tmpset == 0)
416 + overflo("out of space in cfoll()");
417 + }
314 418 for (i = 0; i <= f->accept; i++)
315 419 setvec[i] = 0;
316 420 setcnt = 0;
317 421 follow(v); /* computes setvec and setcnt */
318 422 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
319 - overflo("follow set overflow");
320 - f->re[(int)left(v)].lfollow = p;
423 + overflo("out of space building follow set");
424 + f->re[info(v)].lfollow = p;
321 425 *p = setcnt;
322 426 for (i = f->accept; i >= 0; i--) {
323 427 if (setvec[i] == 1)
324 428 *++p = i;
325 429 }
326 430 break;
327 431 UNARY
328 432 cfoll(f, left(v));
329 433 break;
330 434 case CAT:
331 435 case OR:
332 436 cfoll(f, left(v));
333 437 cfoll(f, right(v));
334 438 break;
335 - default:
336 - ERROR "unknown type %d in cfoll", type(v) FATAL;
439 + default: /* can't happen */
440 + FATAL("can't happen: unknown type %d in cfoll", type(v));
337 441 }
338 442 }
339 443
340 444 /*
341 445 * collects initially active leaves of p into setvec
342 446 * returns 0 or 1 depending on whether p matches empty string
343 447 */
344 448 static int
345 449 first(Node *p)
346 450 {
347 - register int b;
451 + int b, lp;
348 452
349 453 switch (type(p)) {
454 + ELEAF
350 455 LEAF
351 - if (setvec[(int)left(p)] != 1) {
352 - setvec[(int)left(p)] = 1;
456 + /* look for high-water mark of subscripts */
457 + lp = info(p);
458 + /* guessing here! */
459 + while (setcnt >= maxsetvec || lp >= maxsetvec) {
460 + maxsetvec *= 4;
461 + setvec = (int *)realloc(setvec, maxsetvec *
462 + sizeof (int));
463 + tmpset = (int *)realloc(tmpset, maxsetvec *
464 + sizeof (int));
465 + if (setvec == 0 || tmpset == 0)
466 + overflo("out of space in first()");
467 + }
468 + if (type(p) == EMPTYRE) {
469 + setvec[lp] = 0;
470 + return (0);
471 + }
472 + if (setvec[lp] != 1) {
473 + setvec[lp] = 1;
353 474 setcnt++;
354 475 }
355 476 if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
356 477 return (0); /* empty CCL */
357 478 else
358 479 return (1);
359 480 case PLUS:
360 481 if (first(left(p)) == 0)
361 482 return (0);
362 483 return (1);
363 484 case STAR:
364 485 case QUEST:
365 486 (void) first(left(p));
366 487 return (0);
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
367 488 case CAT:
368 489 if (first(left(p)) == 0 && first(right(p)) == 0)
369 490 return (0);
370 491 return (1);
371 492 case OR:
372 493 b = first(right(p));
373 494 if (first(left(p)) == 0 || b == 0)
374 495 return (0);
375 496 return (1);
376 497 }
377 - ERROR "unknown type %d in first", type(p) FATAL;
498 + /* can't happen */
499 + FATAL("can't happen: unknown type %d in first", type(p));
500 + /*NOTREACHED*/
378 501 return (-1);
379 502 }
380 503
381 504 /* collects leaves that can follow v into setvec */
382 505 static void
383 506 follow(Node *v)
384 507 {
385 508 Node *p;
386 509
387 510 if (type(v) == FINAL)
388 511 return;
389 512 p = parent(v);
390 513 switch (type(p)) {
391 514 case STAR:
392 515 case PLUS:
393 516 (void) first(v);
394 517 follow(p);
395 518 return;
396 519
397 520 case OR:
398 521 case QUEST:
399 522 follow(p);
400 523 return;
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
401 524
402 525 case CAT:
403 526 if (v == left(p)) { /* v is left child of p */
404 527 if (first(right(p)) == 0) {
405 528 follow(p);
406 529 return;
407 530 }
408 531 } else /* v is right child */
409 532 follow(p);
410 533 return;
411 - default:
412 - ERROR "unknown type %d in follow", type(p) FATAL;
413 - break;
414 534 }
415 535 }
416 536
417 537 static int
418 -member(uchar c, uchar *s) /* is c in s? */
538 +member(int c, const uchar *sarg) /* is c in s? */
419 539 {
540 + uchar *s = (uchar *)sarg;
541 +
420 542 while (*s)
421 543 if (c == *s++)
422 544 return (1);
423 545 return (0);
424 546 }
425 547
426 548
427 549 int
428 -match(fa *f, uchar *p)
550 +match(fa *f, const uchar *p0) /* shortest match ? */
429 551 {
430 - register int s, ns;
552 + int s, ns;
553 + uchar *p = (uchar *)p0;
431 554
432 555 s = f->reset ? makeinit(f, 0) : f->initstat;
433 556 if (f->out[s])
434 557 return (1);
435 558 do {
559 + /* assert(*p < NCHARS); */
436 560 if ((ns = f->gototab[s][*p]) != 0)
437 561 s = ns;
438 562 else
439 563 s = cgoto(f, s, *p);
440 564 if (f->out[s])
441 565 return (1);
442 566 } while (*p++ != 0);
443 567 return (0);
444 568 }
445 569
446 570 int
447 -pmatch(fa *f, uchar *p)
571 +pmatch(fa *f, const uchar *p0) /* longest match, for sub */
448 572 {
449 - register int s, ns;
450 - register uchar *q;
573 + int s, ns;
574 + uchar *p = (uchar *)p0;
575 + uchar *q;
451 576 int i, k;
452 577
578 + /* s = f->reset ? makeinit(f,1) : f->initstat; */
453 579 if (f->reset) {
454 580 f->initstat = s = makeinit(f, 1);
455 581 } else {
456 582 s = f->initstat;
457 583 }
458 584 patbeg = p;
459 585 patlen = -1;
460 586 do {
461 587 q = p;
462 588 do {
463 589 if (f->out[s]) /* final state */
464 590 patlen = q-p;
591 + /* assert(*q < NCHARS); */
465 592 if ((ns = f->gototab[s][*q]) != 0)
466 593 s = ns;
467 594 else
468 595 s = cgoto(f, s, *q);
469 596 if (s == 1) { /* no transition */
470 597 if (patlen >= 0) {
471 598 patbeg = p;
472 599 return (1);
473 600 } else
474 601 goto nextin; /* no match */
475 602 }
476 603 } while (*q++ != 0);
477 604 if (f->out[s])
478 605 patlen = q - p - 1; /* don't count $ */
479 606 if (patlen >= 0) {
480 607 patbeg = p;
481 608 return (1);
482 609 }
483 - nextin:
610 +nextin:
484 611 s = 2;
485 612 if (f->reset) {
486 613 for (i = 2; i <= f->curstat; i++)
487 614 xfree(f->posns[i]);
488 615 k = *f->posns[0];
489 616 if ((f->posns[2] =
490 617 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
491 618 overflo("out of space in pmatch");
492 619 }
493 620 for (i = 0; i <= k; i++)
494 621 (f->posns[2])[i] = (f->posns[0])[i];
↓ open down ↓ |
1 lines elided |
↑ open up ↑ |
495 622 f->initstat = f->curstat = 2;
496 623 f->out[2] = f->out[0];
497 624 for (i = 0; i < NCHARS; i++)
498 625 f->gototab[2][i] = 0;
499 626 }
500 627 } while (*p++ != 0);
501 628 return (0);
502 629 }
503 630
504 631 int
505 -nematch(fa *f, uchar *p)
632 +nematch(fa *f, const uchar *p0) /* non-empty match, for sub */
506 633 {
507 - register int s, ns;
508 - register uchar *q;
634 + int s, ns;
635 + uchar *p = (uchar *)p0;
636 + uchar *q;
509 637 int i, k;
510 638
639 + /* s = f->reset ? makeinit(f,1) : f->initstat; */
511 640 if (f->reset) {
512 641 f->initstat = s = makeinit(f, 1);
513 642 } else {
514 643 s = f->initstat;
515 644 }
516 645 patlen = -1;
517 646 while (*p) {
518 647 q = p;
519 648 do {
520 649 if (f->out[s]) /* final state */
521 - patlen = q-p;
650 + patlen = q - p;
651 + /* assert(*q < NCHARS); */
522 652 if ((ns = f->gototab[s][*q]) != 0)
523 653 s = ns;
524 654 else
525 655 s = cgoto(f, s, *q);
526 656 if (s == 1) { /* no transition */
527 657 if (patlen > 0) {
528 658 patbeg = p;
529 659 return (1);
530 660 } else
531 661 goto nnextin; /* no nonempty match */
532 662 }
533 663 } while (*q++ != 0);
534 664 if (f->out[s])
535 665 patlen = q - p - 1; /* don't count $ */
536 666 if (patlen > 0) {
537 667 patbeg = p;
538 668 return (1);
539 669 }
540 - nnextin:
670 +nnextin:
541 671 s = 2;
542 672 if (f->reset) {
543 673 for (i = 2; i <= f->curstat; i++)
544 674 xfree(f->posns[i]);
545 675 k = *f->posns[0];
546 676 if ((f->posns[2] =
547 677 (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
548 678 overflo("out of state space");
549 679 }
550 680 for (i = 0; i <= k; i++)
551 681 (f->posns[2])[i] = (f->posns[0])[i];
↓ open down ↓ |
1 lines elided |
↑ open up ↑ |
552 682 f->initstat = f->curstat = 2;
553 683 f->out[2] = f->out[0];
554 684 for (i = 0; i < NCHARS; i++)
555 685 f->gototab[2][i] = 0;
556 686 }
557 687 p++;
558 688 }
559 689 return (0);
560 690 }
561 691
562 -static Node *regexp(void), *primary(void), *concat(Node *);
563 -static Node *alt(Node *), *unary(Node *);
564 -
565 692 static Node *
566 -reparse(uchar *p)
693 +reparse(const uchar *p)
567 694 {
568 695 /* parses regular expression pointed to by p */
569 696 /* uses relex() to scan regular expression */
570 697 Node *np;
571 698
572 699 dprintf(("reparse <%s>\n", p));
573 - lastre = prestr = p; /* prestr points to string to be parsed */
700 + /* prestr points to string to be parsed */
701 + lastre = prestr = (uchar *)p;
574 702 rtok = relex();
575 - if (rtok == '\0')
576 - ERROR "empty regular expression" FATAL;
577 - np = regexp();
703 + /* GNU compatibility: an empty regexp matches anything */
578 704 if (rtok == '\0') {
579 - return (np);
580 - } else {
581 - ERROR "syntax error in regular expression %s at %s",
582 - lastre, prestr FATAL;
705 + /* FATAL("empty regular expression"); previous */
706 + return (op2(EMPTYRE, NIL, NIL));
583 707 }
584 - /*NOTREACHED*/
585 - return (NULL);
708 + np = regexp();
709 + if (rtok != '\0') {
710 + FATAL("syntax error in regular expression %s at %s",
711 + lastre, prestr);
712 + }
713 + return (np);
586 714 }
587 715
588 716 static Node *
589 -regexp(void)
717 +regexp(void) /* top-level parse of reg expr */
590 718 {
591 719 return (alt(concat(primary())));
592 720 }
593 721
594 722 static Node *
595 723 primary(void)
596 724 {
597 725 Node *np;
598 726
599 727 switch (rtok) {
600 728 case CHAR:
601 - np = op2(CHAR, NIL, (Node *)rlxval);
729 + np = op2(CHAR, NIL, itonp(rlxval));
602 730 rtok = relex();
603 731 return (unary(np));
604 732 case ALL:
605 733 rtok = relex();
606 734 return (unary(op2(ALL, NIL, NIL)));
735 + case EMPTYRE:
736 + rtok = relex();
737 + return (unary(op2(ALL, NIL, NIL)));
607 738 case DOT:
608 739 rtok = relex();
609 740 return (unary(op2(DOT, NIL, NIL)));
610 741 case CCL:
611 742 /*LINTED align*/
612 743 np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
613 744 rtok = relex();
614 745 return (unary(np));
615 746 case NCCL:
616 747 /*LINTED align*/
617 748 np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
618 749 rtok = relex();
619 750 return (unary(np));
620 751 case '^':
621 752 rtok = relex();
622 - return (unary(op2(CHAR, NIL, (Node *)HAT)));
753 + return (unary(op2(CHAR, NIL, itonp(HAT))));
623 754 case '$':
624 755 rtok = relex();
625 756 return (unary(op2(CHAR, NIL, NIL)));
626 757 case '(':
627 758 rtok = relex();
628 759 if (rtok == ')') { /* special pleading for () */
629 760 rtok = relex();
630 761 return (unary(op2(CCL, NIL,
631 762 /*LINTED align*/
632 763 (Node *)tostring((uchar *)""))));
633 764 }
634 765 np = regexp();
635 766 if (rtok == ')') {
636 767 rtok = relex();
637 768 return (unary(np));
638 769 } else {
639 - ERROR "syntax error in regular expression %s at %s",
640 - lastre, prestr FATAL;
770 + FATAL("syntax error in regular expression %s at %s",
771 + lastre, prestr);
641 772 }
642 773 default:
643 - ERROR "illegal primary in regular expression %s at %s",
644 - lastre, prestr FATAL;
774 + FATAL("illegal primary in regular expression %s at %s",
775 + lastre, prestr);
645 776 }
646 777 /*NOTREACHED*/
647 778 return (NULL);
648 779 }
649 780
650 781 static Node *
651 782 concat(Node *np)
652 783 {
653 784 switch (rtok) {
654 - case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
785 + case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL:
786 + case '$': case '(':
655 787 return (concat(op2(CAT, np, primary())));
656 788 default:
657 789 return (np);
658 790 }
659 791 }
660 792
661 793 static Node *
662 794 alt(Node *np)
663 795 {
664 796 if (rtok == OR) {
665 797 rtok = relex();
666 798 return (alt(op2(OR, np, concat(primary()))));
667 799 }
668 800 return (np);
669 801 }
670 802
671 803 static Node *
672 804 unary(Node *np)
673 805 {
674 806 switch (rtok) {
675 807 case STAR:
676 808 rtok = relex();
677 809 return (unary(op2(STAR, np, NIL)));
678 810 case PLUS:
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
679 811 rtok = relex();
680 812 return (unary(op2(PLUS, np, NIL)));
681 813 case QUEST:
682 814 rtok = relex();
683 815 return (unary(op2(QUEST, np, NIL)));
684 816 default:
685 817 return (np);
686 818 }
687 819 }
688 820
821 +/*
822 + * Character class definitions conformant to the POSIX locale as
823 + * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
824 + * and operating character sets are both ASCII (ISO646) or supersets
825 + * thereof.
826 + *
827 + * Note that to avoid overflowing the temporary buffer used in
828 + * relex(), the expanded character class (prior to range expansion)
829 + * must be less than twice the size of their full name.
830 + */
831 +
832 +/*
833 + * Because isblank doesn't show up in any of the header files on any
834 + * system i use, it's defined here. if some other locale has a richer
835 + * definition of "blank", define HAS_ISBLANK and provide your own
836 + * version.
837 + * the parentheses here are an attempt to find a path through the maze
838 + * of macro definition and/or function and/or version provided. thanks
839 + * to nelson beebe for the suggestion; let's see if it works everywhere.
840 + */
841 +
842 +/* #define HAS_ISBLANK */
843 +#ifndef HAS_ISBLANK
844 +
845 +int
846 +(xisblank)(int c)
847 +{
848 + return (c == ' ' || c == '\t');
849 +}
850 +
851 +#endif
852 +
853 +struct charclass {
854 + const char *cc_name;
855 + int cc_namelen;
856 + int (*cc_func)(int);
857 +} charclasses[] = {
858 + { "alnum", 5, isalnum },
859 + { "alpha", 5, isalpha },
860 +#ifndef HAS_ISBLANK
861 + { "blank", 5, isspace }, /* was isblank */
862 +#else
863 + { "blank", 5, isblank },
864 +#endif
865 + { "cntrl", 5, iscntrl },
866 + { "digit", 5, isdigit },
867 + { "graph", 5, isgraph },
868 + { "lower", 5, islower },
869 + { "print", 5, isprint },
870 + { "punct", 5, ispunct },
871 + { "space", 5, isspace },
872 + { "upper", 5, isupper },
873 + { "xdigit", 6, isxdigit },
874 + { NULL, 0, NULL },
875 +};
876 +
689 877 static int
690 878 relex(void) /* lexical analyzer for reparse */
691 879 {
692 - register int c;
693 - uchar *cbuf;
694 - int clen, cflag;
880 + int c, n;
881 + int cflag;
882 + uchar *buf = 0;
883 + size_t bufsz = 100;
884 + uchar *bp;
885 + struct charclass *cc;
886 + int i;
695 887
696 888 switch (c = *prestr++) {
697 889 case '|': return OR;
698 890 case '*': return STAR;
699 891 case '+': return PLUS;
700 892 case '?': return QUEST;
701 893 case '.': return DOT;
702 894 case '\0': prestr--; return '\0';
703 895 case '^':
704 896 case '$':
705 897 case '(':
706 898 case ')':
707 899 return (c);
708 900 case '\\':
709 - if ((c = *prestr++) == 't')
710 - c = '\t';
711 - else if (c == 'n')
712 - c = '\n';
713 - else if (c == 'f')
714 - c = '\f';
715 - else if (c == 'r')
716 - c = '\r';
717 - else if (c == 'b')
718 - c = '\b';
719 - else if (c == '\\')
720 - c = '\\';
721 - else if (isdigit(c)) {
722 - int n = c - '0';
723 - if (isdigit(*prestr)) {
724 - n = 8 * n + *prestr++ - '0';
725 - if (isdigit(*prestr))
726 - n = 8 * n + *prestr++ - '0';
727 - }
728 - c = n;
729 - } /* else it's now in c */
730 - rlxval = c;
901 + rlxval = quoted(&prestr);
731 902 return (CHAR);
732 903 default:
733 904 rlxval = c;
734 905 return (CHAR);
735 906 case '[':
736 - clen = 0;
907 + if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
908 + FATAL("out of space in reg expr %.10s..", lastre);
909 + bp = buf;
737 910 if (*prestr == '^') {
738 911 cflag = 1;
739 912 prestr++;
740 913 } else
741 914 cflag = 0;
742 - init_buf(&cbuf, NULL, strlen((char *)prestr) * 2 + 1);
915 + n = 2 * strlen((const char *) prestr)+1;
916 + if (!adjbuf(&buf, &bufsz, n, n, &bp, "relex1"))
917 + FATAL("out of space for reg expr %.10s...", lastre);
743 918 for (;;) {
744 919 if ((c = *prestr++) == '\\') {
745 - cbuf[clen++] = '\\';
920 + *bp++ = '\\';
746 921 if ((c = *prestr++) == '\0') {
747 - ERROR
748 - "nonterminated character class %s", lastre FATAL;
922 + FATAL(
923 + "nonterminated character class %s", lastre);
749 924 }
750 - cbuf[clen++] = c;
925 + *bp++ = c;
926 + } else if (c == '[' && *prestr == ':') {
927 + /*
928 + * POSIX char class names, Dag-Erling
929 + * Smorgrav, des@ofug.org
930 + */
931 + for (cc = charclasses; cc->cc_name; cc++)
932 + if (strncmp((const char *) prestr + 1,
933 + (const char *) cc->cc_name,
934 + cc->cc_namelen) == 0)
935 + break;
936 + if (cc->cc_name != NULL &&
937 + prestr[1 + cc->cc_namelen] == ':' &&
938 + prestr[2 + cc->cc_namelen] == ']') {
939 + prestr += cc->cc_namelen + 3;
940 + for (i = 0; i < NCHARS; i++) {
941 + if (!adjbuf(&buf, &bufsz,
942 + bp - buf + 1, 100, &bp,
943 + "relex2"))
944 + FATAL(
945 + "out of space for reg expr %.10s...", lastre);
946 + if (cc->cc_func(i)) {
947 + *bp++ = i;
948 + n++;
949 + }
950 + }
951 + } else
952 + *bp++ = c;
953 + } else if (c == '\0') {
954 + FATAL("nonterminated character class %.20s",
955 + lastre);
956 + } else if (bp == buf) { /* 1st char is special */
957 + *bp++ = c;
751 958 } else if (c == ']') {
752 - cbuf[clen] = 0;
753 - rlxstr = tostring(cbuf);
754 - free(cbuf);
959 + *bp++ = 0;
960 + rlxstr = tostring(buf);
755 961 if (cflag == 0)
756 962 return (CCL);
757 963 else
758 964 return (NCCL);
759 - } else if (c == '\n') {
760 - ERROR "newline in character class %s...",
761 - lastre FATAL;
762 - } else if (c == '\0') {
763 - ERROR "nonterminated character class %s",
764 - lastre FATAL;
765 965 } else
766 - cbuf[clen++] = c;
966 + *bp++ = c;
767 967 }
768 968 /*NOTREACHED*/
769 969 }
770 970 return (0);
771 971 }
772 972
773 973 static int
774 974 cgoto(fa *f, int s, int c)
775 975 {
776 - register int i, j, k;
777 - register int *p, *q;
976 + int i, j, k;
977 + int *p, *q;
778 978
979 + assert(c == HAT || c < NCHARS);
980 + while (f->accept >= maxsetvec) { /* guessing here! */
981 + maxsetvec *= 4;
982 + setvec = (int *)realloc(setvec, maxsetvec * sizeof (int));
983 + tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int));
984 + if (setvec == 0 || tmpset == 0)
985 + overflo("out of space in cgoto()");
986 + }
779 987 for (i = 0; i <= f->accept; i++)
780 988 setvec[i] = 0;
781 989 setcnt = 0;
782 990 /* compute positions of gototab[s,c] into setvec */
783 991 p = f->posns[s];
784 992 for (i = 1; i <= *p; i++) {
785 993 if ((k = f->re[p[i]].ltype) != FINAL) {
786 - if (k == CHAR && c == f->re[p[i]].lval ||
787 - k == DOT && c != 0 && c != HAT ||
788 - k == ALL && c != 0 ||
789 - k == CCL &&
790 - member(c, (uchar *)f->re[p[i]].lval) ||
791 - k == NCCL &&
792 - !member(c, (uchar *)f->re[p[i]].lval) &&
793 - c != 0 && c != HAT) {
994 + if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) ||
995 + (k == DOT && c != 0 && c != HAT) ||
996 + (k == ALL && c != 0) ||
997 + (k == EMPTYRE && c != 0) ||
998 + (k == CCL &&
999 + member(c, (uchar *)f->re[p[i]].lval.up)) ||
1000 + ((k == NCCL &&
1001 + !member(c, (uchar *)f->re[p[i]].lval.up)) &&
1002 + c != 0 && c != HAT)) {
794 1003 q = f->re[p[i]].lfollow;
795 1004 for (j = 1; j <= *q; j++) {
1005 + if (q[j] >= maxsetvec) {
1006 + maxsetvec *= 4;
1007 + setvec = (int *)realloc(setvec,
1008 + maxsetvec * sizeof (int));
1009 + tmpset = (int *)realloc(tmpset,
1010 + maxsetvec * sizeof (int));
1011 + if (setvec == 0 ||
1012 + tmpset == 0)
1013 + overflo(
1014 + "cgoto overflow");
1015 + }
796 1016 if (setvec[q[j]] == 0) {
797 1017 setcnt++;
798 1018 setvec[q[j]] = 1;
799 1019 }
800 1020 }
801 1021 }
802 1022 }
803 1023 }
804 1024 /* determine if setvec is a previous state */
805 1025 tmpset[0] = setcnt;
806 1026 j = 1;
807 1027 for (i = f->accept; i >= 0; i--)
808 1028 if (setvec[i]) {
809 1029 tmpset[j++] = i;
810 1030 }
811 1031 /* tmpset == previous state? */
812 1032 for (i = 1; i <= f->curstat; i++) {
813 1033 p = f->posns[i];
814 1034 if ((k = tmpset[0]) != p[0])
815 1035 goto different;
↓ open down ↓ |
10 lines elided |
↑ open up ↑ |
816 1036 for (j = 1; j <= k; j++)
817 1037 if (tmpset[j] != p[j])
818 1038 goto different;
819 1039 /* setvec is state i */
820 1040 f->gototab[s][c] = i;
821 1041 return (i);
822 1042 different:;
823 1043 }
824 1044
825 1045 /* add tmpset to current set of states */
826 - if (f->curstat >= NSTATES-1) {
1046 + if (f->curstat >= NSTATES - 1) {
827 1047 f->curstat = 2;
828 1048 f->reset = 1;
829 1049 for (i = 2; i < NSTATES; i++)
830 1050 xfree(f->posns[i]);
831 1051 } else
832 1052 ++(f->curstat);
833 1053 for (i = 0; i < NCHARS; i++)
834 1054 f->gototab[f->curstat][i] = 0;
835 1055 xfree(f->posns[f->curstat]);
836 1056 if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
837 1057 overflo("out of space in cgoto");
838 1058
839 1059 f->posns[f->curstat] = p;
840 1060 f->gototab[s][c] = f->curstat;
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
841 1061 for (i = 0; i <= setcnt; i++)
842 1062 p[i] = tmpset[i];
843 1063 if (setvec[f->accept])
844 1064 f->out[f->curstat] = 1;
845 1065 else
846 1066 f->out[f->curstat] = 0;
847 1067 return (f->curstat);
848 1068 }
849 1069
850 1070 static void
851 -freefa(fa *f)
1071 +freefa(fa *f) /* free a finite automaton */
852 1072 {
853 -
854 - register int i;
1073 + int i;
855 1074
856 1075 if (f == NULL)
857 1076 return;
858 1077 for (i = 0; i <= f->curstat; i++)
859 1078 xfree(f->posns[i]);
860 - for (i = 0; i <= f->accept; i++)
1079 + for (i = 0; i <= f->accept; i++) {
861 1080 xfree(f->re[i].lfollow);
1081 + if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1082 + xfree((f->re[i].lval.np));
1083 + }
862 1084 xfree(f->restr);
863 1085 xfree(f);
864 1086 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX