Print this page
8993 sync regcomp(3C) with upstream
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libc/port/regex/regcomp.c
+++ new/usr/src/lib/libc/port/regex/regcomp.c
1 1 /*
2 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 4 * Copyright 2012 Milan Jurik. All rights reserved.
5 5 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 6 * Copyright (c) 1992, 1993, 1994
7 7 * The Regents of the University of California. All rights reserved.
8 8 *
9 9 * This code is derived from software contributed to Berkeley by
10 10 * Henry Spencer.
11 11 *
12 12 * Redistribution and use in source and binary forms, with or without
13 13 * modification, are permitted provided that the following conditions
14 14 * are met:
15 15 * 1. Redistributions of source code must retain the above copyright
16 16 * notice, this list of conditions and the following disclaimer.
17 17 * 2. Redistributions in binary form must reproduce the above copyright
18 18 * notice, this list of conditions and the following disclaimer in the
19 19 * documentation and/or other materials provided with the distribution.
20 20 * 3. Neither the name of the University nor the names of its contributors
21 21 * may be used to endorse or promote products derived from this software
22 22 * without specific prior written permission.
23 23 *
24 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
↓ open down ↓ |
33 lines elided |
↑ open up ↑ |
34 34 * SUCH DAMAGE.
35 35 */
36 36
37 37 #include "lint.h"
38 38 #include "file64.h"
39 39 #include <sys/types.h>
40 40 #include <stdio.h>
41 41 #include <string.h>
42 42 #include <ctype.h>
43 43 #include <limits.h>
44 -#include <stdlib.h>
45 44 #include <regex.h>
45 +#include <stdlib.h>
46 +#include <stdbool.h>
46 47 #include <wchar.h>
47 48 #include <wctype.h>
48 49
49 50 #include "../locale/runetype.h"
50 51 #include "../locale/collate.h"
51 52
52 53 #include "utils.h"
53 54 #include "regex2.h"
54 55
55 56 #include "cname.h"
56 57 #include "../locale/mblocal.h"
57 58
58 59 /*
60 + * Branching context, used to keep track of branch state for all of the branch-
61 + * aware functions. In addition to keeping track of branch positions for the
62 + * p_branch_* functions, we use this to simplify some clumsiness in BREs for
63 + * detection of whether ^ is acting as an anchor or being used erroneously and
64 + * also for whether we're in a sub-expression or not.
65 + */
66 +struct branchc {
67 + sopno start;
68 + sopno back;
69 + sopno fwd;
70 +
71 + int nbranch;
72 + int nchain;
73 + bool outer;
74 + bool terminate;
75 +};
76 +
77 +/*
59 78 * parse structure, passed up and down to avoid global variables and
60 79 * other clumsinesses
61 80 */
62 81 struct parse {
63 82 const char *next; /* next character in RE */
64 83 const char *end; /* end of string (-> NUL normally) */
65 84 int error; /* has an error been seen? */
66 85 sop *strip; /* malloced strip */
67 86 sopno ssize; /* malloced strip size (allocated) */
68 87 sopno slen; /* malloced strip length (used) */
69 88 int ncsalloc; /* number of csets allocated */
70 89 struct re_guts *g;
71 90 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
72 91 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
73 92 sopno pend[NPAREN]; /* -> ) ([0] unused) */
93 + bool allowbranch; /* can this expression branch? */
94 + bool bre; /* convenience; is this a BRE? */
95 + bool (*parse_expr)(struct parse *, struct branchc *);
96 + void (*pre_parse)(struct parse *, struct branchc *);
97 + void (*post_parse)(struct parse *, struct branchc *);
74 98 };
75 99
76 100 /* ========= begin header generated by ./mkh ========= */
77 101 #ifdef __cplusplus
78 102 extern "C" {
79 103 #endif
80 104
81 105 /* === regcomp.c === */
82 -static void p_ere(struct parse *p, int stop);
83 -static void p_ere_exp(struct parse *p);
106 +static bool p_ere_exp(struct parse *p, struct branchc *bc);
84 107 static void p_str(struct parse *p);
85 -static void p_bre(struct parse *p, int end1, int end2);
86 -static int p_simp_re(struct parse *p, int starordinary);
108 +static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
109 +static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
110 +static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
111 +static bool p_branch_empty(struct parse *p, struct branchc *bc);
112 +static bool p_branch_do(struct parse *p, struct branchc *bc);
113 +static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
114 +static void p_bre_post_parse(struct parse *p, struct branchc *bc);
115 +static void p_re(struct parse *p, int end1, int end2);
116 +static bool p_simp_re(struct parse *p, struct branchc *bc);
87 117 static int p_count(struct parse *p);
88 118 static void p_bracket(struct parse *p);
89 119 static void p_b_term(struct parse *p, cset *cs);
90 120 static void p_b_cclass(struct parse *p, cset *cs);
91 121 static void p_b_eclass(struct parse *p, cset *cs);
92 122 static wint_t p_b_symbol(struct parse *p);
93 123 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
94 124 static wint_t othercase(wint_t ch);
95 125 static void bothcases(struct parse *p, wint_t ch);
96 126 static void ordinary(struct parse *p, wint_t ch);
97 127 static void nonnewline(struct parse *p);
98 128 static void repeat(struct parse *p, sopno start, int from, int to);
99 129 static int seterr(struct parse *p, int e);
100 130 static cset *allocset(struct parse *p);
101 131 static void freeset(struct parse *p, cset *cs);
102 132 static void CHadd(struct parse *p, cset *cs, wint_t ch);
103 133 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
104 134 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
105 135 static wint_t singleton(cset *cs);
106 136 static sopno dupl(struct parse *p, sopno start, sopno finish);
107 137 static void doemit(struct parse *p, sop op, size_t opnd);
108 138 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
109 139 static void dofwd(struct parse *p, sopno pos, sop value);
110 140 static int enlarge(struct parse *p, sopno size);
111 141 static void stripsnug(struct parse *p, struct re_guts *g);
112 142 static void findmust(struct parse *p, struct re_guts *g);
113 143 static int altoffset(sop *scan, int offset);
114 144 static void computejumps(struct parse *p, struct re_guts *g);
115 145 static void computematchjumps(struct parse *p, struct re_guts *g);
116 146 static sopno pluscount(struct parse *p, struct re_guts *g);
117 147 static wint_t wgetnext(struct parse *p);
118 148
119 149 #ifdef __cplusplus
120 150 }
121 151 #endif
122 152 /* ========= end header generated by ./mkh ========= */
123 153
124 154 static char nuls[10]; /* place to point scanner in event of error */
125 155
↓ open down ↓ |
29 lines elided |
↑ open up ↑ |
126 156 /*
127 157 * macros for use with parse structure
128 158 * BEWARE: these know that the parse structure is named `p' !!!
129 159 */
130 160 #define PEEK() (*p->next)
131 161 #define PEEK2() (*(p->next+1))
132 162 #define MORE() (p->next < p->end)
133 163 #define MORE2() (p->next+1 < p->end)
134 164 #define SEE(c) (MORE() && PEEK() == (c))
135 165 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
166 +#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
136 167 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
137 168 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
138 169 #define NEXT() (p->next++)
139 170 #define NEXT2() (p->next += 2)
140 171 #define NEXTn(n) (p->next += (n))
141 172 #define GETNEXT() (*p->next++)
142 173 #define WGETNEXT() wgetnext(p)
143 174 #define SETERROR(e) ((void)seterr(p, (e)))
144 175 #define REQUIRE(co, e) ((co) || seterr(p, e))
145 176 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
146 177 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
147 178 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
148 179 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
149 180 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
150 181 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
151 182 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
152 183 #define HERE() (p->slen)
153 184 #define THERE() (p->slen - 1)
154 185 #define THERETHERE() (p->slen - 2)
155 186 #define DROP(n) (p->slen -= (n))
156 187
157 188 #ifndef NDEBUG
158 189 static int never = 0; /* for use in asserts; shuts lint up */
159 190 #else
160 191 #define never 0 /* some <assert.h>s have bugs too */
161 192 #endif
162 193
163 194 /*
164 195 * regcomp - interface for parser and compilation
165 196 */
166 197 int /* 0 success, otherwise REG_something */
167 198 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
168 199 int cflags)
169 200 {
170 201 struct parse pa;
171 202 struct re_guts *g;
172 203 struct parse *p = &pa;
173 204 int i;
174 205 size_t len;
175 206 size_t maxlen;
176 207 #ifdef REDEBUG
177 208 #define GOODFLAGS(f) (f)
178 209 #else
179 210 #define GOODFLAGS(f) ((f)&~REG_DUMP)
180 211 #endif
181 212
182 213 /* We had REG_INVARG, but we don't have that on Solaris. */
183 214 cflags = GOODFLAGS(cflags);
184 215 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
185 216 return (REG_EFATAL);
186 217
187 218 if (cflags®_PEND) {
188 219 if (preg->re_endp < pattern)
189 220 return (REG_EFATAL);
190 221 len = preg->re_endp - pattern;
191 222 } else
192 223 len = strlen(pattern);
193 224
194 225 /* do the mallocs early so failure handling is easy */
195 226 g = (struct re_guts *)malloc(sizeof (struct re_guts));
196 227 if (g == NULL)
197 228 return (REG_ESPACE);
198 229 /*
199 230 * Limit the pattern space to avoid a 32-bit overflow on buffer
200 231 * extension. Also avoid any signed overflow in case of conversion
201 232 * so make the real limit based on a 31-bit overflow.
202 233 *
203 234 * Likely not applicable on 64-bit systems but handle the case
204 235 * generically (who are we to stop people from using ~715MB+
205 236 * patterns?).
206 237 */
207 238 maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
208 239 if (len >= maxlen) {
209 240 free((char *)g);
210 241 return (REG_ESPACE);
211 242 }
212 243 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
213 244 assert(p->ssize >= len);
214 245
215 246 p->strip = (sop *)malloc(p->ssize * sizeof (sop));
216 247 p->slen = 0;
217 248 if (p->strip == NULL) {
218 249 free((char *)g);
219 250 return (REG_ESPACE);
220 251 }
221 252
↓ open down ↓ |
76 lines elided |
↑ open up ↑ |
222 253 /* set things up */
223 254 p->g = g;
224 255 p->next = pattern; /* convenience; we do not modify it */
225 256 p->end = p->next + len;
226 257 p->error = 0;
227 258 p->ncsalloc = 0;
228 259 for (i = 0; i < NPAREN; i++) {
229 260 p->pbegin[i] = 0;
230 261 p->pend[i] = 0;
231 262 }
263 + if (cflags & REG_EXTENDED) {
264 + p->allowbranch = true;
265 + p->bre = false;
266 + p->parse_expr = p_ere_exp;
267 + p->pre_parse = NULL;
268 + p->post_parse = NULL;
269 + } else {
270 + p->allowbranch = false;
271 + p->bre = true;
272 + p->parse_expr = p_simp_re;
273 + p->pre_parse = p_bre_pre_parse;
274 + p->post_parse = p_bre_post_parse;
275 + }
232 276 g->sets = NULL;
233 277 g->ncsets = 0;
234 278 g->cflags = cflags;
235 279 g->iflags = 0;
236 280 g->nbol = 0;
237 281 g->neol = 0;
238 282 g->must = NULL;
239 283 g->moffset = -1;
240 284 g->charjump = NULL;
241 285 g->matchjump = NULL;
242 286 g->mlen = 0;
243 287 g->nsub = 0;
244 288 g->backrefs = 0;
245 289
246 290 /* do it */
247 291 EMIT(OEND, 0);
248 292 g->firststate = THERE();
249 - if (cflags®_EXTENDED)
250 - p_ere(p, OUT);
251 - else if (cflags®_NOSPEC)
293 + if (cflags & REG_NOSPEC)
252 294 p_str(p);
253 295 else
254 - p_bre(p, OUT, OUT);
296 + p_re(p, OUT, OUT);
255 297 EMIT(OEND, 0);
256 298 g->laststate = THERE();
257 299
258 300 /* tidy up loose ends and fill things in */
259 301 stripsnug(p, g);
260 302 findmust(p, g);
261 303 /*
262 304 * only use Boyer-Moore algorithm if the pattern is bigger
263 305 * than three characters
264 306 */
265 307 if (g->mlen > 3) {
266 308 computejumps(p, g);
267 309 computematchjumps(p, g);
268 310 if (g->matchjump == NULL && g->charjump != NULL) {
269 311 free(g->charjump);
270 312 g->charjump = NULL;
271 313 }
272 314 }
273 315 g->nplus = pluscount(p, g);
274 316 g->magic = MAGIC2;
275 317 preg->re_nsub = g->nsub;
276 318 preg->re_g = g;
277 319 preg->re_magic = MAGIC1;
278 320 #ifndef REDEBUG
279 321 /* not debugging, so can't rely on the assert() in regexec() */
280 322 if (g->iflags&BAD)
↓ open down ↓ |
16 lines elided |
↑ open up ↑ |
281 323 SETERROR(REG_EFATAL);
282 324 #endif
283 325
284 326 /* win or lose, we're done */
285 327 if (p->error != 0) /* lose */
286 328 regfree(preg);
287 329 return (p->error);
288 330 }
289 331
290 332 /*
291 - * p_ere - ERE parser top level, concatenation and alternation
333 + * Parse one subERE, an atom possibly followed by a repetition op,
334 + * return whether we should terminate or not.
292 335 */
293 -static void
294 -p_ere(struct parse *p,
295 - int stop) /* character this ERE should end at */
336 +static bool
337 +p_ere_exp(struct parse *p, struct branchc *bc)
296 338 {
297 339 char c;
298 - sopno prevback;
299 - sopno prevfwd;
300 - sopno conc;
301 - int first = 1; /* is this the first alternative? */
302 -
303 - for (;;) {
304 - /* do a bunch of concatenated expressions */
305 - conc = HERE();
306 - while (MORE() && (c = PEEK()) != '|' && c != stop)
307 - p_ere_exp(p);
308 - /* require nonempty */
309 - (void) REQUIRE(HERE() != conc, REG_BADPAT);
310 -
311 - if (!EAT('|'))
312 - break; /* NOTE BREAK OUT */
313 -
314 - if (first) {
315 - INSERT(OCH_, conc); /* offset is wrong */
316 - prevfwd = conc;
317 - prevback = conc;
318 - first = 0;
319 - }
320 - ASTERN(OOR1, prevback);
321 - prevback = THERE();
322 - AHEAD(prevfwd); /* fix previous offset */
323 - prevfwd = HERE();
324 - EMIT(OOR2, 0); /* offset is very wrong */
325 - }
326 -
327 - if (!first) { /* tail-end fixups */
328 - AHEAD(prevfwd);
329 - ASTERN(O_CH, prevback);
330 - }
331 -
332 - assert(!MORE() || SEE(stop));
333 -}
334 -
335 -/*
336 - * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
337 - */
338 -static void
339 -p_ere_exp(struct parse *p)
340 -{
341 - char c;
342 340 wint_t wc;
343 341 sopno pos;
344 342 int count;
345 343 int count2;
346 344 sopno subno;
347 345 int wascaret = 0;
348 346
347 + (void) bc;
349 348 assert(MORE()); /* caller should have ensured this */
350 349 c = GETNEXT();
351 350
352 351 pos = HERE();
353 352 switch (c) {
354 353 case '(':
355 354 (void) REQUIRE(MORE(), REG_EPAREN);
356 355 p->g->nsub++;
357 356 subno = p->g->nsub;
358 357 if (subno < NPAREN)
359 358 p->pbegin[subno] = HERE();
360 359 EMIT(OLPAREN, subno);
361 360 if (!SEE(')'))
362 - p_ere(p, ')');
361 + p_re(p, ')', IGN);
363 362 if (subno < NPAREN) {
364 363 p->pend[subno] = HERE();
365 364 assert(p->pend[subno] != 0);
366 365 }
367 366 EMIT(ORPAREN, subno);
368 367 (void) MUSTEAT(')', REG_EPAREN);
369 368 break;
370 369 #ifndef POSIX_MISTAKE
371 370 case ')': /* happens only if no current unmatched ( */
372 371 /*
373 372 * You may ask, why the ifndef? Because I didn't notice
374 373 * this until slightly too late for 1003.2, and none of the
375 374 * other 1003.2 regular-expression reviewers noticed it at
376 375 * all. So an unmatched ) is legal POSIX, at least until
377 376 * we can get it fixed.
378 377 */
379 378 SETERROR(REG_EPAREN);
380 379 break;
381 380 #endif
382 381 case '^':
383 382 EMIT(OBOL, 0);
384 383 p->g->iflags |= USEBOL;
385 384 p->g->nbol++;
386 385 wascaret = 1;
387 386 break;
388 387 case '$':
389 388 EMIT(OEOL, 0);
390 389 p->g->iflags |= USEEOL;
391 390 p->g->neol++;
392 391 break;
393 392 case '|':
394 393 SETERROR(REG_BADPAT);
395 394 break;
396 395 case '*':
397 396 case '+':
398 397 case '?':
399 398 case '{':
400 399 SETERROR(REG_BADRPT);
401 400 break;
402 401 case '.':
403 402 if (p->g->cflags®_NEWLINE)
404 403 nonnewline(p);
405 404 else
406 405 EMIT(OANY, 0);
407 406 break;
408 407 case '[':
409 408 p_bracket(p);
410 409 break;
411 410 case '\\':
412 411 (void) REQUIRE(MORE(), REG_EESCAPE);
413 412 wc = WGETNEXT();
414 413 switch (wc) {
415 414 case '<':
416 415 EMIT(OBOW, 0);
417 416 break;
↓ open down ↓ |
45 lines elided |
↑ open up ↑ |
418 417 case '>':
419 418 EMIT(OEOW, 0);
420 419 break;
421 420 default:
422 421 ordinary(p, wc);
423 422 break;
424 423 }
425 424 break;
426 425 default:
427 426 if (p->error != 0)
428 - return;
427 + return (false);
429 428 p->next--;
430 429 wc = WGETNEXT();
431 430 ordinary(p, wc);
432 431 break;
433 432 }
434 433
435 434 if (!MORE())
436 - return;
435 + return (false);
437 436 c = PEEK();
438 437 /* we call { a repetition if followed by a digit */
439 438 if (!(c == '*' || c == '+' || c == '?' || c == '{'))
440 - return; /* no repetition, we're done */
439 + return (false); /* no repetition, we're done */
441 440 else if (c == '{')
442 441 (void) REQUIRE(MORE2() && \
443 442 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
444 443 NEXT();
445 444
446 445 (void) REQUIRE(!wascaret, REG_BADRPT);
447 446 switch (c) {
448 447 case '*': /* implemented as +? */
449 448 /* this case does not require the (y|) trick, noKLUDGE */
450 449 INSERT(OPLUS_, pos);
451 450 ASTERN(O_PLUS, pos);
452 451 INSERT(OQUEST_, pos);
453 452 ASTERN(O_QUEST, pos);
454 453 break;
455 454 case '+':
456 455 INSERT(OPLUS_, pos);
457 456 ASTERN(O_PLUS, pos);
458 457 break;
459 458 case '?':
460 459 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
461 460 INSERT(OCH_, pos); /* offset slightly wrong */
462 461 ASTERN(OOR1, pos); /* this one's right */
463 462 AHEAD(pos); /* fix the OCH_ */
464 463 EMIT(OOR2, 0); /* offset very wrong... */
465 464 AHEAD(THERE()); /* ...so fix it */
466 465 ASTERN(O_CH, THERETHERE());
467 466 break;
468 467 case '{':
469 468 count = p_count(p);
470 469 if (EAT(',')) {
471 470 if (isdigit((uch)PEEK())) {
472 471 count2 = p_count(p);
473 472 (void) REQUIRE(count <= count2, REG_BADBR);
474 473 } else /* single number with comma */
475 474 count2 = INFINITY;
476 475 } else /* just a single number */
477 476 count2 = count;
478 477 repeat(p, pos, count, count2);
↓ open down ↓ |
28 lines elided |
↑ open up ↑ |
479 478 if (!EAT('}')) { /* error heuristics */
480 479 while (MORE() && PEEK() != '}')
481 480 NEXT();
482 481 (void) REQUIRE(MORE(), REG_EBRACE);
483 482 SETERROR(REG_BADBR);
484 483 }
485 484 break;
486 485 }
487 486
488 487 if (!MORE())
489 - return;
488 + return (false);
490 489 c = PEEK();
491 490 if (!(c == '*' || c == '+' || c == '?' ||
492 491 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
493 - return;
492 + return (false);
494 493 SETERROR(REG_BADRPT);
494 + return (false);
495 495 }
496 496
497 497 /*
498 498 * p_str - string (no metacharacters) "parser"
499 499 */
500 500 static void
501 501 p_str(struct parse *p)
502 502 {
503 503 (void) REQUIRE(MORE(), REG_BADPAT);
504 504 while (MORE())
505 505 ordinary(p, WGETNEXT());
506 506 }
507 507
508 508 /*
509 - * p_bre - BRE parser top level, anchoring and concatenation
510 - * Giving end1 as OUT essentially eliminates the end1/end2 check.
511 - *
512 - * This implementation is a bit of a kludge, in that a trailing $ is first
513 - * taken as an ordinary character and then revised to be an anchor.
514 - * The amount of lookahead needed to avoid this kludge is excessive.
509 + * Eat consecutive branch delimiters for the kind of expression that we are
510 + * parsing, return the number of delimiters that we ate.
515 511 */
512 +static int
513 +p_branch_eat_delim(struct parse *p, struct branchc *bc)
514 +{
515 + int nskip;
516 +
517 + (void) bc;
518 + nskip = 0;
519 + while (EAT('|'))
520 + ++nskip;
521 + return (nskip);
522 +}
523 +
524 +/*
525 + * Insert necessary branch book-keeping operations. This emits a
526 + * bogus 'next' offset, since we still have more to parse
527 + */
516 528 static void
517 -p_bre(struct parse *p,
518 - int end1, /* first terminating character */
519 - int end2) /* second terminating character */
529 +p_branch_ins_offset(struct parse *p, struct branchc *bc)
520 530 {
521 - sopno start = HERE();
522 - int first = 1; /* first subexpression? */
523 - int wasdollar = 0;
531 + if (bc->nbranch == 0) {
532 + INSERT(OCH_, bc->start); /* offset is wrong */
533 + bc->fwd = bc->start;
534 + bc->back = bc->start;
535 + }
524 536
537 + ASTERN(OOR1, bc->back);
538 + bc->back = THERE();
539 + AHEAD(bc->fwd); /* fix previous offset */
540 + bc->fwd = HERE();
541 + EMIT(OOR2, 0); /* offset is very wrong */
542 + ++bc->nbranch;
543 +}
544 +
545 +/*
546 + * Fix the offset of the tail branch, if we actually had any branches.
547 + * This is to correct the bogus placeholder offset that we use.
548 + */
549 +static void
550 +p_branch_fix_tail(struct parse *p, struct branchc *bc)
551 +{
552 + /* Fix bogus offset at the tail if we actually have branches */
553 + if (bc->nbranch > 0) {
554 + AHEAD(bc->fwd);
555 + ASTERN(O_CH, bc->back);
556 + }
557 +}
558 +
559 +/*
560 + * Signal to the parser that an empty branch has been encountered; this will,
561 + * in the future, be used to allow for more permissive behavior with empty
562 + * branches. The return value should indicate whether parsing may continue
563 + * or not.
564 + */
565 +static bool
566 +p_branch_empty(struct parse *p, struct branchc *bc)
567 +{
568 + (void) bc;
569 + SETERROR(REG_BADPAT);
570 + return (false);
571 +}
572 +
573 +/*
574 + * Take care of any branching requirements. This includes inserting the
575 + * appropriate branching instructions as well as eating all of the branch
576 + * delimiters until we either run out of pattern or need to parse more pattern.
577 + */
578 +static bool
579 +p_branch_do(struct parse *p, struct branchc *bc)
580 +{
581 + int ate = 0;
582 +
583 + ate = p_branch_eat_delim(p, bc);
584 + if (ate == 0)
585 + return (false);
586 + else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
587 + /*
588 + * Halt parsing only if we have an empty branch and
589 + * p_branch_empty indicates that we must not continue.
590 + * In the future, this will not necessarily be an error.
591 + */
592 + return (false);
593 + p_branch_ins_offset(p, bc);
594 +
595 + return (true);
596 +}
597 +
598 +static void
599 +p_bre_pre_parse(struct parse *p, struct branchc *bc)
600 +{
601 + (void) bc;
602 + /*
603 + * Does not move cleanly into expression parser because of
604 + * ordinary interpration of * at the beginning position of
605 + * an expression.
606 + */
525 607 if (EAT('^')) {
526 608 EMIT(OBOL, 0);
527 609 p->g->iflags |= USEBOL;
528 610 p->g->nbol++;
529 611 }
530 - while (MORE() && !SEETWO(end1, end2)) {
531 - wasdollar = p_simp_re(p, first);
532 - first = 0;
533 - }
534 - if (wasdollar) { /* oops, that was a trailing anchor */
612 +}
613 +
614 +static void
615 +p_bre_post_parse(struct parse *p, struct branchc *bc)
616 +{
617 + /* Expression is terminating due to EOL token */
618 + if (bc->terminate) {
535 619 DROP(1);
536 620 EMIT(OEOL, 0);
537 621 p->g->iflags |= USEEOL;
538 622 p->g->neol++;
539 623 }
624 +}
540 625
541 - (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */
626 +/*
627 + * Top level parser, concatenation and BRE anchoring.
628 + * Giving end1 as OUT essentially eliminates the end1/end2 check.
629 + *
630 + * This implementation is a bit of a kludge, in that a trailing $ is first
631 + * taken as an ordinary character and then revised to be an anchor.
632 + * The amount of lookahead needed to avoid this kludge is excessive.
633 + */
634 +static void
635 +p_re(struct parse *p,
636 + int end1, /* first terminating character */
637 + int end2) /* second terminating character; ignored for EREs */
638 +{
639 + struct branchc bc;
640 +
641 + bc.nbranch = 0;
642 + if (end1 == OUT && end2 == OUT)
643 + bc.outer = true;
644 + else
645 + bc.outer = false;
646 +#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
647 + for (;;) {
648 + bc.start = HERE();
649 + bc.nchain = 0;
650 + bc.terminate = false;
651 + if (p->pre_parse != NULL)
652 + p->pre_parse(p, &bc);
653 + while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
654 + !SEEEND()) {
655 + bc.terminate = p->parse_expr(p, &bc);
656 + ++bc.nchain;
657 + }
658 + if (p->post_parse != NULL)
659 + p->post_parse(p, &bc);
660 + (void) REQUIRE(HERE() != bc.start, REG_BADPAT);
661 + if (!p->allowbranch)
662 + break;
663 + /*
664 + * p_branch_do's return value indicates whether we should
665 + * continue parsing or not. This is both for correctness and
666 + * a slight optimization, because it will check if we've
667 + * encountered an empty branch or the end of the string
668 + * immediately following a branch delimiter.
669 + */
670 + if (!p_branch_do(p, &bc))
671 + break;
672 + }
673 +#undef SEE_END
674 + if (p->allowbranch)
675 + p_branch_fix_tail(p, &bc);
676 + assert(!MORE() || SEE(end1));
542 677 }
543 678
544 679 /*
545 680 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
546 681 */
547 -static int /* was the simple RE an unbackslashed $? */
548 -p_simp_re(struct parse *p,
549 - int starordinary) /* is a leading * an ordinary character? */
682 +static bool /* was the simple RE an unbackslashed $? */
683 +p_simp_re(struct parse *p, struct branchc *bc)
550 684 {
551 685 int c;
552 686 int count;
553 687 int count2;
554 688 sopno pos;
555 689 int i;
556 690 wint_t wc;
557 691 sopno subno;
558 692 #define BACKSL (1<<CHAR_BIT)
559 693
560 694 pos = HERE(); /* repetition op, if any, covers from here */
561 695
562 696 assert(MORE()); /* caller should have ensured this */
563 697 c = GETNEXT();
564 698 if (c == '\\') {
565 699 (void) REQUIRE(MORE(), REG_EESCAPE);
566 700 c = BACKSL | GETNEXT();
567 701 }
568 702 switch (c) {
569 703 case '.':
570 704 if (p->g->cflags®_NEWLINE)
571 705 nonnewline(p);
572 706 else
573 707 EMIT(OANY, 0);
574 708 break;
575 709 case '[':
576 710 p_bracket(p);
577 711 break;
578 712 case BACKSL|'<':
579 713 EMIT(OBOW, 0);
580 714 break;
581 715 case BACKSL|'>':
582 716 EMIT(OEOW, 0);
583 717 break;
584 718 case BACKSL|'{':
↓ open down ↓ |
25 lines elided |
↑ open up ↑ |
585 719 SETERROR(REG_BADRPT);
586 720 break;
587 721 case BACKSL|'(':
588 722 p->g->nsub++;
589 723 subno = p->g->nsub;
590 724 if (subno < NPAREN)
591 725 p->pbegin[subno] = HERE();
592 726 EMIT(OLPAREN, subno);
593 727 /* the MORE here is an error heuristic */
594 728 if (MORE() && !SEETWO('\\', ')'))
595 - p_bre(p, '\\', ')');
729 + p_re(p, '\\', ')');
596 730 if (subno < NPAREN) {
597 731 p->pend[subno] = HERE();
598 732 assert(p->pend[subno] != 0);
599 733 }
600 734 EMIT(ORPAREN, subno);
601 735 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
602 736 break;
603 737 case BACKSL|')': /* should not get here -- must be user */
604 738 SETERROR(REG_EPAREN);
605 739 break;
606 740 case BACKSL|'1':
607 741 case BACKSL|'2':
608 742 case BACKSL|'3':
609 743 case BACKSL|'4':
610 744 case BACKSL|'5':
611 745 case BACKSL|'6':
612 746 case BACKSL|'7':
613 747 case BACKSL|'8':
614 748 case BACKSL|'9':
615 749 i = (c&~BACKSL) - '0';
616 750 assert(i < NPAREN);
617 751 if (p->pend[i] != 0) {
618 752 assert(i <= p->g->nsub);
619 753 EMIT(OBACK_, i);
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
620 754 assert(p->pbegin[i] != 0);
621 755 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
622 756 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
623 757 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
624 758 EMIT(O_BACK, i);
625 759 } else
626 760 SETERROR(REG_ESUBREG);
627 761 p->g->backrefs = 1;
628 762 break;
629 763 case '*':
630 - (void) REQUIRE(starordinary, REG_BADRPT);
764 + /*
765 + * Ordinary if used as the first character beyond BOL anchor of
766 + * a (sub-)expression, counts as a bad repetition operator if it
767 + * appears otherwise.
768 + */
769 + (void) REQUIRE(bc->nchain == 0, REG_BADRPT);
631 770 /* FALLTHROUGH */
632 771 default:
633 772 if (p->error != 0)
634 - return (0); /* Definitely not $... */
773 + return (false); /* Definitely not $... */
635 774 p->next--;
636 775 wc = WGETNEXT();
637 776 ordinary(p, wc);
638 777 break;
639 778 }
640 779
641 780 if (EAT('*')) { /* implemented as +? */
642 781 /* this case does not require the (y|) trick, noKLUDGE */
643 782 INSERT(OPLUS_, pos);
644 783 ASTERN(O_PLUS, pos);
645 784 INSERT(OQUEST_, pos);
646 785 ASTERN(O_QUEST, pos);
647 786 } else if (EATTWO('\\', '{')) {
648 787 count = p_count(p);
649 788 if (EAT(',')) {
650 789 if (MORE() && isdigit((uch)PEEK())) {
651 790 count2 = p_count(p);
652 791 (void) REQUIRE(count <= count2, REG_BADBR);
653 792 } else /* single number with comma */
654 793 count2 = INFINITY;
↓ open down ↓ |
10 lines elided |
↑ open up ↑ |
655 794 } else /* just a single number */
656 795 count2 = count;
657 796 repeat(p, pos, count, count2);
658 797 if (!EATTWO('\\', '}')) { /* error heuristics */
659 798 while (MORE() && !SEETWO('\\', '}'))
660 799 NEXT();
661 800 (void) REQUIRE(MORE(), REG_EBRACE);
662 801 SETERROR(REG_BADBR);
663 802 }
664 803 } else if (c == '$') /* $ (but not \$) ends it */
665 - return (1);
804 + return (true);
666 805
667 - return (0);
806 + return (false);
668 807 }
669 808
670 809 /*
671 810 * p_count - parse a repetition count
672 811 */
673 812 static int /* the value */
674 813 p_count(struct parse *p)
675 814 {
676 815 int count = 0;
677 816 int ndigits = 0;
678 817
679 818 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
680 819 count = count*10 + (GETNEXT() - '0');
681 820 ndigits++;
682 821 }
683 822
684 823 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
685 824 return (count);
686 825 }
687 826
688 827 /*
689 828 * p_bracket - parse a bracketed character list
690 829 */
691 830 static void
692 831 p_bracket(struct parse *p)
693 832 {
694 833 cset *cs;
695 834 wint_t ch;
696 835
697 836 /* Dept of Truly Sickening Special-Case Kludges */
698 837 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
699 838 EMIT(OBOW, 0);
700 839 NEXTn(6);
701 840 return;
702 841 }
703 842 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
704 843 EMIT(OEOW, 0);
705 844 NEXTn(6);
706 845 return;
707 846 }
708 847
709 848 if ((cs = allocset(p)) == NULL)
710 849 return;
711 850
712 851 if (p->g->cflags®_ICASE)
713 852 cs->icase = 1;
714 853 if (EAT('^'))
715 854 cs->invert = 1;
716 855 if (EAT(']'))
717 856 CHadd(p, cs, ']');
718 857 else if (EAT('-'))
719 858 CHadd(p, cs, '-');
720 859 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
721 860 p_b_term(p, cs);
722 861 if (EAT('-'))
723 862 CHadd(p, cs, '-');
724 863 (void) MUSTEAT(']', REG_EBRACK);
725 864
726 865 if (p->error != 0) /* don't mess things up further */
727 866 return;
728 867
729 868 if (cs->invert && p->g->cflags®_NEWLINE)
730 869 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
731 870
732 871 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
733 872 ordinary(p, ch);
734 873 freeset(p, cs);
735 874 } else
736 875 EMIT(OANYOF, (int)(cs - p->g->sets));
737 876 }
738 877
739 878 /*
740 879 * p_b_term - parse one term of a bracketed character list
741 880 */
742 881 static void
743 882 p_b_term(struct parse *p, cset *cs)
744 883 {
745 884 char c;
746 885 wint_t start, finish;
747 886 wint_t i;
748 887 locale_t loc = uselocale(NULL);
749 888
750 889 /* classify what we've got */
751 890 switch ((MORE()) ? PEEK() : '\0') {
752 891 case '[':
753 892 c = (MORE2()) ? PEEK2() : '\0';
754 893 break;
755 894 case '-':
756 895 SETERROR(REG_ERANGE);
757 896 return; /* NOTE RETURN */
758 897 default:
759 898 c = '\0';
760 899 break;
761 900 }
762 901
763 902 switch (c) {
764 903 case ':': /* character class */
765 904 NEXT2();
766 905 (void) REQUIRE(MORE(), REG_EBRACK);
767 906 c = PEEK();
768 907 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
769 908 p_b_cclass(p, cs);
770 909 (void) REQUIRE(MORE(), REG_EBRACK);
771 910 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
772 911 break;
773 912 case '=': /* equivalence class */
774 913 NEXT2();
775 914 (void) REQUIRE(MORE(), REG_EBRACK);
776 915 c = PEEK();
777 916 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
778 917 p_b_eclass(p, cs);
779 918 (void) REQUIRE(MORE(), REG_EBRACK);
780 919 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
781 920 break;
782 921 default: /* symbol, ordinary character, or range */
783 922 start = p_b_symbol(p);
784 923 if (SEE('-') && MORE2() && PEEK2() != ']') {
785 924 /* range */
786 925 NEXT();
787 926 if (EAT('-'))
788 927 finish = '-';
789 928 else
790 929 finish = p_b_symbol(p);
791 930 } else
792 931 finish = start;
793 932 if (start == finish)
794 933 CHadd(p, cs, start);
795 934 else {
796 935 if (loc->collate->lc_is_posix) {
797 936 (void) REQUIRE((uch)start <= (uch)finish,
798 937 REG_ERANGE);
799 938 CHaddrange(p, cs, start, finish);
800 939 } else {
801 940 (void) REQUIRE(_collate_range_cmp(start,
802 941 finish, loc) <= 0, REG_ERANGE);
803 942 for (i = 0; i <= UCHAR_MAX; i++) {
804 943 if (_collate_range_cmp(start, i, loc)
805 944 <= 0 &&
806 945 _collate_range_cmp(i, finish, loc)
807 946 <= 0)
808 947 CHadd(p, cs, i);
809 948 }
810 949 }
811 950 }
812 951 break;
813 952 }
814 953 }
815 954
816 955 /*
817 956 * p_b_cclass - parse a character-class name and deal with it
818 957 */
819 958 static void
820 959 p_b_cclass(struct parse *p, cset *cs)
821 960 {
822 961 const char *sp = p->next;
823 962 size_t len;
824 963 wctype_t wct;
825 964 char clname[16];
826 965
827 966 while (MORE() && isalpha((uch)PEEK()))
828 967 NEXT();
829 968 len = p->next - sp;
830 969 if (len >= sizeof (clname) - 1) {
831 970 SETERROR(REG_ECTYPE);
832 971 return;
833 972 }
834 973 (void) memcpy(clname, sp, len);
835 974 clname[len] = '\0';
836 975 if ((wct = wctype(clname)) == 0) {
837 976 SETERROR(REG_ECTYPE);
838 977 return;
839 978 }
840 979 CHaddtype(p, cs, wct);
841 980 }
842 981
843 982 /*
844 983 * p_b_eclass - parse an equivalence-class name and deal with it
845 984 *
846 985 * This implementation is incomplete. xxx
847 986 */
848 987 static void
849 988 p_b_eclass(struct parse *p, cset *cs)
850 989 {
851 990 wint_t c;
852 991
853 992 c = p_b_coll_elem(p, '=');
854 993 CHadd(p, cs, c);
855 994 }
856 995
857 996 /*
858 997 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
859 998 */
860 999 static wint_t /* value of symbol */
861 1000 p_b_symbol(struct parse *p)
862 1001 {
863 1002 wint_t value;
864 1003
865 1004 (void) REQUIRE(MORE(), REG_EBRACK);
866 1005 if (!EATTWO('[', '.'))
867 1006 return (WGETNEXT());
868 1007
869 1008 /* collating symbol */
870 1009 value = p_b_coll_elem(p, '.');
871 1010 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
872 1011 return (value);
873 1012 }
874 1013
875 1014 /*
876 1015 * p_b_coll_elem - parse a collating-element name and look it up
877 1016 */
878 1017 static wint_t /* value of collating element */
879 1018 p_b_coll_elem(struct parse *p,
880 1019 wint_t endc) /* name ended by endc,']' */
881 1020 {
882 1021 const char *sp = p->next;
883 1022 struct cname *cp;
884 1023 mbstate_t mbs;
885 1024 wchar_t wc;
↓ open down ↓ |
208 lines elided |
↑ open up ↑ |
886 1025 size_t clen, len;
887 1026
888 1027 while (MORE() && !SEETWO(endc, ']'))
889 1028 NEXT();
890 1029 if (!MORE()) {
891 1030 SETERROR(REG_EBRACK);
892 1031 return (0);
893 1032 }
894 1033 len = p->next - sp;
895 1034 for (cp = cnames; cp->name != NULL; cp++)
896 - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1035 + if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
897 1036 return (cp->code); /* known name */
898 1037 (void) memset(&mbs, 0, sizeof (mbs));
899 1038 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
900 1039 return (wc); /* single character */
901 1040 else if (clen == (size_t)-1 || clen == (size_t)-2)
902 1041 SETERROR(REG_ECHAR);
903 1042 else
904 1043 SETERROR(REG_ECOLLATE); /* neither */
905 1044 return (0);
906 1045 }
907 1046
908 1047 /*
909 1048 * othercase - return the case counterpart of an alphabetic
910 1049 */
911 1050 static wint_t /* if no counterpart, return ch */
912 1051 othercase(wint_t ch)
913 1052 {
914 1053 assert(iswalpha(ch));
915 1054 if (iswupper(ch))
916 1055 return (towlower(ch));
917 1056 else if (iswlower(ch))
918 1057 return (towupper(ch));
919 1058 else /* peculiar, but could happen */
920 1059 return (ch);
921 1060 }
922 1061
923 1062 /*
924 1063 * bothcases - emit a dualcase version of a two-case character
925 1064 *
926 1065 * Boy, is this implementation ever a kludge...
927 1066 */
928 1067 static void
929 1068 bothcases(struct parse *p, wint_t ch)
930 1069 {
931 1070 const char *oldnext = p->next;
932 1071 const char *oldend = p->end;
933 1072 char bracket[3 + MB_LEN_MAX];
934 1073 size_t n;
935 1074 mbstate_t mbs;
936 1075
937 1076 assert(othercase(ch) != ch); /* p_bracket() would recurse */
938 1077 p->next = bracket;
939 1078 (void) memset(&mbs, 0, sizeof (mbs));
940 1079 n = wcrtomb(bracket, ch, &mbs);
941 1080 assert(n != (size_t)-1);
942 1081 bracket[n] = ']';
943 1082 bracket[n + 1] = '\0';
944 1083 p->end = bracket+n+1;
945 1084 p_bracket(p);
946 1085 assert(p->next == p->end);
947 1086 p->next = oldnext;
948 1087 p->end = oldend;
949 1088 }
950 1089
951 1090 /*
952 1091 * ordinary - emit an ordinary character
953 1092 */
954 1093 static void
955 1094 ordinary(struct parse *p, wint_t ch)
956 1095 {
957 1096 cset *cs;
958 1097
959 1098 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
960 1099 bothcases(p, ch);
961 1100 else if ((ch & OPDMASK) == ch)
962 1101 EMIT(OCHAR, ch);
963 1102 else {
964 1103 /*
965 1104 * Kludge: character is too big to fit into an OCHAR operand.
966 1105 * Emit a singleton set.
967 1106 */
968 1107 if ((cs = allocset(p)) == NULL)
969 1108 return;
970 1109 CHadd(p, cs, ch);
971 1110 EMIT(OANYOF, (int)(cs - p->g->sets));
972 1111 }
973 1112 }
974 1113
975 1114 /*
976 1115 * nonnewline - emit REG_NEWLINE version of OANY
977 1116 *
978 1117 * Boy, is this implementation ever a kludge...
979 1118 */
980 1119 static void
981 1120 nonnewline(struct parse *p)
982 1121 {
983 1122 const char *oldnext = p->next;
984 1123 const char *oldend = p->end;
985 1124 char bracket[4];
986 1125
987 1126 p->next = bracket;
988 1127 p->end = bracket+3;
989 1128 bracket[0] = '^';
990 1129 bracket[1] = '\n';
991 1130 bracket[2] = ']';
992 1131 bracket[3] = '\0';
993 1132 p_bracket(p);
994 1133 assert(p->next == bracket+3);
995 1134 p->next = oldnext;
996 1135 p->end = oldend;
997 1136 }
998 1137
999 1138 /*
1000 1139 * repeat - generate code for a bounded repetition, recursively if needed
1001 1140 */
1002 1141 static void
1003 1142 repeat(struct parse *p,
1004 1143 sopno start, /* operand from here to end of strip */
1005 1144 int from, /* repeated from this number */
1006 1145 int to) /* to this number of times (maybe INFINITY) */
1007 1146 {
1008 1147 sopno finish = HERE();
1009 1148 #define N 2
1010 1149 #define INF 3
1011 1150 #define REP(f, t) ((f)*8 + (t))
1012 1151 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1013 1152 sopno copy;
1014 1153
1015 1154 if (p->error != 0) /* head off possible runaway recursion */
1016 1155 return;
1017 1156
1018 1157 assert(from <= to);
1019 1158
1020 1159 switch (REP(MAP(from), MAP(to))) {
1021 1160 case REP(0, 0): /* must be user doing this */
1022 1161 DROP(finish-start); /* drop the operand */
1023 1162 break;
1024 1163 case REP(0, 1): /* as x{1,1}? */
1025 1164 case REP(0, N): /* as x{1,n}? */
1026 1165 case REP(0, INF): /* as x{1,}? */
1027 1166 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1028 1167 INSERT(OCH_, start); /* offset is wrong... */
1029 1168 repeat(p, start+1, 1, to);
1030 1169 ASTERN(OOR1, start);
1031 1170 AHEAD(start); /* ... fix it */
1032 1171 EMIT(OOR2, 0);
1033 1172 AHEAD(THERE());
1034 1173 ASTERN(O_CH, THERETHERE());
1035 1174 break;
1036 1175 case REP(1, 1): /* trivial case */
1037 1176 /* done */
1038 1177 break;
1039 1178 case REP(1, N): /* as x?x{1,n-1} */
1040 1179 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1041 1180 INSERT(OCH_, start);
1042 1181 ASTERN(OOR1, start);
1043 1182 AHEAD(start);
1044 1183 EMIT(OOR2, 0); /* offset very wrong... */
1045 1184 AHEAD(THERE()); /* ...so fix it */
1046 1185 ASTERN(O_CH, THERETHERE());
1047 1186 copy = dupl(p, start+1, finish+1);
1048 1187 assert(copy == finish+4);
1049 1188 repeat(p, copy, 1, to-1);
1050 1189 break;
1051 1190 case REP(1, INF): /* as x+ */
1052 1191 INSERT(OPLUS_, start);
1053 1192 ASTERN(O_PLUS, start);
1054 1193 break;
1055 1194 case REP(N, N): /* as xx{m-1,n-1} */
1056 1195 copy = dupl(p, start, finish);
1057 1196 repeat(p, copy, from-1, to-1);
1058 1197 break;
1059 1198 case REP(N, INF): /* as xx{n-1,INF} */
1060 1199 copy = dupl(p, start, finish);
1061 1200 repeat(p, copy, from-1, to);
1062 1201 break;
1063 1202 default: /* "can't happen" */
1064 1203 SETERROR(REG_EFATAL); /* just in case */
1065 1204 break;
1066 1205 }
1067 1206 }
1068 1207
1069 1208 /*
1070 1209 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1071 1210 * character from the parse struct, signals a REG_ILLSEQ error if the
1072 1211 * character can't be converted. Returns the number of bytes consumed.
1073 1212 */
1074 1213 static wint_t
1075 1214 wgetnext(struct parse *p)
1076 1215 {
1077 1216 mbstate_t mbs;
1078 1217 wchar_t wc;
1079 1218 size_t n;
1080 1219
1081 1220 (void) memset(&mbs, 0, sizeof (mbs));
1082 1221 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1083 1222 if (n == (size_t)-1 || n == (size_t)-2) {
1084 1223 SETERROR(REG_ECHAR);
1085 1224 return (0);
1086 1225 }
1087 1226 if (n == 0)
1088 1227 n = 1;
1089 1228 p->next += n;
1090 1229 return (wc);
1091 1230 }
1092 1231
1093 1232 /*
1094 1233 * seterr - set an error condition
1095 1234 */
1096 1235 static int /* useless but makes type checking happy */
1097 1236 seterr(struct parse *p, int e)
1098 1237 {
1099 1238 if (p->error == 0) /* keep earliest error condition */
1100 1239 p->error = e;
1101 1240 p->next = nuls; /* try to bring things to a halt */
1102 1241 p->end = nuls;
1103 1242 return (0); /* make the return value well-defined */
1104 1243 }
1105 1244
1106 1245 /*
1107 1246 * allocset - allocate a set of characters for []
1108 1247 */
1109 1248 static cset *
1110 1249 allocset(struct parse *p)
1111 1250 {
1112 1251 cset *cs, *ncs;
1113 1252
1114 1253 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1115 1254 if (ncs == NULL) {
1116 1255 SETERROR(REG_ESPACE);
1117 1256 return (NULL);
1118 1257 }
1119 1258 p->g->sets = ncs;
1120 1259 cs = &p->g->sets[p->g->ncsets++];
1121 1260 (void) memset(cs, 0, sizeof (*cs));
1122 1261
1123 1262 return (cs);
1124 1263 }
1125 1264
1126 1265 /*
1127 1266 * freeset - free a now-unused set
1128 1267 */
1129 1268 static void
1130 1269 freeset(struct parse *p, cset *cs)
1131 1270 {
1132 1271 cset *top = &p->g->sets[p->g->ncsets];
1133 1272
1134 1273 free(cs->wides);
1135 1274 free(cs->ranges);
1136 1275 free(cs->types);
1137 1276 (void) memset(cs, 0, sizeof (*cs));
1138 1277 if (cs == top-1) /* recover only the easy case */
1139 1278 p->g->ncsets--;
1140 1279 }
1141 1280
1142 1281 /*
1143 1282 * singleton - Determine whether a set contains only one character,
1144 1283 * returning it if so, otherwise returning OUT.
1145 1284 */
1146 1285 static wint_t
1147 1286 singleton(cset *cs)
1148 1287 {
1149 1288 wint_t i, s, n;
1150 1289
1151 1290 for (i = n = 0; i < NC; i++)
1152 1291 if (CHIN(cs, i)) {
1153 1292 n++;
1154 1293 s = i;
1155 1294 }
1156 1295 if (n == 1)
1157 1296 return (s);
1158 1297 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1159 1298 cs->icase == 0)
1160 1299 return (cs->wides[0]);
1161 1300 /* Don't bother handling the other cases. */
1162 1301 return (OUT);
1163 1302 }
1164 1303
1165 1304 /*
1166 1305 * CHadd - add character to character set.
1167 1306 */
1168 1307 static void
1169 1308 CHadd(struct parse *p, cset *cs, wint_t ch)
1170 1309 {
1171 1310 wint_t nch, *newwides;
1172 1311 assert(ch >= 0);
1173 1312 if (ch < NC)
1174 1313 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1175 1314 else {
1176 1315 newwides = realloc(cs->wides, (cs->nwides + 1) *
1177 1316 sizeof (*cs->wides));
1178 1317 if (newwides == NULL) {
1179 1318 SETERROR(REG_ESPACE);
1180 1319 return;
1181 1320 }
1182 1321 cs->wides = newwides;
1183 1322 cs->wides[cs->nwides++] = ch;
1184 1323 }
1185 1324 if (cs->icase) {
1186 1325 if ((nch = towlower(ch)) < NC)
1187 1326 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1188 1327 if ((nch = towupper(ch)) < NC)
1189 1328 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1190 1329 }
1191 1330 }
1192 1331
1193 1332 /*
1194 1333 * CHaddrange - add all characters in the range [min,max] to a character set.
1195 1334 */
1196 1335 static void
1197 1336 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1198 1337 {
1199 1338 crange *newranges;
1200 1339
1201 1340 for (; min < NC && min <= max; min++)
1202 1341 CHadd(p, cs, min);
1203 1342 if (min >= max)
1204 1343 return;
1205 1344 newranges = realloc(cs->ranges, (cs->nranges + 1) *
1206 1345 sizeof (*cs->ranges));
1207 1346 if (newranges == NULL) {
1208 1347 SETERROR(REG_ESPACE);
1209 1348 return;
1210 1349 }
1211 1350 cs->ranges = newranges;
1212 1351 cs->ranges[cs->nranges].min = min;
1213 1352 cs->ranges[cs->nranges].max = max;
1214 1353 cs->nranges++;
1215 1354 }
1216 1355
1217 1356 /*
1218 1357 * CHaddtype - add all characters of a certain type to a character set.
1219 1358 */
1220 1359 static void
1221 1360 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1222 1361 {
1223 1362 wint_t i;
1224 1363 wctype_t *newtypes;
1225 1364
1226 1365 for (i = 0; i < NC; i++)
1227 1366 if (iswctype(i, wct))
1228 1367 CHadd(p, cs, i);
1229 1368 newtypes = realloc(cs->types, (cs->ntypes + 1) *
1230 1369 sizeof (*cs->types));
1231 1370 if (newtypes == NULL) {
1232 1371 SETERROR(REG_ESPACE);
1233 1372 return;
1234 1373 }
1235 1374 cs->types = newtypes;
1236 1375 cs->types[cs->ntypes++] = wct;
1237 1376 }
1238 1377
1239 1378 /*
1240 1379 * dupl - emit a duplicate of a bunch of sops
1241 1380 */
1242 1381 static sopno /* start of duplicate */
1243 1382 dupl(struct parse *p,
1244 1383 sopno start, /* from here */
1245 1384 sopno finish) /* to this less one */
1246 1385 {
1247 1386 sopno ret = HERE();
1248 1387 sopno len = finish - start;
1249 1388
1250 1389 assert(finish >= start);
1251 1390 if (len == 0)
1252 1391 return (ret);
1253 1392 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1254 1393 return (ret);
1255 1394 assert(p->ssize >= p->slen + len);
1256 1395 (void) memcpy((char *)(p->strip + p->slen),
1257 1396 (char *)(p->strip + start), (size_t)len*sizeof (sop));
1258 1397 p->slen += len;
1259 1398 return (ret);
1260 1399 }
1261 1400
1262 1401 /*
1263 1402 * doemit - emit a strip operator
1264 1403 *
1265 1404 * It might seem better to implement this as a macro with a function as
1266 1405 * hard-case backup, but it's just too big and messy unless there are
1267 1406 * some changes to the data structures. Maybe later.
1268 1407 */
1269 1408 static void
1270 1409 doemit(struct parse *p, sop op, size_t opnd)
1271 1410 {
1272 1411 /* avoid making error situations worse */
1273 1412 if (p->error != 0)
1274 1413 return;
1275 1414
1276 1415 /* deal with oversize operands ("can't happen", more or less) */
1277 1416 assert(opnd < 1<<OPSHIFT);
1278 1417
1279 1418 /* deal with undersized strip */
1280 1419 if (p->slen >= p->ssize)
1281 1420 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1282 1421 return;
1283 1422
1284 1423 /* finally, it's all reduced to the easy case */
1285 1424 p->strip[p->slen++] = SOP(op, opnd);
1286 1425 }
1287 1426
1288 1427 /*
1289 1428 * doinsert - insert a sop into the strip
1290 1429 */
1291 1430 static void
1292 1431 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1293 1432 {
1294 1433 sopno sn;
1295 1434 sop s;
1296 1435 int i;
1297 1436
1298 1437 /* avoid making error situations worse */
1299 1438 if (p->error != 0)
1300 1439 return;
1301 1440
1302 1441 sn = HERE();
1303 1442 EMIT(op, opnd); /* do checks, ensure space */
1304 1443 assert(HERE() == sn+1);
1305 1444 s = p->strip[sn];
1306 1445
1307 1446 /* adjust paren pointers */
1308 1447 assert(pos > 0);
1309 1448 for (i = 1; i < NPAREN; i++) {
1310 1449 if (p->pbegin[i] >= pos) {
1311 1450 p->pbegin[i]++;
1312 1451 }
1313 1452 if (p->pend[i] >= pos) {
1314 1453 p->pend[i]++;
1315 1454 }
1316 1455 }
1317 1456
1318 1457 (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1319 1458 (HERE()-pos-1)*sizeof (sop));
1320 1459 p->strip[pos] = s;
1321 1460 }
1322 1461
1323 1462 /*
1324 1463 * dofwd - complete a forward reference
1325 1464 */
1326 1465 static void
1327 1466 dofwd(struct parse *p, sopno pos, sop value)
1328 1467 {
1329 1468 /* avoid making error situations worse */
1330 1469 if (p->error != 0)
1331 1470 return;
1332 1471
1333 1472 assert(value < 1<<OPSHIFT);
1334 1473 p->strip[pos] = OP(p->strip[pos]) | value;
1335 1474 }
1336 1475
1337 1476 /*
1338 1477 * enlarge - enlarge the strip
1339 1478 */
1340 1479 static int
1341 1480 enlarge(struct parse *p, sopno size)
1342 1481 {
1343 1482 sop *sp;
1344 1483
1345 1484 if (p->ssize >= size)
1346 1485 return (1);
1347 1486
1348 1487 sp = (sop *)realloc(p->strip, size*sizeof (sop));
1349 1488 if (sp == NULL) {
1350 1489 SETERROR(REG_ESPACE);
1351 1490 return (0);
1352 1491 }
1353 1492 p->strip = sp;
1354 1493 p->ssize = size;
1355 1494 return (1);
1356 1495 }
1357 1496
1358 1497 /*
1359 1498 * stripsnug - compact the strip
1360 1499 */
1361 1500 static void
1362 1501 stripsnug(struct parse *p, struct re_guts *g)
1363 1502 {
1364 1503 g->nstates = p->slen;
1365 1504 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1366 1505 if (g->strip == NULL) {
1367 1506 SETERROR(REG_ESPACE);
1368 1507 g->strip = p->strip;
1369 1508 }
1370 1509 }
1371 1510
1372 1511 /*
1373 1512 * findmust - fill in must and mlen with longest mandatory literal string
1374 1513 *
1375 1514 * This algorithm could do fancy things like analyzing the operands of |
1376 1515 * for common subsequences. Someday. This code is simple and finds most
1377 1516 * of the interesting cases.
1378 1517 *
1379 1518 * Note that must and mlen got initialized during setup.
1380 1519 */
1381 1520 static void
1382 1521 findmust(struct parse *p, struct re_guts *g)
1383 1522 {
1384 1523 sop *scan;
1385 1524 sop *start = NULL;
1386 1525 sop *newstart = NULL;
1387 1526 sopno newlen;
1388 1527 sop s;
1389 1528 char *cp;
1390 1529 int offset;
1391 1530 char buf[MB_LEN_MAX];
1392 1531 size_t clen;
1393 1532 mbstate_t mbs;
1394 1533 locale_t loc = uselocale(NULL);
1395 1534
1396 1535 /* avoid making error situations worse */
1397 1536 if (p->error != 0)
1398 1537 return;
1399 1538
1400 1539 /*
1401 1540 * It's not generally safe to do a ``char'' substring search on
1402 1541 * multibyte character strings, but it's safe for at least
1403 1542 * UTF-8 (see RFC 3629).
1404 1543 */
1405 1544 if (MB_CUR_MAX > 1 &&
1406 1545 strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1407 1546 return;
1408 1547
1409 1548 /* find the longest OCHAR sequence in strip */
1410 1549 newlen = 0;
1411 1550 offset = 0;
1412 1551 g->moffset = 0;
1413 1552 scan = g->strip + 1;
1414 1553 do {
1415 1554 s = *scan++;
1416 1555 switch (OP(s)) {
1417 1556 case OCHAR: /* sequence member */
1418 1557 if (newlen == 0) { /* new sequence */
1419 1558 (void) memset(&mbs, 0, sizeof (mbs));
1420 1559 newstart = scan - 1;
1421 1560 }
1422 1561 clen = wcrtomb(buf, OPND(s), &mbs);
1423 1562 if (clen == (size_t)-1)
1424 1563 goto toohard;
1425 1564 newlen += clen;
1426 1565 break;
1427 1566 case OPLUS_: /* things that don't break one */
1428 1567 case OLPAREN:
↓ open down ↓ |
522 lines elided |
↑ open up ↑ |
1429 1568 case ORPAREN:
1430 1569 break;
1431 1570 case OQUEST_: /* things that must be skipped */
1432 1571 case OCH_:
1433 1572 offset = altoffset(scan, offset);
1434 1573 scan--;
1435 1574 do {
1436 1575 scan += OPND(s);
1437 1576 s = *scan;
1438 1577 /* assert() interferes w debug printouts */
1439 - if (OP(s) != O_QUEST && OP(s) != O_CH &&
1440 - OP(s) != OOR2) {
1578 + if (OP(s) != (sop)O_QUEST &&
1579 + OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1441 1580 g->iflags |= BAD;
1442 1581 return;
1443 1582 }
1444 - } while (OP(s) != O_QUEST && OP(s) != O_CH);
1583 + } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1445 1584 /* FALLTHROUGH */
1446 1585 case OBOW: /* things that break a sequence */
1447 1586 case OEOW:
1448 1587 case OBOL:
1449 1588 case OEOL:
1450 1589 case O_QUEST:
1451 1590 case O_CH:
1452 1591 case OEND:
1453 - if (newlen > g->mlen) { /* ends one */
1592 + if (newlen > (sopno)g->mlen) { /* ends one */
1454 1593 start = newstart;
1455 1594 g->mlen = newlen;
1456 1595 if (offset > -1) {
1457 1596 g->moffset += offset;
1458 1597 offset = newlen;
1459 1598 } else
1460 1599 g->moffset = offset;
1461 1600 } else {
1462 1601 if (offset > -1)
1463 1602 offset += newlen;
1464 1603 }
1465 1604 newlen = 0;
1466 1605 break;
1467 1606 case OANY:
1468 - if (newlen > g->mlen) { /* ends one */
1607 + if (newlen > (sopno)g->mlen) { /* ends one */
1469 1608 start = newstart;
1470 1609 g->mlen = newlen;
1471 1610 if (offset > -1) {
1472 1611 g->moffset += offset;
1473 1612 offset = newlen;
1474 1613 } else
1475 1614 g->moffset = offset;
1476 1615 } else {
1477 1616 if (offset > -1)
1478 1617 offset += newlen;
1479 1618 }
1480 1619 if (offset > -1)
1481 1620 offset++;
1482 1621 newlen = 0;
1483 1622 break;
1484 1623 case OANYOF: /* may or may not invalidate offset */
1485 1624 /* First, everything as OANY */
1486 - if (newlen > g->mlen) { /* ends one */
1625 + if (newlen > (sopno)g->mlen) { /* ends one */
1487 1626 start = newstart;
1488 1627 g->mlen = newlen;
1489 1628 if (offset > -1) {
1490 1629 g->moffset += offset;
1491 1630 offset = newlen;
1492 1631 } else
1493 1632 g->moffset = offset;
1494 1633 } else {
1495 1634 if (offset > -1)
1496 1635 offset += newlen;
1497 1636 }
1498 1637 if (offset > -1)
1499 1638 offset++;
↓ open down ↓ |
3 lines elided |
↑ open up ↑ |
1500 1639 newlen = 0;
1501 1640 break;
1502 1641 toohard:
1503 1642 default:
1504 1643 /*
1505 1644 * Anything here makes it impossible or too hard
1506 1645 * to calculate the offset -- so we give up;
1507 1646 * save the last known good offset, in case the
1508 1647 * must sequence doesn't occur later.
1509 1648 */
1510 - if (newlen > g->mlen) { /* ends one */
1649 + if (newlen > (sopno)g->mlen) { /* ends one */
1511 1650 start = newstart;
1512 1651 g->mlen = newlen;
1513 1652 if (offset > -1)
1514 1653 g->moffset += offset;
1515 1654 else
1516 1655 g->moffset = offset;
1517 1656 }
1518 1657 offset = -1;
1519 1658 newlen = 0;
1520 1659 break;
1521 1660 }
1522 1661 } while (OP(s) != OEND);
1523 1662
1524 1663 if (g->mlen == 0) { /* there isn't one */
1525 1664 g->moffset = -1;
1526 1665 return;
1527 1666 }
1528 1667
1529 1668 /* turn it into a character string */
1530 1669 g->must = malloc((size_t)g->mlen + 1);
1531 1670 if (g->must == NULL) { /* argh; just forget it */
1532 1671 g->mlen = 0;
1533 1672 g->moffset = -1;
1534 1673 return;
1535 1674 }
1536 1675 cp = g->must;
1537 1676 scan = start;
1538 1677 (void) memset(&mbs, 0, sizeof (mbs));
1539 1678 while (cp < g->must + g->mlen) {
1540 1679 while (OP(s = *scan++) != OCHAR)
1541 1680 continue;
1542 1681 clen = wcrtomb(cp, OPND(s), &mbs);
1543 1682 assert(clen != (size_t)-1);
1544 1683 cp += clen;
1545 1684 }
1546 1685 assert(cp == g->must + g->mlen);
1547 1686 *cp++ = '\0'; /* just on general principles */
1548 1687 }
1549 1688
1550 1689 /*
1551 1690 * altoffset - choose biggest offset among multiple choices
1552 1691 *
1553 1692 * Compute, recursively if necessary, the largest offset among multiple
1554 1693 * re paths.
1555 1694 */
1556 1695 static int
1557 1696 altoffset(sop *scan, int offset)
1558 1697 {
1559 1698 int largest;
↓ open down ↓ |
39 lines elided |
↑ open up ↑ |
1560 1699 int try;
1561 1700 sop s;
1562 1701
1563 1702 /* If we gave up already on offsets, return */
1564 1703 if (offset == -1)
1565 1704 return (-1);
1566 1705
1567 1706 largest = 0;
1568 1707 try = 0;
1569 1708 s = *scan++;
1570 - while (OP(s) != O_QUEST && OP(s) != O_CH) {
1709 + while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1571 1710 switch (OP(s)) {
1572 1711 case OOR1:
1573 1712 if (try > largest)
1574 1713 largest = try;
1575 1714 try = 0;
1576 1715 break;
1577 1716 case OQUEST_:
1578 1717 case OCH_:
1579 1718 try = altoffset(scan, try);
1580 1719 if (try == -1)
1581 1720 return (-1);
1582 1721 scan--;
1583 1722 do {
1584 1723 scan += OPND(s);
1585 1724 s = *scan;
1586 - if (OP(s) != O_QUEST && OP(s) != O_CH &&
1587 - OP(s) != OOR2)
1725 + if (OP(s) != (sop)O_QUEST &&
1726 + OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1588 1727 return (-1);
1589 - } while (OP(s) != O_QUEST && OP(s) != O_CH);
1728 + } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1590 1729 /*
1591 1730 * We must skip to the next position, or we'll
1592 1731 * leave altoffset() too early.
1593 1732 */
1594 1733 scan++;
1595 1734 break;
1596 1735 case OANYOF:
1597 1736 case OCHAR:
1598 1737 case OANY:
1599 1738 try++;
1600 1739 /*FALLTHRU*/
1601 1740 case OBOW:
1602 1741 case OEOW:
1603 1742 case OLPAREN:
1604 1743 case ORPAREN:
1605 1744 case OOR2:
1606 1745 break;
1607 1746 default:
1608 1747 try = -1;
1609 1748 break;
1610 1749 }
1611 1750 if (try == -1)
1612 1751 return (-1);
1613 1752 s = *scan++;
1614 1753 }
1615 1754
1616 1755 if (try > largest)
1617 1756 largest = try;
1618 1757
1619 1758 return (largest+offset);
1620 1759 }
1621 1760
1622 1761 /*
1623 1762 * computejumps - compute char jumps for BM scan
1624 1763 *
1625 1764 * This algorithm assumes g->must exists and is has size greater than
1626 1765 * zero. It's based on the algorithm found on Computer Algorithms by
1627 1766 * Sara Baase.
1628 1767 *
1629 1768 * A char jump is the number of characters one needs to jump based on
1630 1769 * the value of the character from the text that was mismatched.
1631 1770 */
1632 1771 static void
1633 1772 computejumps(struct parse *p, struct re_guts *g)
1634 1773 {
1635 1774 int ch;
1636 1775 int mindex;
1637 1776
1638 1777 /* Avoid making errors worse */
1639 1778 if (p->error != 0)
1640 1779 return;
1641 1780
1642 1781 g->charjump = (int *)malloc((NC + 1) * sizeof (int));
1643 1782 if (g->charjump == NULL) /* Not a fatal error */
1644 1783 return;
1645 1784 /* Adjust for signed chars, if necessary */
1646 1785 g->charjump = &g->charjump[-(CHAR_MIN)];
1647 1786
1648 1787 /*
1649 1788 * If the character does not exist in the pattern, the jump
1650 1789 * is equal to the number of characters in the pattern.
1651 1790 */
1652 1791 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1653 1792 g->charjump[ch] = g->mlen;
1654 1793
1655 1794 /*
1656 1795 * If the character does exist, compute the jump that would
1657 1796 * take us to the last character in the pattern equal to it
1658 1797 * (notice that we match right to left, so that last character
1659 1798 * is the first one that would be matched).
1660 1799 */
1661 1800 for (mindex = 0; mindex < g->mlen; mindex++)
1662 1801 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1663 1802 }
1664 1803
1665 1804 /*
1666 1805 * computematchjumps - compute match jumps for BM scan
1667 1806 *
1668 1807 * This algorithm assumes g->must exists and is has size greater than
1669 1808 * zero. It's based on the algorithm found on Computer Algorithms by
1670 1809 * Sara Baase.
1671 1810 *
1672 1811 * A match jump is the number of characters one needs to advance based
1673 1812 * on the already-matched suffix.
1674 1813 * Notice that all values here are minus (g->mlen-1), because of the way
1675 1814 * the search algorithm works.
1676 1815 */
1677 1816 static void
1678 1817 computematchjumps(struct parse *p, struct re_guts *g)
1679 1818 {
1680 1819 int mindex; /* General "must" iterator */
1681 1820 int suffix; /* Keeps track of matching suffix */
1682 1821 int ssuffix; /* Keeps track of suffixes' suffix */
1683 1822 int *pmatches;
1684 1823 /*
1685 1824 * pmatches[k] points to the next i
1686 1825 * such that i+1...mlen is a substring
1687 1826 * of k+1...k+mlen-i-1
1688 1827 */
1689 1828
1690 1829 /* Avoid making errors worse */
1691 1830 if (p->error != 0)
1692 1831 return;
1693 1832
1694 1833 pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1695 1834 if (pmatches == NULL) {
1696 1835 g->matchjump = NULL;
1697 1836 return;
1698 1837 }
1699 1838
1700 1839 g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1701 1840 if (g->matchjump == NULL) { /* Not a fatal error */
1702 1841 free(pmatches);
1703 1842 return;
1704 1843 }
1705 1844
1706 1845 /* Set maximum possible jump for each character in the pattern */
1707 1846 for (mindex = 0; mindex < g->mlen; mindex++)
1708 1847 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1709 1848
1710 1849 /* Compute pmatches[] */
1711 1850 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1712 1851 mindex--, suffix--) {
1713 1852 pmatches[mindex] = suffix;
1714 1853
1715 1854 /*
1716 1855 * If a mismatch is found, interrupting the substring,
1717 1856 * compute the matchjump for that position. If no
1718 1857 * mismatch is found, then a text substring mismatched
1719 1858 * against the suffix will also mismatch against the
1720 1859 * substring.
1721 1860 */
1722 1861 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1723 1862 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1724 1863 g->mlen - mindex - 1);
1725 1864 suffix = pmatches[suffix];
1726 1865 }
1727 1866 }
1728 1867
1729 1868 /*
1730 1869 * Compute the matchjump up to the last substring found to jump
1731 1870 * to the beginning of the largest must pattern prefix matching
1732 1871 * it's own suffix.
1733 1872 */
1734 1873 for (mindex = 0; mindex <= suffix; mindex++)
1735 1874 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1736 1875 g->mlen + suffix - mindex);
1737 1876
1738 1877 ssuffix = pmatches[suffix];
1739 1878 while (suffix < g->mlen) {
1740 1879 while (suffix <= ssuffix && suffix < g->mlen) {
1741 1880 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1742 1881 g->mlen + ssuffix - suffix);
1743 1882 suffix++;
1744 1883 }
1745 1884 if (suffix < g->mlen)
1746 1885 ssuffix = pmatches[ssuffix];
1747 1886 }
1748 1887
1749 1888 free(pmatches);
1750 1889 }
1751 1890
1752 1891 /*
1753 1892 * pluscount - count + nesting
1754 1893 */
1755 1894 static sopno /* nesting depth */
1756 1895 pluscount(struct parse *p, struct re_guts *g)
1757 1896 {
1758 1897 sop *scan;
1759 1898 sop s;
1760 1899 sopno plusnest = 0;
1761 1900 sopno maxnest = 0;
1762 1901
1763 1902 if (p->error != 0)
1764 1903 return (0); /* there may not be an OEND */
1765 1904
1766 1905 scan = g->strip + 1;
1767 1906 do {
1768 1907 s = *scan++;
1769 1908 switch (OP(s)) {
1770 1909 case OPLUS_:
1771 1910 plusnest++;
1772 1911 break;
1773 1912 case O_PLUS:
1774 1913 if (plusnest > maxnest)
1775 1914 maxnest = plusnest;
1776 1915 plusnest--;
1777 1916 break;
1778 1917 }
1779 1918 } while (OP(s) != OEND);
1780 1919 if (plusnest != 0)
1781 1920 g->iflags |= BAD;
1782 1921 return (maxnest);
1783 1922 }
↓ open down ↓ |
184 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX