1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright 2012 Milan Jurik. All rights reserved.
5 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 * Copyright (c) 1992, 1993, 1994
7 * The Regents of the University of California. All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * Henry Spencer.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37 #include "lint.h"
38 #include "file64.h"
39 #include <sys/types.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <limits.h>
44 #include <regex.h>
45 #include <stdlib.h>
46 #include <stdbool.h>
47 #include <wchar.h>
48 #include <wctype.h>
49
50 #include "../locale/runetype.h"
51 #include "../locale/collate.h"
52
53 #include "utils.h"
54 #include "regex2.h"
55
56 #include "cname.h"
57 #include "../locale/mblocal.h"
58
59 /*
60 * Branching context, used to keep track of branch state for all of the branch-
61 * aware functions. In addition to keeping track of branch positions for the
62 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
63 * detection of whether ^ is acting as an anchor or being used erroneously and
64 * also for whether we're in a sub-expression or not.
65 */
66 struct branchc {
67 sopno start;
68 sopno back;
69 sopno fwd;
70
71 int nbranch;
72 int nchain;
73 bool outer;
74 bool terminate;
75 };
76
77 /*
78 * parse structure, passed up and down to avoid global variables and
79 * other clumsinesses
80 */
81 struct parse {
82 const char *next; /* next character in RE */
83 const char *end; /* end of string (-> NUL normally) */
84 int error; /* has an error been seen? */
85 sop *strip; /* malloced strip */
86 sopno ssize; /* malloced strip size (allocated) */
87 sopno slen; /* malloced strip length (used) */
88 int ncsalloc; /* number of csets allocated */
89 struct re_guts *g;
90 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
91 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
92 sopno pend[NPAREN]; /* -> ) ([0] unused) */
93 bool allowbranch; /* can this expression branch? */
94 bool bre; /* convenience; is this a BRE? */
95 bool (*parse_expr)(struct parse *, struct branchc *);
96 void (*pre_parse)(struct parse *, struct branchc *);
97 void (*post_parse)(struct parse *, struct branchc *);
98 };
99
100 /* ========= begin header generated by ./mkh ========= */
101 #ifdef __cplusplus
102 extern "C" {
103 #endif
104
105 /* === regcomp.c === */
106 static bool p_ere_exp(struct parse *p, struct branchc *bc);
107 static void p_str(struct parse *p);
108 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
109 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
110 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
111 static bool p_branch_empty(struct parse *p, struct branchc *bc);
112 static bool p_branch_do(struct parse *p, struct branchc *bc);
113 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
114 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
115 static void p_re(struct parse *p, int end1, int end2);
116 static bool p_simp_re(struct parse *p, struct branchc *bc);
117 static int p_count(struct parse *p);
118 static void p_bracket(struct parse *p);
119 static void p_b_term(struct parse *p, cset *cs);
120 static void p_b_cclass(struct parse *p, cset *cs);
121 static void p_b_eclass(struct parse *p, cset *cs);
122 static wint_t p_b_symbol(struct parse *p);
123 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
124 static wint_t othercase(wint_t ch);
125 static void bothcases(struct parse *p, wint_t ch);
126 static void ordinary(struct parse *p, wint_t ch);
127 static void nonnewline(struct parse *p);
128 static void repeat(struct parse *p, sopno start, int from, int to);
129 static int seterr(struct parse *p, int e);
130 static cset *allocset(struct parse *p);
131 static void freeset(struct parse *p, cset *cs);
132 static void CHadd(struct parse *p, cset *cs, wint_t ch);
133 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
134 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
135 static wint_t singleton(cset *cs);
136 static sopno dupl(struct parse *p, sopno start, sopno finish);
137 static void doemit(struct parse *p, sop op, size_t opnd);
138 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
139 static void dofwd(struct parse *p, sopno pos, sop value);
140 static int enlarge(struct parse *p, sopno size);
141 static void stripsnug(struct parse *p, struct re_guts *g);
142 static void findmust(struct parse *p, struct re_guts *g);
143 static int altoffset(sop *scan, int offset);
144 static void computejumps(struct parse *p, struct re_guts *g);
145 static void computematchjumps(struct parse *p, struct re_guts *g);
146 static sopno pluscount(struct parse *p, struct re_guts *g);
147 static wint_t wgetnext(struct parse *p);
148
149 #ifdef __cplusplus
150 }
151 #endif
152 /* ========= end header generated by ./mkh ========= */
153
154 static char nuls[10]; /* place to point scanner in event of error */
155
156 /*
157 * macros for use with parse structure
158 * BEWARE: these know that the parse structure is named `p' !!!
159 */
160 #define PEEK() (*p->next)
161 #define PEEK2() (*(p->next+1))
162 #define MORE() (p->next < p->end)
163 #define MORE2() (p->next+1 < p->end)
164 #define SEE(c) (MORE() && PEEK() == (c))
165 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
166 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
167 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
168 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
169 #define NEXT() (p->next++)
170 #define NEXT2() (p->next += 2)
171 #define NEXTn(n) (p->next += (n))
172 #define GETNEXT() (*p->next++)
173 #define WGETNEXT() wgetnext(p)
174 #define SETERROR(e) ((void)seterr(p, (e)))
175 #define REQUIRE(co, e) ((co) || seterr(p, e))
176 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
177 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
178 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
179 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
180 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
181 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
182 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
183 #define HERE() (p->slen)
184 #define THERE() (p->slen - 1)
185 #define THERETHERE() (p->slen - 2)
186 #define DROP(n) (p->slen -= (n))
187
188 #ifndef NDEBUG
189 static int never = 0; /* for use in asserts; shuts lint up */
190 #else
191 #define never 0 /* some <assert.h>s have bugs too */
192 #endif
193
194 /*
195 * regcomp - interface for parser and compilation
196 */
197 int /* 0 success, otherwise REG_something */
198 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
199 int cflags)
200 {
201 struct parse pa;
202 struct re_guts *g;
203 struct parse *p = &pa;
204 int i;
205 size_t len;
206 size_t maxlen;
207 #ifdef REDEBUG
208 #define GOODFLAGS(f) (f)
209 #else
210 #define GOODFLAGS(f) ((f)&~REG_DUMP)
211 #endif
212
213 /* We had REG_INVARG, but we don't have that on Solaris. */
214 cflags = GOODFLAGS(cflags);
215 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
216 return (REG_EFATAL);
217
218 if (cflags®_PEND) {
219 if (preg->re_endp < pattern)
220 return (REG_EFATAL);
221 len = preg->re_endp - pattern;
222 } else
223 len = strlen(pattern);
224
225 /* do the mallocs early so failure handling is easy */
226 g = (struct re_guts *)malloc(sizeof (struct re_guts));
227 if (g == NULL)
228 return (REG_ESPACE);
229 /*
230 * Limit the pattern space to avoid a 32-bit overflow on buffer
231 * extension. Also avoid any signed overflow in case of conversion
232 * so make the real limit based on a 31-bit overflow.
233 *
234 * Likely not applicable on 64-bit systems but handle the case
235 * generically (who are we to stop people from using ~715MB+
236 * patterns?).
237 */
238 maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
239 if (len >= maxlen) {
240 free((char *)g);
241 return (REG_ESPACE);
242 }
243 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
244 assert(p->ssize >= len);
245
246 p->strip = (sop *)malloc(p->ssize * sizeof (sop));
247 p->slen = 0;
248 if (p->strip == NULL) {
249 free((char *)g);
250 return (REG_ESPACE);
251 }
252
253 /* set things up */
254 p->g = g;
255 p->next = pattern; /* convenience; we do not modify it */
256 p->end = p->next + len;
257 p->error = 0;
258 p->ncsalloc = 0;
259 for (i = 0; i < NPAREN; i++) {
260 p->pbegin[i] = 0;
261 p->pend[i] = 0;
262 }
263 if (cflags & REG_EXTENDED) {
264 p->allowbranch = true;
265 p->bre = false;
266 p->parse_expr = p_ere_exp;
267 p->pre_parse = NULL;
268 p->post_parse = NULL;
269 } else {
270 p->allowbranch = false;
271 p->bre = true;
272 p->parse_expr = p_simp_re;
273 p->pre_parse = p_bre_pre_parse;
274 p->post_parse = p_bre_post_parse;
275 }
276 g->sets = NULL;
277 g->ncsets = 0;
278 g->cflags = cflags;
279 g->iflags = 0;
280 g->nbol = 0;
281 g->neol = 0;
282 g->must = NULL;
283 g->moffset = -1;
284 g->charjump = NULL;
285 g->matchjump = NULL;
286 g->mlen = 0;
287 g->nsub = 0;
288 g->backrefs = 0;
289
290 /* do it */
291 EMIT(OEND, 0);
292 g->firststate = THERE();
293 if (cflags & REG_NOSPEC)
294 p_str(p);
295 else
296 p_re(p, OUT, OUT);
297 EMIT(OEND, 0);
298 g->laststate = THERE();
299
300 /* tidy up loose ends and fill things in */
301 stripsnug(p, g);
302 findmust(p, g);
303 /*
304 * only use Boyer-Moore algorithm if the pattern is bigger
305 * than three characters
306 */
307 if (g->mlen > 3) {
308 computejumps(p, g);
309 computematchjumps(p, g);
310 if (g->matchjump == NULL && g->charjump != NULL) {
311 free(g->charjump);
312 g->charjump = NULL;
313 }
314 }
315 g->nplus = pluscount(p, g);
316 g->magic = MAGIC2;
317 preg->re_nsub = g->nsub;
318 preg->re_g = g;
319 preg->re_magic = MAGIC1;
320 #ifndef REDEBUG
321 /* not debugging, so can't rely on the assert() in regexec() */
322 if (g->iflags&BAD)
323 SETERROR(REG_EFATAL);
324 #endif
325
326 /* win or lose, we're done */
327 if (p->error != 0) /* lose */
328 regfree(preg);
329 return (p->error);
330 }
331
332 /*
333 * Parse one subERE, an atom possibly followed by a repetition op,
334 * return whether we should terminate or not.
335 */
336 static bool
337 p_ere_exp(struct parse *p, struct branchc *bc)
338 {
339 char c;
340 wint_t wc;
341 sopno pos;
342 int count;
343 int count2;
344 sopno subno;
345 int wascaret = 0;
346
347 (void) bc;
348 assert(MORE()); /* caller should have ensured this */
349 c = GETNEXT();
350
351 pos = HERE();
352 switch (c) {
353 case '(':
354 (void) REQUIRE(MORE(), REG_EPAREN);
355 p->g->nsub++;
356 subno = p->g->nsub;
357 if (subno < NPAREN)
358 p->pbegin[subno] = HERE();
359 EMIT(OLPAREN, subno);
360 if (!SEE(')'))
361 p_re(p, ')', IGN);
362 if (subno < NPAREN) {
363 p->pend[subno] = HERE();
364 assert(p->pend[subno] != 0);
365 }
366 EMIT(ORPAREN, subno);
367 (void) MUSTEAT(')', REG_EPAREN);
368 break;
369 #ifndef POSIX_MISTAKE
370 case ')': /* happens only if no current unmatched ( */
371 /*
372 * You may ask, why the ifndef? Because I didn't notice
373 * this until slightly too late for 1003.2, and none of the
374 * other 1003.2 regular-expression reviewers noticed it at
375 * all. So an unmatched ) is legal POSIX, at least until
376 * we can get it fixed.
377 */
378 SETERROR(REG_EPAREN);
379 break;
380 #endif
381 case '^':
382 EMIT(OBOL, 0);
383 p->g->iflags |= USEBOL;
384 p->g->nbol++;
385 wascaret = 1;
386 break;
387 case '$':
388 EMIT(OEOL, 0);
389 p->g->iflags |= USEEOL;
390 p->g->neol++;
391 break;
392 case '|':
393 SETERROR(REG_BADPAT);
394 break;
395 case '*':
396 case '+':
397 case '?':
398 case '{':
399 SETERROR(REG_BADRPT);
400 break;
401 case '.':
402 if (p->g->cflags®_NEWLINE)
403 nonnewline(p);
404 else
405 EMIT(OANY, 0);
406 break;
407 case '[':
408 p_bracket(p);
409 break;
410 case '\\':
411 (void) REQUIRE(MORE(), REG_EESCAPE);
412 wc = WGETNEXT();
413 switch (wc) {
414 case '<':
415 EMIT(OBOW, 0);
416 break;
417 case '>':
418 EMIT(OEOW, 0);
419 break;
420 default:
421 ordinary(p, wc);
422 break;
423 }
424 break;
425 default:
426 if (p->error != 0)
427 return (false);
428 p->next--;
429 wc = WGETNEXT();
430 ordinary(p, wc);
431 break;
432 }
433
434 if (!MORE())
435 return (false);
436 c = PEEK();
437 /* we call { a repetition if followed by a digit */
438 if (!(c == '*' || c == '+' || c == '?' || c == '{'))
439 return (false); /* no repetition, we're done */
440 else if (c == '{')
441 (void) REQUIRE(MORE2() && \
442 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
443 NEXT();
444
445 (void) REQUIRE(!wascaret, REG_BADRPT);
446 switch (c) {
447 case '*': /* implemented as +? */
448 /* this case does not require the (y|) trick, noKLUDGE */
449 INSERT(OPLUS_, pos);
450 ASTERN(O_PLUS, pos);
451 INSERT(OQUEST_, pos);
452 ASTERN(O_QUEST, pos);
453 break;
454 case '+':
455 INSERT(OPLUS_, pos);
456 ASTERN(O_PLUS, pos);
457 break;
458 case '?':
459 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
460 INSERT(OCH_, pos); /* offset slightly wrong */
461 ASTERN(OOR1, pos); /* this one's right */
462 AHEAD(pos); /* fix the OCH_ */
463 EMIT(OOR2, 0); /* offset very wrong... */
464 AHEAD(THERE()); /* ...so fix it */
465 ASTERN(O_CH, THERETHERE());
466 break;
467 case '{':
468 count = p_count(p);
469 if (EAT(',')) {
470 if (isdigit((uch)PEEK())) {
471 count2 = p_count(p);
472 (void) REQUIRE(count <= count2, REG_BADBR);
473 } else /* single number with comma */
474 count2 = INFINITY;
475 } else /* just a single number */
476 count2 = count;
477 repeat(p, pos, count, count2);
478 if (!EAT('}')) { /* error heuristics */
479 while (MORE() && PEEK() != '}')
480 NEXT();
481 (void) REQUIRE(MORE(), REG_EBRACE);
482 SETERROR(REG_BADBR);
483 }
484 break;
485 }
486
487 if (!MORE())
488 return (false);
489 c = PEEK();
490 if (!(c == '*' || c == '+' || c == '?' ||
491 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
492 return (false);
493 SETERROR(REG_BADRPT);
494 return (false);
495 }
496
497 /*
498 * p_str - string (no metacharacters) "parser"
499 */
500 static void
501 p_str(struct parse *p)
502 {
503 (void) REQUIRE(MORE(), REG_BADPAT);
504 while (MORE())
505 ordinary(p, WGETNEXT());
506 }
507
508 /*
509 * Eat consecutive branch delimiters for the kind of expression that we are
510 * parsing, return the number of delimiters that we ate.
511 */
512 static int
513 p_branch_eat_delim(struct parse *p, struct branchc *bc)
514 {
515 int nskip;
516
517 (void) bc;
518 nskip = 0;
519 while (EAT('|'))
520 ++nskip;
521 return (nskip);
522 }
523
524 /*
525 * Insert necessary branch book-keeping operations. This emits a
526 * bogus 'next' offset, since we still have more to parse
527 */
528 static void
529 p_branch_ins_offset(struct parse *p, struct branchc *bc)
530 {
531 if (bc->nbranch == 0) {
532 INSERT(OCH_, bc->start); /* offset is wrong */
533 bc->fwd = bc->start;
534 bc->back = bc->start;
535 }
536
537 ASTERN(OOR1, bc->back);
538 bc->back = THERE();
539 AHEAD(bc->fwd); /* fix previous offset */
540 bc->fwd = HERE();
541 EMIT(OOR2, 0); /* offset is very wrong */
542 ++bc->nbranch;
543 }
544
545 /*
546 * Fix the offset of the tail branch, if we actually had any branches.
547 * This is to correct the bogus placeholder offset that we use.
548 */
549 static void
550 p_branch_fix_tail(struct parse *p, struct branchc *bc)
551 {
552 /* Fix bogus offset at the tail if we actually have branches */
553 if (bc->nbranch > 0) {
554 AHEAD(bc->fwd);
555 ASTERN(O_CH, bc->back);
556 }
557 }
558
559 /*
560 * Signal to the parser that an empty branch has been encountered; this will,
561 * in the future, be used to allow for more permissive behavior with empty
562 * branches. The return value should indicate whether parsing may continue
563 * or not.
564 */
565 static bool
566 p_branch_empty(struct parse *p, struct branchc *bc)
567 {
568 (void) bc;
569 SETERROR(REG_BADPAT);
570 return (false);
571 }
572
573 /*
574 * Take care of any branching requirements. This includes inserting the
575 * appropriate branching instructions as well as eating all of the branch
576 * delimiters until we either run out of pattern or need to parse more pattern.
577 */
578 static bool
579 p_branch_do(struct parse *p, struct branchc *bc)
580 {
581 int ate = 0;
582
583 ate = p_branch_eat_delim(p, bc);
584 if (ate == 0)
585 return (false);
586 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
587 /*
588 * Halt parsing only if we have an empty branch and
589 * p_branch_empty indicates that we must not continue.
590 * In the future, this will not necessarily be an error.
591 */
592 return (false);
593 p_branch_ins_offset(p, bc);
594
595 return (true);
596 }
597
598 static void
599 p_bre_pre_parse(struct parse *p, struct branchc *bc)
600 {
601 (void) bc;
602 /*
603 * Does not move cleanly into expression parser because of
604 * ordinary interpration of * at the beginning position of
605 * an expression.
606 */
607 if (EAT('^')) {
608 EMIT(OBOL, 0);
609 p->g->iflags |= USEBOL;
610 p->g->nbol++;
611 }
612 }
613
614 static void
615 p_bre_post_parse(struct parse *p, struct branchc *bc)
616 {
617 /* Expression is terminating due to EOL token */
618 if (bc->terminate) {
619 DROP(1);
620 EMIT(OEOL, 0);
621 p->g->iflags |= USEEOL;
622 p->g->neol++;
623 }
624 }
625
626 /*
627 * Top level parser, concatenation and BRE anchoring.
628 * Giving end1 as OUT essentially eliminates the end1/end2 check.
629 *
630 * This implementation is a bit of a kludge, in that a trailing $ is first
631 * taken as an ordinary character and then revised to be an anchor.
632 * The amount of lookahead needed to avoid this kludge is excessive.
633 */
634 static void
635 p_re(struct parse *p,
636 int end1, /* first terminating character */
637 int end2) /* second terminating character; ignored for EREs */
638 {
639 struct branchc bc;
640
641 bc.nbranch = 0;
642 if (end1 == OUT && end2 == OUT)
643 bc.outer = true;
644 else
645 bc.outer = false;
646 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
647 for (;;) {
648 bc.start = HERE();
649 bc.nchain = 0;
650 bc.terminate = false;
651 if (p->pre_parse != NULL)
652 p->pre_parse(p, &bc);
653 while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
654 !SEEEND()) {
655 bc.terminate = p->parse_expr(p, &bc);
656 ++bc.nchain;
657 }
658 if (p->post_parse != NULL)
659 p->post_parse(p, &bc);
660 (void) REQUIRE(HERE() != bc.start, REG_BADPAT);
661 if (!p->allowbranch)
662 break;
663 /*
664 * p_branch_do's return value indicates whether we should
665 * continue parsing or not. This is both for correctness and
666 * a slight optimization, because it will check if we've
667 * encountered an empty branch or the end of the string
668 * immediately following a branch delimiter.
669 */
670 if (!p_branch_do(p, &bc))
671 break;
672 }
673 #undef SEE_END
674 if (p->allowbranch)
675 p_branch_fix_tail(p, &bc);
676 assert(!MORE() || SEE(end1));
677 }
678
679 /*
680 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
681 */
682 static bool /* was the simple RE an unbackslashed $? */
683 p_simp_re(struct parse *p, struct branchc *bc)
684 {
685 int c;
686 int count;
687 int count2;
688 sopno pos;
689 int i;
690 wint_t wc;
691 sopno subno;
692 #define BACKSL (1<<CHAR_BIT)
693
694 pos = HERE(); /* repetition op, if any, covers from here */
695
696 assert(MORE()); /* caller should have ensured this */
697 c = GETNEXT();
698 if (c == '\\') {
699 (void) REQUIRE(MORE(), REG_EESCAPE);
700 c = BACKSL | GETNEXT();
701 }
702 switch (c) {
703 case '.':
704 if (p->g->cflags®_NEWLINE)
705 nonnewline(p);
706 else
707 EMIT(OANY, 0);
708 break;
709 case '[':
710 p_bracket(p);
711 break;
712 case BACKSL|'<':
713 EMIT(OBOW, 0);
714 break;
715 case BACKSL|'>':
716 EMIT(OEOW, 0);
717 break;
718 case BACKSL|'{':
719 SETERROR(REG_BADRPT);
720 break;
721 case BACKSL|'(':
722 p->g->nsub++;
723 subno = p->g->nsub;
724 if (subno < NPAREN)
725 p->pbegin[subno] = HERE();
726 EMIT(OLPAREN, subno);
727 /* the MORE here is an error heuristic */
728 if (MORE() && !SEETWO('\\', ')'))
729 p_re(p, '\\', ')');
730 if (subno < NPAREN) {
731 p->pend[subno] = HERE();
732 assert(p->pend[subno] != 0);
733 }
734 EMIT(ORPAREN, subno);
735 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
736 break;
737 case BACKSL|')': /* should not get here -- must be user */
738 SETERROR(REG_EPAREN);
739 break;
740 case BACKSL|'1':
741 case BACKSL|'2':
742 case BACKSL|'3':
743 case BACKSL|'4':
744 case BACKSL|'5':
745 case BACKSL|'6':
746 case BACKSL|'7':
747 case BACKSL|'8':
748 case BACKSL|'9':
749 i = (c&~BACKSL) - '0';
750 assert(i < NPAREN);
751 if (p->pend[i] != 0) {
752 assert(i <= p->g->nsub);
753 EMIT(OBACK_, i);
754 assert(p->pbegin[i] != 0);
755 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
756 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
757 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
758 EMIT(O_BACK, i);
759 } else
760 SETERROR(REG_ESUBREG);
761 p->g->backrefs = 1;
762 break;
763 case '*':
764 /*
765 * Ordinary if used as the first character beyond BOL anchor of
766 * a (sub-)expression, counts as a bad repetition operator if it
767 * appears otherwise.
768 */
769 (void) REQUIRE(bc->nchain == 0, REG_BADRPT);
770 /* FALLTHROUGH */
771 default:
772 if (p->error != 0)
773 return (false); /* Definitely not $... */
774 p->next--;
775 wc = WGETNEXT();
776 ordinary(p, wc);
777 break;
778 }
779
780 if (EAT('*')) { /* implemented as +? */
781 /* this case does not require the (y|) trick, noKLUDGE */
782 INSERT(OPLUS_, pos);
783 ASTERN(O_PLUS, pos);
784 INSERT(OQUEST_, pos);
785 ASTERN(O_QUEST, pos);
786 } else if (EATTWO('\\', '{')) {
787 count = p_count(p);
788 if (EAT(',')) {
789 if (MORE() && isdigit((uch)PEEK())) {
790 count2 = p_count(p);
791 (void) REQUIRE(count <= count2, REG_BADBR);
792 } else /* single number with comma */
793 count2 = INFINITY;
794 } else /* just a single number */
795 count2 = count;
796 repeat(p, pos, count, count2);
797 if (!EATTWO('\\', '}')) { /* error heuristics */
798 while (MORE() && !SEETWO('\\', '}'))
799 NEXT();
800 (void) REQUIRE(MORE(), REG_EBRACE);
801 SETERROR(REG_BADBR);
802 }
803 } else if (c == '$') /* $ (but not \$) ends it */
804 return (true);
805
806 return (false);
807 }
808
809 /*
810 * p_count - parse a repetition count
811 */
812 static int /* the value */
813 p_count(struct parse *p)
814 {
815 int count = 0;
816 int ndigits = 0;
817
818 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
819 count = count*10 + (GETNEXT() - '0');
820 ndigits++;
821 }
822
823 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
824 return (count);
825 }
826
827 /*
828 * p_bracket - parse a bracketed character list
829 */
830 static void
831 p_bracket(struct parse *p)
832 {
833 cset *cs;
834 wint_t ch;
835
836 /* Dept of Truly Sickening Special-Case Kludges */
837 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
838 EMIT(OBOW, 0);
839 NEXTn(6);
840 return;
841 }
842 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
843 EMIT(OEOW, 0);
844 NEXTn(6);
845 return;
846 }
847
848 if ((cs = allocset(p)) == NULL)
849 return;
850
851 if (p->g->cflags®_ICASE)
852 cs->icase = 1;
853 if (EAT('^'))
854 cs->invert = 1;
855 if (EAT(']'))
856 CHadd(p, cs, ']');
857 else if (EAT('-'))
858 CHadd(p, cs, '-');
859 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
860 p_b_term(p, cs);
861 if (EAT('-'))
862 CHadd(p, cs, '-');
863 (void) MUSTEAT(']', REG_EBRACK);
864
865 if (p->error != 0) /* don't mess things up further */
866 return;
867
868 if (cs->invert && p->g->cflags®_NEWLINE)
869 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
870
871 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
872 ordinary(p, ch);
873 freeset(p, cs);
874 } else
875 EMIT(OANYOF, (int)(cs - p->g->sets));
876 }
877
878 /*
879 * p_b_term - parse one term of a bracketed character list
880 */
881 static void
882 p_b_term(struct parse *p, cset *cs)
883 {
884 char c;
885 wint_t start, finish;
886 wint_t i;
887 locale_t loc = uselocale(NULL);
888
889 /* classify what we've got */
890 switch ((MORE()) ? PEEK() : '\0') {
891 case '[':
892 c = (MORE2()) ? PEEK2() : '\0';
893 break;
894 case '-':
895 SETERROR(REG_ERANGE);
896 return; /* NOTE RETURN */
897 default:
898 c = '\0';
899 break;
900 }
901
902 switch (c) {
903 case ':': /* character class */
904 NEXT2();
905 (void) REQUIRE(MORE(), REG_EBRACK);
906 c = PEEK();
907 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
908 p_b_cclass(p, cs);
909 (void) REQUIRE(MORE(), REG_EBRACK);
910 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
911 break;
912 case '=': /* equivalence class */
913 NEXT2();
914 (void) REQUIRE(MORE(), REG_EBRACK);
915 c = PEEK();
916 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
917 p_b_eclass(p, cs);
918 (void) REQUIRE(MORE(), REG_EBRACK);
919 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
920 break;
921 default: /* symbol, ordinary character, or range */
922 start = p_b_symbol(p);
923 if (SEE('-') && MORE2() && PEEK2() != ']') {
924 /* range */
925 NEXT();
926 if (EAT('-'))
927 finish = '-';
928 else
929 finish = p_b_symbol(p);
930 } else
931 finish = start;
932 if (start == finish)
933 CHadd(p, cs, start);
934 else {
935 if (loc->collate->lc_is_posix) {
936 (void) REQUIRE((uch)start <= (uch)finish,
937 REG_ERANGE);
938 CHaddrange(p, cs, start, finish);
939 } else {
940 (void) REQUIRE(_collate_range_cmp(start,
941 finish, loc) <= 0, REG_ERANGE);
942 for (i = 0; i <= UCHAR_MAX; i++) {
943 if (_collate_range_cmp(start, i, loc)
944 <= 0 &&
945 _collate_range_cmp(i, finish, loc)
946 <= 0)
947 CHadd(p, cs, i);
948 }
949 }
950 }
951 break;
952 }
953 }
954
955 /*
956 * p_b_cclass - parse a character-class name and deal with it
957 */
958 static void
959 p_b_cclass(struct parse *p, cset *cs)
960 {
961 const char *sp = p->next;
962 size_t len;
963 wctype_t wct;
964 char clname[16];
965
966 while (MORE() && isalpha((uch)PEEK()))
967 NEXT();
968 len = p->next - sp;
969 if (len >= sizeof (clname) - 1) {
970 SETERROR(REG_ECTYPE);
971 return;
972 }
973 (void) memcpy(clname, sp, len);
974 clname[len] = '\0';
975 if ((wct = wctype(clname)) == 0) {
976 SETERROR(REG_ECTYPE);
977 return;
978 }
979 CHaddtype(p, cs, wct);
980 }
981
982 /*
983 * p_b_eclass - parse an equivalence-class name and deal with it
984 *
985 * This implementation is incomplete. xxx
986 */
987 static void
988 p_b_eclass(struct parse *p, cset *cs)
989 {
990 wint_t c;
991
992 c = p_b_coll_elem(p, '=');
993 CHadd(p, cs, c);
994 }
995
996 /*
997 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
998 */
999 static wint_t /* value of symbol */
1000 p_b_symbol(struct parse *p)
1001 {
1002 wint_t value;
1003
1004 (void) REQUIRE(MORE(), REG_EBRACK);
1005 if (!EATTWO('[', '.'))
1006 return (WGETNEXT());
1007
1008 /* collating symbol */
1009 value = p_b_coll_elem(p, '.');
1010 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1011 return (value);
1012 }
1013
1014 /*
1015 * p_b_coll_elem - parse a collating-element name and look it up
1016 */
1017 static wint_t /* value of collating element */
1018 p_b_coll_elem(struct parse *p,
1019 wint_t endc) /* name ended by endc,']' */
1020 {
1021 const char *sp = p->next;
1022 struct cname *cp;
1023 mbstate_t mbs;
1024 wchar_t wc;
1025 size_t clen, len;
1026
1027 while (MORE() && !SEETWO(endc, ']'))
1028 NEXT();
1029 if (!MORE()) {
1030 SETERROR(REG_EBRACK);
1031 return (0);
1032 }
1033 len = p->next - sp;
1034 for (cp = cnames; cp->name != NULL; cp++)
1035 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1036 return (cp->code); /* known name */
1037 (void) memset(&mbs, 0, sizeof (mbs));
1038 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1039 return (wc); /* single character */
1040 else if (clen == (size_t)-1 || clen == (size_t)-2)
1041 SETERROR(REG_ECHAR);
1042 else
1043 SETERROR(REG_ECOLLATE); /* neither */
1044 return (0);
1045 }
1046
1047 /*
1048 * othercase - return the case counterpart of an alphabetic
1049 */
1050 static wint_t /* if no counterpart, return ch */
1051 othercase(wint_t ch)
1052 {
1053 assert(iswalpha(ch));
1054 if (iswupper(ch))
1055 return (towlower(ch));
1056 else if (iswlower(ch))
1057 return (towupper(ch));
1058 else /* peculiar, but could happen */
1059 return (ch);
1060 }
1061
1062 /*
1063 * bothcases - emit a dualcase version of a two-case character
1064 *
1065 * Boy, is this implementation ever a kludge...
1066 */
1067 static void
1068 bothcases(struct parse *p, wint_t ch)
1069 {
1070 const char *oldnext = p->next;
1071 const char *oldend = p->end;
1072 char bracket[3 + MB_LEN_MAX];
1073 size_t n;
1074 mbstate_t mbs;
1075
1076 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1077 p->next = bracket;
1078 (void) memset(&mbs, 0, sizeof (mbs));
1079 n = wcrtomb(bracket, ch, &mbs);
1080 assert(n != (size_t)-1);
1081 bracket[n] = ']';
1082 bracket[n + 1] = '\0';
1083 p->end = bracket+n+1;
1084 p_bracket(p);
1085 assert(p->next == p->end);
1086 p->next = oldnext;
1087 p->end = oldend;
1088 }
1089
1090 /*
1091 * ordinary - emit an ordinary character
1092 */
1093 static void
1094 ordinary(struct parse *p, wint_t ch)
1095 {
1096 cset *cs;
1097
1098 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1099 bothcases(p, ch);
1100 else if ((ch & OPDMASK) == ch)
1101 EMIT(OCHAR, ch);
1102 else {
1103 /*
1104 * Kludge: character is too big to fit into an OCHAR operand.
1105 * Emit a singleton set.
1106 */
1107 if ((cs = allocset(p)) == NULL)
1108 return;
1109 CHadd(p, cs, ch);
1110 EMIT(OANYOF, (int)(cs - p->g->sets));
1111 }
1112 }
1113
1114 /*
1115 * nonnewline - emit REG_NEWLINE version of OANY
1116 *
1117 * Boy, is this implementation ever a kludge...
1118 */
1119 static void
1120 nonnewline(struct parse *p)
1121 {
1122 const char *oldnext = p->next;
1123 const char *oldend = p->end;
1124 char bracket[4];
1125
1126 p->next = bracket;
1127 p->end = bracket+3;
1128 bracket[0] = '^';
1129 bracket[1] = '\n';
1130 bracket[2] = ']';
1131 bracket[3] = '\0';
1132 p_bracket(p);
1133 assert(p->next == bracket+3);
1134 p->next = oldnext;
1135 p->end = oldend;
1136 }
1137
1138 /*
1139 * repeat - generate code for a bounded repetition, recursively if needed
1140 */
1141 static void
1142 repeat(struct parse *p,
1143 sopno start, /* operand from here to end of strip */
1144 int from, /* repeated from this number */
1145 int to) /* to this number of times (maybe INFINITY) */
1146 {
1147 sopno finish = HERE();
1148 #define N 2
1149 #define INF 3
1150 #define REP(f, t) ((f)*8 + (t))
1151 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1152 sopno copy;
1153
1154 if (p->error != 0) /* head off possible runaway recursion */
1155 return;
1156
1157 assert(from <= to);
1158
1159 switch (REP(MAP(from), MAP(to))) {
1160 case REP(0, 0): /* must be user doing this */
1161 DROP(finish-start); /* drop the operand */
1162 break;
1163 case REP(0, 1): /* as x{1,1}? */
1164 case REP(0, N): /* as x{1,n}? */
1165 case REP(0, INF): /* as x{1,}? */
1166 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1167 INSERT(OCH_, start); /* offset is wrong... */
1168 repeat(p, start+1, 1, to);
1169 ASTERN(OOR1, start);
1170 AHEAD(start); /* ... fix it */
1171 EMIT(OOR2, 0);
1172 AHEAD(THERE());
1173 ASTERN(O_CH, THERETHERE());
1174 break;
1175 case REP(1, 1): /* trivial case */
1176 /* done */
1177 break;
1178 case REP(1, N): /* as x?x{1,n-1} */
1179 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1180 INSERT(OCH_, start);
1181 ASTERN(OOR1, start);
1182 AHEAD(start);
1183 EMIT(OOR2, 0); /* offset very wrong... */
1184 AHEAD(THERE()); /* ...so fix it */
1185 ASTERN(O_CH, THERETHERE());
1186 copy = dupl(p, start+1, finish+1);
1187 assert(copy == finish+4);
1188 repeat(p, copy, 1, to-1);
1189 break;
1190 case REP(1, INF): /* as x+ */
1191 INSERT(OPLUS_, start);
1192 ASTERN(O_PLUS, start);
1193 break;
1194 case REP(N, N): /* as xx{m-1,n-1} */
1195 copy = dupl(p, start, finish);
1196 repeat(p, copy, from-1, to-1);
1197 break;
1198 case REP(N, INF): /* as xx{n-1,INF} */
1199 copy = dupl(p, start, finish);
1200 repeat(p, copy, from-1, to);
1201 break;
1202 default: /* "can't happen" */
1203 SETERROR(REG_EFATAL); /* just in case */
1204 break;
1205 }
1206 }
1207
1208 /*
1209 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1210 * character from the parse struct, signals a REG_ILLSEQ error if the
1211 * character can't be converted. Returns the number of bytes consumed.
1212 */
1213 static wint_t
1214 wgetnext(struct parse *p)
1215 {
1216 mbstate_t mbs;
1217 wchar_t wc;
1218 size_t n;
1219
1220 (void) memset(&mbs, 0, sizeof (mbs));
1221 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1222 if (n == (size_t)-1 || n == (size_t)-2) {
1223 SETERROR(REG_ECHAR);
1224 return (0);
1225 }
1226 if (n == 0)
1227 n = 1;
1228 p->next += n;
1229 return (wc);
1230 }
1231
1232 /*
1233 * seterr - set an error condition
1234 */
1235 static int /* useless but makes type checking happy */
1236 seterr(struct parse *p, int e)
1237 {
1238 if (p->error == 0) /* keep earliest error condition */
1239 p->error = e;
1240 p->next = nuls; /* try to bring things to a halt */
1241 p->end = nuls;
1242 return (0); /* make the return value well-defined */
1243 }
1244
1245 /*
1246 * allocset - allocate a set of characters for []
1247 */
1248 static cset *
1249 allocset(struct parse *p)
1250 {
1251 cset *cs, *ncs;
1252
1253 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1254 if (ncs == NULL) {
1255 SETERROR(REG_ESPACE);
1256 return (NULL);
1257 }
1258 p->g->sets = ncs;
1259 cs = &p->g->sets[p->g->ncsets++];
1260 (void) memset(cs, 0, sizeof (*cs));
1261
1262 return (cs);
1263 }
1264
1265 /*
1266 * freeset - free a now-unused set
1267 */
1268 static void
1269 freeset(struct parse *p, cset *cs)
1270 {
1271 cset *top = &p->g->sets[p->g->ncsets];
1272
1273 free(cs->wides);
1274 free(cs->ranges);
1275 free(cs->types);
1276 (void) memset(cs, 0, sizeof (*cs));
1277 if (cs == top-1) /* recover only the easy case */
1278 p->g->ncsets--;
1279 }
1280
1281 /*
1282 * singleton - Determine whether a set contains only one character,
1283 * returning it if so, otherwise returning OUT.
1284 */
1285 static wint_t
1286 singleton(cset *cs)
1287 {
1288 wint_t i, s, n;
1289
1290 for (i = n = 0; i < NC; i++)
1291 if (CHIN(cs, i)) {
1292 n++;
1293 s = i;
1294 }
1295 if (n == 1)
1296 return (s);
1297 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1298 cs->icase == 0)
1299 return (cs->wides[0]);
1300 /* Don't bother handling the other cases. */
1301 return (OUT);
1302 }
1303
1304 /*
1305 * CHadd - add character to character set.
1306 */
1307 static void
1308 CHadd(struct parse *p, cset *cs, wint_t ch)
1309 {
1310 wint_t nch, *newwides;
1311 assert(ch >= 0);
1312 if (ch < NC)
1313 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1314 else {
1315 newwides = realloc(cs->wides, (cs->nwides + 1) *
1316 sizeof (*cs->wides));
1317 if (newwides == NULL) {
1318 SETERROR(REG_ESPACE);
1319 return;
1320 }
1321 cs->wides = newwides;
1322 cs->wides[cs->nwides++] = ch;
1323 }
1324 if (cs->icase) {
1325 if ((nch = towlower(ch)) < NC)
1326 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1327 if ((nch = towupper(ch)) < NC)
1328 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1329 }
1330 }
1331
1332 /*
1333 * CHaddrange - add all characters in the range [min,max] to a character set.
1334 */
1335 static void
1336 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1337 {
1338 crange *newranges;
1339
1340 for (; min < NC && min <= max; min++)
1341 CHadd(p, cs, min);
1342 if (min >= max)
1343 return;
1344 newranges = realloc(cs->ranges, (cs->nranges + 1) *
1345 sizeof (*cs->ranges));
1346 if (newranges == NULL) {
1347 SETERROR(REG_ESPACE);
1348 return;
1349 }
1350 cs->ranges = newranges;
1351 cs->ranges[cs->nranges].min = min;
1352 cs->ranges[cs->nranges].max = max;
1353 cs->nranges++;
1354 }
1355
1356 /*
1357 * CHaddtype - add all characters of a certain type to a character set.
1358 */
1359 static void
1360 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1361 {
1362 wint_t i;
1363 wctype_t *newtypes;
1364
1365 for (i = 0; i < NC; i++)
1366 if (iswctype(i, wct))
1367 CHadd(p, cs, i);
1368 newtypes = realloc(cs->types, (cs->ntypes + 1) *
1369 sizeof (*cs->types));
1370 if (newtypes == NULL) {
1371 SETERROR(REG_ESPACE);
1372 return;
1373 }
1374 cs->types = newtypes;
1375 cs->types[cs->ntypes++] = wct;
1376 }
1377
1378 /*
1379 * dupl - emit a duplicate of a bunch of sops
1380 */
1381 static sopno /* start of duplicate */
1382 dupl(struct parse *p,
1383 sopno start, /* from here */
1384 sopno finish) /* to this less one */
1385 {
1386 sopno ret = HERE();
1387 sopno len = finish - start;
1388
1389 assert(finish >= start);
1390 if (len == 0)
1391 return (ret);
1392 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1393 return (ret);
1394 assert(p->ssize >= p->slen + len);
1395 (void) memcpy((char *)(p->strip + p->slen),
1396 (char *)(p->strip + start), (size_t)len*sizeof (sop));
1397 p->slen += len;
1398 return (ret);
1399 }
1400
1401 /*
1402 * doemit - emit a strip operator
1403 *
1404 * It might seem better to implement this as a macro with a function as
1405 * hard-case backup, but it's just too big and messy unless there are
1406 * some changes to the data structures. Maybe later.
1407 */
1408 static void
1409 doemit(struct parse *p, sop op, size_t opnd)
1410 {
1411 /* avoid making error situations worse */
1412 if (p->error != 0)
1413 return;
1414
1415 /* deal with oversize operands ("can't happen", more or less) */
1416 assert(opnd < 1<<OPSHIFT);
1417
1418 /* deal with undersized strip */
1419 if (p->slen >= p->ssize)
1420 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1421 return;
1422
1423 /* finally, it's all reduced to the easy case */
1424 p->strip[p->slen++] = SOP(op, opnd);
1425 }
1426
1427 /*
1428 * doinsert - insert a sop into the strip
1429 */
1430 static void
1431 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1432 {
1433 sopno sn;
1434 sop s;
1435 int i;
1436
1437 /* avoid making error situations worse */
1438 if (p->error != 0)
1439 return;
1440
1441 sn = HERE();
1442 EMIT(op, opnd); /* do checks, ensure space */
1443 assert(HERE() == sn+1);
1444 s = p->strip[sn];
1445
1446 /* adjust paren pointers */
1447 assert(pos > 0);
1448 for (i = 1; i < NPAREN; i++) {
1449 if (p->pbegin[i] >= pos) {
1450 p->pbegin[i]++;
1451 }
1452 if (p->pend[i] >= pos) {
1453 p->pend[i]++;
1454 }
1455 }
1456
1457 (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1458 (HERE()-pos-1)*sizeof (sop));
1459 p->strip[pos] = s;
1460 }
1461
1462 /*
1463 * dofwd - complete a forward reference
1464 */
1465 static void
1466 dofwd(struct parse *p, sopno pos, sop value)
1467 {
1468 /* avoid making error situations worse */
1469 if (p->error != 0)
1470 return;
1471
1472 assert(value < 1<<OPSHIFT);
1473 p->strip[pos] = OP(p->strip[pos]) | value;
1474 }
1475
1476 /*
1477 * enlarge - enlarge the strip
1478 */
1479 static int
1480 enlarge(struct parse *p, sopno size)
1481 {
1482 sop *sp;
1483
1484 if (p->ssize >= size)
1485 return (1);
1486
1487 sp = (sop *)realloc(p->strip, size*sizeof (sop));
1488 if (sp == NULL) {
1489 SETERROR(REG_ESPACE);
1490 return (0);
1491 }
1492 p->strip = sp;
1493 p->ssize = size;
1494 return (1);
1495 }
1496
1497 /*
1498 * stripsnug - compact the strip
1499 */
1500 static void
1501 stripsnug(struct parse *p, struct re_guts *g)
1502 {
1503 g->nstates = p->slen;
1504 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1505 if (g->strip == NULL) {
1506 SETERROR(REG_ESPACE);
1507 g->strip = p->strip;
1508 }
1509 }
1510
1511 /*
1512 * findmust - fill in must and mlen with longest mandatory literal string
1513 *
1514 * This algorithm could do fancy things like analyzing the operands of |
1515 * for common subsequences. Someday. This code is simple and finds most
1516 * of the interesting cases.
1517 *
1518 * Note that must and mlen got initialized during setup.
1519 */
1520 static void
1521 findmust(struct parse *p, struct re_guts *g)
1522 {
1523 sop *scan;
1524 sop *start = NULL;
1525 sop *newstart = NULL;
1526 sopno newlen;
1527 sop s;
1528 char *cp;
1529 int offset;
1530 char buf[MB_LEN_MAX];
1531 size_t clen;
1532 mbstate_t mbs;
1533 locale_t loc = uselocale(NULL);
1534
1535 /* avoid making error situations worse */
1536 if (p->error != 0)
1537 return;
1538
1539 /*
1540 * It's not generally safe to do a ``char'' substring search on
1541 * multibyte character strings, but it's safe for at least
1542 * UTF-8 (see RFC 3629).
1543 */
1544 if (MB_CUR_MAX > 1 &&
1545 strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1546 return;
1547
1548 /* find the longest OCHAR sequence in strip */
1549 newlen = 0;
1550 offset = 0;
1551 g->moffset = 0;
1552 scan = g->strip + 1;
1553 do {
1554 s = *scan++;
1555 switch (OP(s)) {
1556 case OCHAR: /* sequence member */
1557 if (newlen == 0) { /* new sequence */
1558 (void) memset(&mbs, 0, sizeof (mbs));
1559 newstart = scan - 1;
1560 }
1561 clen = wcrtomb(buf, OPND(s), &mbs);
1562 if (clen == (size_t)-1)
1563 goto toohard;
1564 newlen += clen;
1565 break;
1566 case OPLUS_: /* things that don't break one */
1567 case OLPAREN:
1568 case ORPAREN:
1569 break;
1570 case OQUEST_: /* things that must be skipped */
1571 case OCH_:
1572 offset = altoffset(scan, offset);
1573 scan--;
1574 do {
1575 scan += OPND(s);
1576 s = *scan;
1577 /* assert() interferes w debug printouts */
1578 if (OP(s) != (sop)O_QUEST &&
1579 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1580 g->iflags |= BAD;
1581 return;
1582 }
1583 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1584 /* FALLTHROUGH */
1585 case OBOW: /* things that break a sequence */
1586 case OEOW:
1587 case OBOL:
1588 case OEOL:
1589 case O_QUEST:
1590 case O_CH:
1591 case OEND:
1592 if (newlen > (sopno)g->mlen) { /* ends one */
1593 start = newstart;
1594 g->mlen = newlen;
1595 if (offset > -1) {
1596 g->moffset += offset;
1597 offset = newlen;
1598 } else
1599 g->moffset = offset;
1600 } else {
1601 if (offset > -1)
1602 offset += newlen;
1603 }
1604 newlen = 0;
1605 break;
1606 case OANY:
1607 if (newlen > (sopno)g->mlen) { /* ends one */
1608 start = newstart;
1609 g->mlen = newlen;
1610 if (offset > -1) {
1611 g->moffset += offset;
1612 offset = newlen;
1613 } else
1614 g->moffset = offset;
1615 } else {
1616 if (offset > -1)
1617 offset += newlen;
1618 }
1619 if (offset > -1)
1620 offset++;
1621 newlen = 0;
1622 break;
1623 case OANYOF: /* may or may not invalidate offset */
1624 /* First, everything as OANY */
1625 if (newlen > (sopno)g->mlen) { /* ends one */
1626 start = newstart;
1627 g->mlen = newlen;
1628 if (offset > -1) {
1629 g->moffset += offset;
1630 offset = newlen;
1631 } else
1632 g->moffset = offset;
1633 } else {
1634 if (offset > -1)
1635 offset += newlen;
1636 }
1637 if (offset > -1)
1638 offset++;
1639 newlen = 0;
1640 break;
1641 toohard:
1642 default:
1643 /*
1644 * Anything here makes it impossible or too hard
1645 * to calculate the offset -- so we give up;
1646 * save the last known good offset, in case the
1647 * must sequence doesn't occur later.
1648 */
1649 if (newlen > (sopno)g->mlen) { /* ends one */
1650 start = newstart;
1651 g->mlen = newlen;
1652 if (offset > -1)
1653 g->moffset += offset;
1654 else
1655 g->moffset = offset;
1656 }
1657 offset = -1;
1658 newlen = 0;
1659 break;
1660 }
1661 } while (OP(s) != OEND);
1662
1663 if (g->mlen == 0) { /* there isn't one */
1664 g->moffset = -1;
1665 return;
1666 }
1667
1668 /* turn it into a character string */
1669 g->must = malloc((size_t)g->mlen + 1);
1670 if (g->must == NULL) { /* argh; just forget it */
1671 g->mlen = 0;
1672 g->moffset = -1;
1673 return;
1674 }
1675 cp = g->must;
1676 scan = start;
1677 (void) memset(&mbs, 0, sizeof (mbs));
1678 while (cp < g->must + g->mlen) {
1679 while (OP(s = *scan++) != OCHAR)
1680 continue;
1681 clen = wcrtomb(cp, OPND(s), &mbs);
1682 assert(clen != (size_t)-1);
1683 cp += clen;
1684 }
1685 assert(cp == g->must + g->mlen);
1686 *cp++ = '\0'; /* just on general principles */
1687 }
1688
1689 /*
1690 * altoffset - choose biggest offset among multiple choices
1691 *
1692 * Compute, recursively if necessary, the largest offset among multiple
1693 * re paths.
1694 */
1695 static int
1696 altoffset(sop *scan, int offset)
1697 {
1698 int largest;
1699 int try;
1700 sop s;
1701
1702 /* If we gave up already on offsets, return */
1703 if (offset == -1)
1704 return (-1);
1705
1706 largest = 0;
1707 try = 0;
1708 s = *scan++;
1709 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1710 switch (OP(s)) {
1711 case OOR1:
1712 if (try > largest)
1713 largest = try;
1714 try = 0;
1715 break;
1716 case OQUEST_:
1717 case OCH_:
1718 try = altoffset(scan, try);
1719 if (try == -1)
1720 return (-1);
1721 scan--;
1722 do {
1723 scan += OPND(s);
1724 s = *scan;
1725 if (OP(s) != (sop)O_QUEST &&
1726 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1727 return (-1);
1728 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1729 /*
1730 * We must skip to the next position, or we'll
1731 * leave altoffset() too early.
1732 */
1733 scan++;
1734 break;
1735 case OANYOF:
1736 case OCHAR:
1737 case OANY:
1738 try++;
1739 /*FALLTHRU*/
1740 case OBOW:
1741 case OEOW:
1742 case OLPAREN:
1743 case ORPAREN:
1744 case OOR2:
1745 break;
1746 default:
1747 try = -1;
1748 break;
1749 }
1750 if (try == -1)
1751 return (-1);
1752 s = *scan++;
1753 }
1754
1755 if (try > largest)
1756 largest = try;
1757
1758 return (largest+offset);
1759 }
1760
1761 /*
1762 * computejumps - compute char jumps for BM scan
1763 *
1764 * This algorithm assumes g->must exists and is has size greater than
1765 * zero. It's based on the algorithm found on Computer Algorithms by
1766 * Sara Baase.
1767 *
1768 * A char jump is the number of characters one needs to jump based on
1769 * the value of the character from the text that was mismatched.
1770 */
1771 static void
1772 computejumps(struct parse *p, struct re_guts *g)
1773 {
1774 int ch;
1775 int mindex;
1776
1777 /* Avoid making errors worse */
1778 if (p->error != 0)
1779 return;
1780
1781 g->charjump = (int *)malloc((NC + 1) * sizeof (int));
1782 if (g->charjump == NULL) /* Not a fatal error */
1783 return;
1784 /* Adjust for signed chars, if necessary */
1785 g->charjump = &g->charjump[-(CHAR_MIN)];
1786
1787 /*
1788 * If the character does not exist in the pattern, the jump
1789 * is equal to the number of characters in the pattern.
1790 */
1791 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1792 g->charjump[ch] = g->mlen;
1793
1794 /*
1795 * If the character does exist, compute the jump that would
1796 * take us to the last character in the pattern equal to it
1797 * (notice that we match right to left, so that last character
1798 * is the first one that would be matched).
1799 */
1800 for (mindex = 0; mindex < g->mlen; mindex++)
1801 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1802 }
1803
1804 /*
1805 * computematchjumps - compute match jumps for BM scan
1806 *
1807 * This algorithm assumes g->must exists and is has size greater than
1808 * zero. It's based on the algorithm found on Computer Algorithms by
1809 * Sara Baase.
1810 *
1811 * A match jump is the number of characters one needs to advance based
1812 * on the already-matched suffix.
1813 * Notice that all values here are minus (g->mlen-1), because of the way
1814 * the search algorithm works.
1815 */
1816 static void
1817 computematchjumps(struct parse *p, struct re_guts *g)
1818 {
1819 int mindex; /* General "must" iterator */
1820 int suffix; /* Keeps track of matching suffix */
1821 int ssuffix; /* Keeps track of suffixes' suffix */
1822 int *pmatches;
1823 /*
1824 * pmatches[k] points to the next i
1825 * such that i+1...mlen is a substring
1826 * of k+1...k+mlen-i-1
1827 */
1828
1829 /* Avoid making errors worse */
1830 if (p->error != 0)
1831 return;
1832
1833 pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1834 if (pmatches == NULL) {
1835 g->matchjump = NULL;
1836 return;
1837 }
1838
1839 g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1840 if (g->matchjump == NULL) { /* Not a fatal error */
1841 free(pmatches);
1842 return;
1843 }
1844
1845 /* Set maximum possible jump for each character in the pattern */
1846 for (mindex = 0; mindex < g->mlen; mindex++)
1847 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1848
1849 /* Compute pmatches[] */
1850 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1851 mindex--, suffix--) {
1852 pmatches[mindex] = suffix;
1853
1854 /*
1855 * If a mismatch is found, interrupting the substring,
1856 * compute the matchjump for that position. If no
1857 * mismatch is found, then a text substring mismatched
1858 * against the suffix will also mismatch against the
1859 * substring.
1860 */
1861 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1862 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1863 g->mlen - mindex - 1);
1864 suffix = pmatches[suffix];
1865 }
1866 }
1867
1868 /*
1869 * Compute the matchjump up to the last substring found to jump
1870 * to the beginning of the largest must pattern prefix matching
1871 * it's own suffix.
1872 */
1873 for (mindex = 0; mindex <= suffix; mindex++)
1874 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1875 g->mlen + suffix - mindex);
1876
1877 ssuffix = pmatches[suffix];
1878 while (suffix < g->mlen) {
1879 while (suffix <= ssuffix && suffix < g->mlen) {
1880 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1881 g->mlen + ssuffix - suffix);
1882 suffix++;
1883 }
1884 if (suffix < g->mlen)
1885 ssuffix = pmatches[ssuffix];
1886 }
1887
1888 free(pmatches);
1889 }
1890
1891 /*
1892 * pluscount - count + nesting
1893 */
1894 static sopno /* nesting depth */
1895 pluscount(struct parse *p, struct re_guts *g)
1896 {
1897 sop *scan;
1898 sop s;
1899 sopno plusnest = 0;
1900 sopno maxnest = 0;
1901
1902 if (p->error != 0)
1903 return (0); /* there may not be an OEND */
1904
1905 scan = g->strip + 1;
1906 do {
1907 s = *scan++;
1908 switch (OP(s)) {
1909 case OPLUS_:
1910 plusnest++;
1911 break;
1912 case O_PLUS:
1913 if (plusnest > maxnest)
1914 maxnest = plusnest;
1915 plusnest--;
1916 break;
1917 }
1918 } while (OP(s) != OEND);
1919 if (plusnest != 0)
1920 g->iflags |= BAD;
1921 return (maxnest);
1922 }