24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37 #include "lint.h"
38 #include "file64.h"
39 #include <sys/types.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <limits.h>
44 #include <stdlib.h>
45 #include <regex.h>
46 #include <wchar.h>
47 #include <wctype.h>
48
49 #include "../locale/runetype.h"
50 #include "../locale/collate.h"
51
52 #include "utils.h"
53 #include "regex2.h"
54
55 #include "cname.h"
56 #include "../locale/mblocal.h"
57
58 /*
59 * parse structure, passed up and down to avoid global variables and
60 * other clumsinesses
61 */
62 struct parse {
63 const char *next; /* next character in RE */
64 const char *end; /* end of string (-> NUL normally) */
65 int error; /* has an error been seen? */
66 sop *strip; /* malloced strip */
67 sopno ssize; /* malloced strip size (allocated) */
68 sopno slen; /* malloced strip length (used) */
69 int ncsalloc; /* number of csets allocated */
70 struct re_guts *g;
71 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
73 sopno pend[NPAREN]; /* -> ) ([0] unused) */
74 };
75
76 /* ========= begin header generated by ./mkh ========= */
77 #ifdef __cplusplus
78 extern "C" {
79 #endif
80
81 /* === regcomp.c === */
82 static void p_ere(struct parse *p, int stop);
83 static void p_ere_exp(struct parse *p);
84 static void p_str(struct parse *p);
85 static void p_bre(struct parse *p, int end1, int end2);
86 static int p_simp_re(struct parse *p, int starordinary);
87 static int p_count(struct parse *p);
88 static void p_bracket(struct parse *p);
89 static void p_b_term(struct parse *p, cset *cs);
90 static void p_b_cclass(struct parse *p, cset *cs);
91 static void p_b_eclass(struct parse *p, cset *cs);
92 static wint_t p_b_symbol(struct parse *p);
93 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
94 static wint_t othercase(wint_t ch);
95 static void bothcases(struct parse *p, wint_t ch);
96 static void ordinary(struct parse *p, wint_t ch);
97 static void nonnewline(struct parse *p);
98 static void repeat(struct parse *p, sopno start, int from, int to);
99 static int seterr(struct parse *p, int e);
100 static cset *allocset(struct parse *p);
101 static void freeset(struct parse *p, cset *cs);
102 static void CHadd(struct parse *p, cset *cs, wint_t ch);
103 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
104 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
105 static wint_t singleton(cset *cs);
106 static sopno dupl(struct parse *p, sopno start, sopno finish);
116 static sopno pluscount(struct parse *p, struct re_guts *g);
117 static wint_t wgetnext(struct parse *p);
118
119 #ifdef __cplusplus
120 }
121 #endif
122 /* ========= end header generated by ./mkh ========= */
123
124 static char nuls[10]; /* place to point scanner in event of error */
125
126 /*
127 * macros for use with parse structure
128 * BEWARE: these know that the parse structure is named `p' !!!
129 */
130 #define PEEK() (*p->next)
131 #define PEEK2() (*(p->next+1))
132 #define MORE() (p->next < p->end)
133 #define MORE2() (p->next+1 < p->end)
134 #define SEE(c) (MORE() && PEEK() == (c))
135 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
136 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
137 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
138 #define NEXT() (p->next++)
139 #define NEXT2() (p->next += 2)
140 #define NEXTn(n) (p->next += (n))
141 #define GETNEXT() (*p->next++)
142 #define WGETNEXT() wgetnext(p)
143 #define SETERROR(e) ((void)seterr(p, (e)))
144 #define REQUIRE(co, e) ((co) || seterr(p, e))
145 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
146 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
147 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
148 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
149 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
150 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
151 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
152 #define HERE() (p->slen)
153 #define THERE() (p->slen - 1)
154 #define THERETHERE() (p->slen - 2)
155 #define DROP(n) (p->slen -= (n))
212 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
213 assert(p->ssize >= len);
214
215 p->strip = (sop *)malloc(p->ssize * sizeof (sop));
216 p->slen = 0;
217 if (p->strip == NULL) {
218 free((char *)g);
219 return (REG_ESPACE);
220 }
221
222 /* set things up */
223 p->g = g;
224 p->next = pattern; /* convenience; we do not modify it */
225 p->end = p->next + len;
226 p->error = 0;
227 p->ncsalloc = 0;
228 for (i = 0; i < NPAREN; i++) {
229 p->pbegin[i] = 0;
230 p->pend[i] = 0;
231 }
232 g->sets = NULL;
233 g->ncsets = 0;
234 g->cflags = cflags;
235 g->iflags = 0;
236 g->nbol = 0;
237 g->neol = 0;
238 g->must = NULL;
239 g->moffset = -1;
240 g->charjump = NULL;
241 g->matchjump = NULL;
242 g->mlen = 0;
243 g->nsub = 0;
244 g->backrefs = 0;
245
246 /* do it */
247 EMIT(OEND, 0);
248 g->firststate = THERE();
249 if (cflags®_EXTENDED)
250 p_ere(p, OUT);
251 else if (cflags®_NOSPEC)
252 p_str(p);
253 else
254 p_bre(p, OUT, OUT);
255 EMIT(OEND, 0);
256 g->laststate = THERE();
257
258 /* tidy up loose ends and fill things in */
259 stripsnug(p, g);
260 findmust(p, g);
261 /*
262 * only use Boyer-Moore algorithm if the pattern is bigger
263 * than three characters
264 */
265 if (g->mlen > 3) {
266 computejumps(p, g);
267 computematchjumps(p, g);
268 if (g->matchjump == NULL && g->charjump != NULL) {
269 free(g->charjump);
270 g->charjump = NULL;
271 }
272 }
273 g->nplus = pluscount(p, g);
274 g->magic = MAGIC2;
275 preg->re_nsub = g->nsub;
276 preg->re_g = g;
277 preg->re_magic = MAGIC1;
278 #ifndef REDEBUG
279 /* not debugging, so can't rely on the assert() in regexec() */
280 if (g->iflags&BAD)
281 SETERROR(REG_EFATAL);
282 #endif
283
284 /* win or lose, we're done */
285 if (p->error != 0) /* lose */
286 regfree(preg);
287 return (p->error);
288 }
289
290 /*
291 * p_ere - ERE parser top level, concatenation and alternation
292 */
293 static void
294 p_ere(struct parse *p,
295 int stop) /* character this ERE should end at */
296 {
297 char c;
298 sopno prevback;
299 sopno prevfwd;
300 sopno conc;
301 int first = 1; /* is this the first alternative? */
302
303 for (;;) {
304 /* do a bunch of concatenated expressions */
305 conc = HERE();
306 while (MORE() && (c = PEEK()) != '|' && c != stop)
307 p_ere_exp(p);
308 /* require nonempty */
309 (void) REQUIRE(HERE() != conc, REG_BADPAT);
310
311 if (!EAT('|'))
312 break; /* NOTE BREAK OUT */
313
314 if (first) {
315 INSERT(OCH_, conc); /* offset is wrong */
316 prevfwd = conc;
317 prevback = conc;
318 first = 0;
319 }
320 ASTERN(OOR1, prevback);
321 prevback = THERE();
322 AHEAD(prevfwd); /* fix previous offset */
323 prevfwd = HERE();
324 EMIT(OOR2, 0); /* offset is very wrong */
325 }
326
327 if (!first) { /* tail-end fixups */
328 AHEAD(prevfwd);
329 ASTERN(O_CH, prevback);
330 }
331
332 assert(!MORE() || SEE(stop));
333 }
334
335 /*
336 * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
337 */
338 static void
339 p_ere_exp(struct parse *p)
340 {
341 char c;
342 wint_t wc;
343 sopno pos;
344 int count;
345 int count2;
346 sopno subno;
347 int wascaret = 0;
348
349 assert(MORE()); /* caller should have ensured this */
350 c = GETNEXT();
351
352 pos = HERE();
353 switch (c) {
354 case '(':
355 (void) REQUIRE(MORE(), REG_EPAREN);
356 p->g->nsub++;
357 subno = p->g->nsub;
358 if (subno < NPAREN)
359 p->pbegin[subno] = HERE();
360 EMIT(OLPAREN, subno);
361 if (!SEE(')'))
362 p_ere(p, ')');
363 if (subno < NPAREN) {
364 p->pend[subno] = HERE();
365 assert(p->pend[subno] != 0);
366 }
367 EMIT(ORPAREN, subno);
368 (void) MUSTEAT(')', REG_EPAREN);
369 break;
370 #ifndef POSIX_MISTAKE
371 case ')': /* happens only if no current unmatched ( */
372 /*
373 * You may ask, why the ifndef? Because I didn't notice
374 * this until slightly too late for 1003.2, and none of the
375 * other 1003.2 regular-expression reviewers noticed it at
376 * all. So an unmatched ) is legal POSIX, at least until
377 * we can get it fixed.
378 */
379 SETERROR(REG_EPAREN);
380 break;
381 #endif
382 case '^':
408 case '[':
409 p_bracket(p);
410 break;
411 case '\\':
412 (void) REQUIRE(MORE(), REG_EESCAPE);
413 wc = WGETNEXT();
414 switch (wc) {
415 case '<':
416 EMIT(OBOW, 0);
417 break;
418 case '>':
419 EMIT(OEOW, 0);
420 break;
421 default:
422 ordinary(p, wc);
423 break;
424 }
425 break;
426 default:
427 if (p->error != 0)
428 return;
429 p->next--;
430 wc = WGETNEXT();
431 ordinary(p, wc);
432 break;
433 }
434
435 if (!MORE())
436 return;
437 c = PEEK();
438 /* we call { a repetition if followed by a digit */
439 if (!(c == '*' || c == '+' || c == '?' || c == '{'))
440 return; /* no repetition, we're done */
441 else if (c == '{')
442 (void) REQUIRE(MORE2() && \
443 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
444 NEXT();
445
446 (void) REQUIRE(!wascaret, REG_BADRPT);
447 switch (c) {
448 case '*': /* implemented as +? */
449 /* this case does not require the (y|) trick, noKLUDGE */
450 INSERT(OPLUS_, pos);
451 ASTERN(O_PLUS, pos);
452 INSERT(OQUEST_, pos);
453 ASTERN(O_QUEST, pos);
454 break;
455 case '+':
456 INSERT(OPLUS_, pos);
457 ASTERN(O_PLUS, pos);
458 break;
459 case '?':
460 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
469 count = p_count(p);
470 if (EAT(',')) {
471 if (isdigit((uch)PEEK())) {
472 count2 = p_count(p);
473 (void) REQUIRE(count <= count2, REG_BADBR);
474 } else /* single number with comma */
475 count2 = INFINITY;
476 } else /* just a single number */
477 count2 = count;
478 repeat(p, pos, count, count2);
479 if (!EAT('}')) { /* error heuristics */
480 while (MORE() && PEEK() != '}')
481 NEXT();
482 (void) REQUIRE(MORE(), REG_EBRACE);
483 SETERROR(REG_BADBR);
484 }
485 break;
486 }
487
488 if (!MORE())
489 return;
490 c = PEEK();
491 if (!(c == '*' || c == '+' || c == '?' ||
492 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
493 return;
494 SETERROR(REG_BADRPT);
495 }
496
497 /*
498 * p_str - string (no metacharacters) "parser"
499 */
500 static void
501 p_str(struct parse *p)
502 {
503 (void) REQUIRE(MORE(), REG_BADPAT);
504 while (MORE())
505 ordinary(p, WGETNEXT());
506 }
507
508 /*
509 * p_bre - BRE parser top level, anchoring and concatenation
510 * Giving end1 as OUT essentially eliminates the end1/end2 check.
511 *
512 * This implementation is a bit of a kludge, in that a trailing $ is first
513 * taken as an ordinary character and then revised to be an anchor.
514 * The amount of lookahead needed to avoid this kludge is excessive.
515 */
516 static void
517 p_bre(struct parse *p,
518 int end1, /* first terminating character */
519 int end2) /* second terminating character */
520 {
521 sopno start = HERE();
522 int first = 1; /* first subexpression? */
523 int wasdollar = 0;
524
525 if (EAT('^')) {
526 EMIT(OBOL, 0);
527 p->g->iflags |= USEBOL;
528 p->g->nbol++;
529 }
530 while (MORE() && !SEETWO(end1, end2)) {
531 wasdollar = p_simp_re(p, first);
532 first = 0;
533 }
534 if (wasdollar) { /* oops, that was a trailing anchor */
535 DROP(1);
536 EMIT(OEOL, 0);
537 p->g->iflags |= USEEOL;
538 p->g->neol++;
539 }
540
541 (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */
542 }
543
544 /*
545 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
546 */
547 static int /* was the simple RE an unbackslashed $? */
548 p_simp_re(struct parse *p,
549 int starordinary) /* is a leading * an ordinary character? */
550 {
551 int c;
552 int count;
553 int count2;
554 sopno pos;
555 int i;
556 wint_t wc;
557 sopno subno;
558 #define BACKSL (1<<CHAR_BIT)
559
560 pos = HERE(); /* repetition op, if any, covers from here */
561
562 assert(MORE()); /* caller should have ensured this */
563 c = GETNEXT();
564 if (c == '\\') {
565 (void) REQUIRE(MORE(), REG_EESCAPE);
566 c = BACKSL | GETNEXT();
567 }
568 switch (c) {
569 case '.':
575 case '[':
576 p_bracket(p);
577 break;
578 case BACKSL|'<':
579 EMIT(OBOW, 0);
580 break;
581 case BACKSL|'>':
582 EMIT(OEOW, 0);
583 break;
584 case BACKSL|'{':
585 SETERROR(REG_BADRPT);
586 break;
587 case BACKSL|'(':
588 p->g->nsub++;
589 subno = p->g->nsub;
590 if (subno < NPAREN)
591 p->pbegin[subno] = HERE();
592 EMIT(OLPAREN, subno);
593 /* the MORE here is an error heuristic */
594 if (MORE() && !SEETWO('\\', ')'))
595 p_bre(p, '\\', ')');
596 if (subno < NPAREN) {
597 p->pend[subno] = HERE();
598 assert(p->pend[subno] != 0);
599 }
600 EMIT(ORPAREN, subno);
601 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
602 break;
603 case BACKSL|')': /* should not get here -- must be user */
604 SETERROR(REG_EPAREN);
605 break;
606 case BACKSL|'1':
607 case BACKSL|'2':
608 case BACKSL|'3':
609 case BACKSL|'4':
610 case BACKSL|'5':
611 case BACKSL|'6':
612 case BACKSL|'7':
613 case BACKSL|'8':
614 case BACKSL|'9':
615 i = (c&~BACKSL) - '0';
616 assert(i < NPAREN);
617 if (p->pend[i] != 0) {
618 assert(i <= p->g->nsub);
619 EMIT(OBACK_, i);
620 assert(p->pbegin[i] != 0);
621 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
622 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
623 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
624 EMIT(O_BACK, i);
625 } else
626 SETERROR(REG_ESUBREG);
627 p->g->backrefs = 1;
628 break;
629 case '*':
630 (void) REQUIRE(starordinary, REG_BADRPT);
631 /* FALLTHROUGH */
632 default:
633 if (p->error != 0)
634 return (0); /* Definitely not $... */
635 p->next--;
636 wc = WGETNEXT();
637 ordinary(p, wc);
638 break;
639 }
640
641 if (EAT('*')) { /* implemented as +? */
642 /* this case does not require the (y|) trick, noKLUDGE */
643 INSERT(OPLUS_, pos);
644 ASTERN(O_PLUS, pos);
645 INSERT(OQUEST_, pos);
646 ASTERN(O_QUEST, pos);
647 } else if (EATTWO('\\', '{')) {
648 count = p_count(p);
649 if (EAT(',')) {
650 if (MORE() && isdigit((uch)PEEK())) {
651 count2 = p_count(p);
652 (void) REQUIRE(count <= count2, REG_BADBR);
653 } else /* single number with comma */
654 count2 = INFINITY;
655 } else /* just a single number */
656 count2 = count;
657 repeat(p, pos, count, count2);
658 if (!EATTWO('\\', '}')) { /* error heuristics */
659 while (MORE() && !SEETWO('\\', '}'))
660 NEXT();
661 (void) REQUIRE(MORE(), REG_EBRACE);
662 SETERROR(REG_BADBR);
663 }
664 } else if (c == '$') /* $ (but not \$) ends it */
665 return (1);
666
667 return (0);
668 }
669
670 /*
671 * p_count - parse a repetition count
672 */
673 static int /* the value */
674 p_count(struct parse *p)
675 {
676 int count = 0;
677 int ndigits = 0;
678
679 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
680 count = count*10 + (GETNEXT() - '0');
681 ndigits++;
682 }
683
684 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
685 return (count);
686 }
687
876 * p_b_coll_elem - parse a collating-element name and look it up
877 */
878 static wint_t /* value of collating element */
879 p_b_coll_elem(struct parse *p,
880 wint_t endc) /* name ended by endc,']' */
881 {
882 const char *sp = p->next;
883 struct cname *cp;
884 mbstate_t mbs;
885 wchar_t wc;
886 size_t clen, len;
887
888 while (MORE() && !SEETWO(endc, ']'))
889 NEXT();
890 if (!MORE()) {
891 SETERROR(REG_EBRACK);
892 return (0);
893 }
894 len = p->next - sp;
895 for (cp = cnames; cp->name != NULL; cp++)
896 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
897 return (cp->code); /* known name */
898 (void) memset(&mbs, 0, sizeof (mbs));
899 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
900 return (wc); /* single character */
901 else if (clen == (size_t)-1 || clen == (size_t)-2)
902 SETERROR(REG_ECHAR);
903 else
904 SETERROR(REG_ECOLLATE); /* neither */
905 return (0);
906 }
907
908 /*
909 * othercase - return the case counterpart of an alphabetic
910 */
911 static wint_t /* if no counterpart, return ch */
912 othercase(wint_t ch)
913 {
914 assert(iswalpha(ch));
915 if (iswupper(ch))
916 return (towlower(ch));
1419 (void) memset(&mbs, 0, sizeof (mbs));
1420 newstart = scan - 1;
1421 }
1422 clen = wcrtomb(buf, OPND(s), &mbs);
1423 if (clen == (size_t)-1)
1424 goto toohard;
1425 newlen += clen;
1426 break;
1427 case OPLUS_: /* things that don't break one */
1428 case OLPAREN:
1429 case ORPAREN:
1430 break;
1431 case OQUEST_: /* things that must be skipped */
1432 case OCH_:
1433 offset = altoffset(scan, offset);
1434 scan--;
1435 do {
1436 scan += OPND(s);
1437 s = *scan;
1438 /* assert() interferes w debug printouts */
1439 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1440 OP(s) != OOR2) {
1441 g->iflags |= BAD;
1442 return;
1443 }
1444 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1445 /* FALLTHROUGH */
1446 case OBOW: /* things that break a sequence */
1447 case OEOW:
1448 case OBOL:
1449 case OEOL:
1450 case O_QUEST:
1451 case O_CH:
1452 case OEND:
1453 if (newlen > g->mlen) { /* ends one */
1454 start = newstart;
1455 g->mlen = newlen;
1456 if (offset > -1) {
1457 g->moffset += offset;
1458 offset = newlen;
1459 } else
1460 g->moffset = offset;
1461 } else {
1462 if (offset > -1)
1463 offset += newlen;
1464 }
1465 newlen = 0;
1466 break;
1467 case OANY:
1468 if (newlen > g->mlen) { /* ends one */
1469 start = newstart;
1470 g->mlen = newlen;
1471 if (offset > -1) {
1472 g->moffset += offset;
1473 offset = newlen;
1474 } else
1475 g->moffset = offset;
1476 } else {
1477 if (offset > -1)
1478 offset += newlen;
1479 }
1480 if (offset > -1)
1481 offset++;
1482 newlen = 0;
1483 break;
1484 case OANYOF: /* may or may not invalidate offset */
1485 /* First, everything as OANY */
1486 if (newlen > g->mlen) { /* ends one */
1487 start = newstart;
1488 g->mlen = newlen;
1489 if (offset > -1) {
1490 g->moffset += offset;
1491 offset = newlen;
1492 } else
1493 g->moffset = offset;
1494 } else {
1495 if (offset > -1)
1496 offset += newlen;
1497 }
1498 if (offset > -1)
1499 offset++;
1500 newlen = 0;
1501 break;
1502 toohard:
1503 default:
1504 /*
1505 * Anything here makes it impossible or too hard
1506 * to calculate the offset -- so we give up;
1507 * save the last known good offset, in case the
1508 * must sequence doesn't occur later.
1509 */
1510 if (newlen > g->mlen) { /* ends one */
1511 start = newstart;
1512 g->mlen = newlen;
1513 if (offset > -1)
1514 g->moffset += offset;
1515 else
1516 g->moffset = offset;
1517 }
1518 offset = -1;
1519 newlen = 0;
1520 break;
1521 }
1522 } while (OP(s) != OEND);
1523
1524 if (g->mlen == 0) { /* there isn't one */
1525 g->moffset = -1;
1526 return;
1527 }
1528
1529 /* turn it into a character string */
1530 g->must = malloc((size_t)g->mlen + 1);
1550 /*
1551 * altoffset - choose biggest offset among multiple choices
1552 *
1553 * Compute, recursively if necessary, the largest offset among multiple
1554 * re paths.
1555 */
1556 static int
1557 altoffset(sop *scan, int offset)
1558 {
1559 int largest;
1560 int try;
1561 sop s;
1562
1563 /* If we gave up already on offsets, return */
1564 if (offset == -1)
1565 return (-1);
1566
1567 largest = 0;
1568 try = 0;
1569 s = *scan++;
1570 while (OP(s) != O_QUEST && OP(s) != O_CH) {
1571 switch (OP(s)) {
1572 case OOR1:
1573 if (try > largest)
1574 largest = try;
1575 try = 0;
1576 break;
1577 case OQUEST_:
1578 case OCH_:
1579 try = altoffset(scan, try);
1580 if (try == -1)
1581 return (-1);
1582 scan--;
1583 do {
1584 scan += OPND(s);
1585 s = *scan;
1586 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1587 OP(s) != OOR2)
1588 return (-1);
1589 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1590 /*
1591 * We must skip to the next position, or we'll
1592 * leave altoffset() too early.
1593 */
1594 scan++;
1595 break;
1596 case OANYOF:
1597 case OCHAR:
1598 case OANY:
1599 try++;
1600 /*FALLTHRU*/
1601 case OBOW:
1602 case OEOW:
1603 case OLPAREN:
1604 case ORPAREN:
1605 case OOR2:
1606 break;
1607 default:
1608 try = -1;
1609 break;
|
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37 #include "lint.h"
38 #include "file64.h"
39 #include <sys/types.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <limits.h>
44 #include <regex.h>
45 #include <stdlib.h>
46 #include <stdbool.h>
47 #include <wchar.h>
48 #include <wctype.h>
49
50 #include "../locale/runetype.h"
51 #include "../locale/collate.h"
52
53 #include "utils.h"
54 #include "regex2.h"
55
56 #include "cname.h"
57 #include "../locale/mblocal.h"
58
59 /*
60 * Branching context, used to keep track of branch state for all of the branch-
61 * aware functions. In addition to keeping track of branch positions for the
62 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
63 * detection of whether ^ is acting as an anchor or being used erroneously and
64 * also for whether we're in a sub-expression or not.
65 */
66 struct branchc {
67 sopno start;
68 sopno back;
69 sopno fwd;
70
71 int nbranch;
72 int nchain;
73 bool outer;
74 bool terminate;
75 };
76
77 /*
78 * parse structure, passed up and down to avoid global variables and
79 * other clumsinesses
80 */
81 struct parse {
82 const char *next; /* next character in RE */
83 const char *end; /* end of string (-> NUL normally) */
84 int error; /* has an error been seen? */
85 sop *strip; /* malloced strip */
86 sopno ssize; /* malloced strip size (allocated) */
87 sopno slen; /* malloced strip length (used) */
88 int ncsalloc; /* number of csets allocated */
89 struct re_guts *g;
90 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
91 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
92 sopno pend[NPAREN]; /* -> ) ([0] unused) */
93 bool allowbranch; /* can this expression branch? */
94 bool bre; /* convenience; is this a BRE? */
95 bool (*parse_expr)(struct parse *, struct branchc *);
96 void (*pre_parse)(struct parse *, struct branchc *);
97 void (*post_parse)(struct parse *, struct branchc *);
98 };
99
100 /* ========= begin header generated by ./mkh ========= */
101 #ifdef __cplusplus
102 extern "C" {
103 #endif
104
105 /* === regcomp.c === */
106 static bool p_ere_exp(struct parse *p, struct branchc *bc);
107 static void p_str(struct parse *p);
108 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
109 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
110 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
111 static bool p_branch_empty(struct parse *p, struct branchc *bc);
112 static bool p_branch_do(struct parse *p, struct branchc *bc);
113 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
114 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
115 static void p_re(struct parse *p, int end1, int end2);
116 static bool p_simp_re(struct parse *p, struct branchc *bc);
117 static int p_count(struct parse *p);
118 static void p_bracket(struct parse *p);
119 static void p_b_term(struct parse *p, cset *cs);
120 static void p_b_cclass(struct parse *p, cset *cs);
121 static void p_b_eclass(struct parse *p, cset *cs);
122 static wint_t p_b_symbol(struct parse *p);
123 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
124 static wint_t othercase(wint_t ch);
125 static void bothcases(struct parse *p, wint_t ch);
126 static void ordinary(struct parse *p, wint_t ch);
127 static void nonnewline(struct parse *p);
128 static void repeat(struct parse *p, sopno start, int from, int to);
129 static int seterr(struct parse *p, int e);
130 static cset *allocset(struct parse *p);
131 static void freeset(struct parse *p, cset *cs);
132 static void CHadd(struct parse *p, cset *cs, wint_t ch);
133 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
134 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
135 static wint_t singleton(cset *cs);
136 static sopno dupl(struct parse *p, sopno start, sopno finish);
146 static sopno pluscount(struct parse *p, struct re_guts *g);
147 static wint_t wgetnext(struct parse *p);
148
149 #ifdef __cplusplus
150 }
151 #endif
152 /* ========= end header generated by ./mkh ========= */
153
154 static char nuls[10]; /* place to point scanner in event of error */
155
156 /*
157 * macros for use with parse structure
158 * BEWARE: these know that the parse structure is named `p' !!!
159 */
160 #define PEEK() (*p->next)
161 #define PEEK2() (*(p->next+1))
162 #define MORE() (p->next < p->end)
163 #define MORE2() (p->next+1 < p->end)
164 #define SEE(c) (MORE() && PEEK() == (c))
165 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
166 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
167 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
168 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
169 #define NEXT() (p->next++)
170 #define NEXT2() (p->next += 2)
171 #define NEXTn(n) (p->next += (n))
172 #define GETNEXT() (*p->next++)
173 #define WGETNEXT() wgetnext(p)
174 #define SETERROR(e) ((void)seterr(p, (e)))
175 #define REQUIRE(co, e) ((co) || seterr(p, e))
176 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
177 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
178 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
179 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
180 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
181 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
182 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
183 #define HERE() (p->slen)
184 #define THERE() (p->slen - 1)
185 #define THERETHERE() (p->slen - 2)
186 #define DROP(n) (p->slen -= (n))
243 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
244 assert(p->ssize >= len);
245
246 p->strip = (sop *)malloc(p->ssize * sizeof (sop));
247 p->slen = 0;
248 if (p->strip == NULL) {
249 free((char *)g);
250 return (REG_ESPACE);
251 }
252
253 /* set things up */
254 p->g = g;
255 p->next = pattern; /* convenience; we do not modify it */
256 p->end = p->next + len;
257 p->error = 0;
258 p->ncsalloc = 0;
259 for (i = 0; i < NPAREN; i++) {
260 p->pbegin[i] = 0;
261 p->pend[i] = 0;
262 }
263 if (cflags & REG_EXTENDED) {
264 p->allowbranch = true;
265 p->bre = false;
266 p->parse_expr = p_ere_exp;
267 p->pre_parse = NULL;
268 p->post_parse = NULL;
269 } else {
270 p->allowbranch = false;
271 p->bre = true;
272 p->parse_expr = p_simp_re;
273 p->pre_parse = p_bre_pre_parse;
274 p->post_parse = p_bre_post_parse;
275 }
276 g->sets = NULL;
277 g->ncsets = 0;
278 g->cflags = cflags;
279 g->iflags = 0;
280 g->nbol = 0;
281 g->neol = 0;
282 g->must = NULL;
283 g->moffset = -1;
284 g->charjump = NULL;
285 g->matchjump = NULL;
286 g->mlen = 0;
287 g->nsub = 0;
288 g->backrefs = 0;
289
290 /* do it */
291 EMIT(OEND, 0);
292 g->firststate = THERE();
293 if (cflags & REG_NOSPEC)
294 p_str(p);
295 else
296 p_re(p, OUT, OUT);
297 EMIT(OEND, 0);
298 g->laststate = THERE();
299
300 /* tidy up loose ends and fill things in */
301 stripsnug(p, g);
302 findmust(p, g);
303 /*
304 * only use Boyer-Moore algorithm if the pattern is bigger
305 * than three characters
306 */
307 if (g->mlen > 3) {
308 computejumps(p, g);
309 computematchjumps(p, g);
310 if (g->matchjump == NULL && g->charjump != NULL) {
311 free(g->charjump);
312 g->charjump = NULL;
313 }
314 }
315 g->nplus = pluscount(p, g);
316 g->magic = MAGIC2;
317 preg->re_nsub = g->nsub;
318 preg->re_g = g;
319 preg->re_magic = MAGIC1;
320 #ifndef REDEBUG
321 /* not debugging, so can't rely on the assert() in regexec() */
322 if (g->iflags&BAD)
323 SETERROR(REG_EFATAL);
324 #endif
325
326 /* win or lose, we're done */
327 if (p->error != 0) /* lose */
328 regfree(preg);
329 return (p->error);
330 }
331
332 /*
333 * Parse one subERE, an atom possibly followed by a repetition op,
334 * return whether we should terminate or not.
335 */
336 static bool
337 p_ere_exp(struct parse *p, struct branchc *bc)
338 {
339 char c;
340 wint_t wc;
341 sopno pos;
342 int count;
343 int count2;
344 sopno subno;
345 int wascaret = 0;
346
347 (void) bc;
348 assert(MORE()); /* caller should have ensured this */
349 c = GETNEXT();
350
351 pos = HERE();
352 switch (c) {
353 case '(':
354 (void) REQUIRE(MORE(), REG_EPAREN);
355 p->g->nsub++;
356 subno = p->g->nsub;
357 if (subno < NPAREN)
358 p->pbegin[subno] = HERE();
359 EMIT(OLPAREN, subno);
360 if (!SEE(')'))
361 p_re(p, ')', IGN);
362 if (subno < NPAREN) {
363 p->pend[subno] = HERE();
364 assert(p->pend[subno] != 0);
365 }
366 EMIT(ORPAREN, subno);
367 (void) MUSTEAT(')', REG_EPAREN);
368 break;
369 #ifndef POSIX_MISTAKE
370 case ')': /* happens only if no current unmatched ( */
371 /*
372 * You may ask, why the ifndef? Because I didn't notice
373 * this until slightly too late for 1003.2, and none of the
374 * other 1003.2 regular-expression reviewers noticed it at
375 * all. So an unmatched ) is legal POSIX, at least until
376 * we can get it fixed.
377 */
378 SETERROR(REG_EPAREN);
379 break;
380 #endif
381 case '^':
407 case '[':
408 p_bracket(p);
409 break;
410 case '\\':
411 (void) REQUIRE(MORE(), REG_EESCAPE);
412 wc = WGETNEXT();
413 switch (wc) {
414 case '<':
415 EMIT(OBOW, 0);
416 break;
417 case '>':
418 EMIT(OEOW, 0);
419 break;
420 default:
421 ordinary(p, wc);
422 break;
423 }
424 break;
425 default:
426 if (p->error != 0)
427 return (false);
428 p->next--;
429 wc = WGETNEXT();
430 ordinary(p, wc);
431 break;
432 }
433
434 if (!MORE())
435 return (false);
436 c = PEEK();
437 /* we call { a repetition if followed by a digit */
438 if (!(c == '*' || c == '+' || c == '?' || c == '{'))
439 return (false); /* no repetition, we're done */
440 else if (c == '{')
441 (void) REQUIRE(MORE2() && \
442 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
443 NEXT();
444
445 (void) REQUIRE(!wascaret, REG_BADRPT);
446 switch (c) {
447 case '*': /* implemented as +? */
448 /* this case does not require the (y|) trick, noKLUDGE */
449 INSERT(OPLUS_, pos);
450 ASTERN(O_PLUS, pos);
451 INSERT(OQUEST_, pos);
452 ASTERN(O_QUEST, pos);
453 break;
454 case '+':
455 INSERT(OPLUS_, pos);
456 ASTERN(O_PLUS, pos);
457 break;
458 case '?':
459 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
468 count = p_count(p);
469 if (EAT(',')) {
470 if (isdigit((uch)PEEK())) {
471 count2 = p_count(p);
472 (void) REQUIRE(count <= count2, REG_BADBR);
473 } else /* single number with comma */
474 count2 = INFINITY;
475 } else /* just a single number */
476 count2 = count;
477 repeat(p, pos, count, count2);
478 if (!EAT('}')) { /* error heuristics */
479 while (MORE() && PEEK() != '}')
480 NEXT();
481 (void) REQUIRE(MORE(), REG_EBRACE);
482 SETERROR(REG_BADBR);
483 }
484 break;
485 }
486
487 if (!MORE())
488 return (false);
489 c = PEEK();
490 if (!(c == '*' || c == '+' || c == '?' ||
491 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
492 return (false);
493 SETERROR(REG_BADRPT);
494 return (false);
495 }
496
497 /*
498 * p_str - string (no metacharacters) "parser"
499 */
500 static void
501 p_str(struct parse *p)
502 {
503 (void) REQUIRE(MORE(), REG_BADPAT);
504 while (MORE())
505 ordinary(p, WGETNEXT());
506 }
507
508 /*
509 * Eat consecutive branch delimiters for the kind of expression that we are
510 * parsing, return the number of delimiters that we ate.
511 */
512 static int
513 p_branch_eat_delim(struct parse *p, struct branchc *bc)
514 {
515 int nskip;
516
517 (void) bc;
518 nskip = 0;
519 while (EAT('|'))
520 ++nskip;
521 return (nskip);
522 }
523
524 /*
525 * Insert necessary branch book-keeping operations. This emits a
526 * bogus 'next' offset, since we still have more to parse
527 */
528 static void
529 p_branch_ins_offset(struct parse *p, struct branchc *bc)
530 {
531 if (bc->nbranch == 0) {
532 INSERT(OCH_, bc->start); /* offset is wrong */
533 bc->fwd = bc->start;
534 bc->back = bc->start;
535 }
536
537 ASTERN(OOR1, bc->back);
538 bc->back = THERE();
539 AHEAD(bc->fwd); /* fix previous offset */
540 bc->fwd = HERE();
541 EMIT(OOR2, 0); /* offset is very wrong */
542 ++bc->nbranch;
543 }
544
545 /*
546 * Fix the offset of the tail branch, if we actually had any branches.
547 * This is to correct the bogus placeholder offset that we use.
548 */
549 static void
550 p_branch_fix_tail(struct parse *p, struct branchc *bc)
551 {
552 /* Fix bogus offset at the tail if we actually have branches */
553 if (bc->nbranch > 0) {
554 AHEAD(bc->fwd);
555 ASTERN(O_CH, bc->back);
556 }
557 }
558
559 /*
560 * Signal to the parser that an empty branch has been encountered; this will,
561 * in the future, be used to allow for more permissive behavior with empty
562 * branches. The return value should indicate whether parsing may continue
563 * or not.
564 */
565 static bool
566 p_branch_empty(struct parse *p, struct branchc *bc)
567 {
568 (void) bc;
569 SETERROR(REG_BADPAT);
570 return (false);
571 }
572
573 /*
574 * Take care of any branching requirements. This includes inserting the
575 * appropriate branching instructions as well as eating all of the branch
576 * delimiters until we either run out of pattern or need to parse more pattern.
577 */
578 static bool
579 p_branch_do(struct parse *p, struct branchc *bc)
580 {
581 int ate = 0;
582
583 ate = p_branch_eat_delim(p, bc);
584 if (ate == 0)
585 return (false);
586 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
587 /*
588 * Halt parsing only if we have an empty branch and
589 * p_branch_empty indicates that we must not continue.
590 * In the future, this will not necessarily be an error.
591 */
592 return (false);
593 p_branch_ins_offset(p, bc);
594
595 return (true);
596 }
597
598 static void
599 p_bre_pre_parse(struct parse *p, struct branchc *bc)
600 {
601 (void) bc;
602 /*
603 * Does not move cleanly into expression parser because of
604 * ordinary interpration of * at the beginning position of
605 * an expression.
606 */
607 if (EAT('^')) {
608 EMIT(OBOL, 0);
609 p->g->iflags |= USEBOL;
610 p->g->nbol++;
611 }
612 }
613
614 static void
615 p_bre_post_parse(struct parse *p, struct branchc *bc)
616 {
617 /* Expression is terminating due to EOL token */
618 if (bc->terminate) {
619 DROP(1);
620 EMIT(OEOL, 0);
621 p->g->iflags |= USEEOL;
622 p->g->neol++;
623 }
624 }
625
626 /*
627 * Top level parser, concatenation and BRE anchoring.
628 * Giving end1 as OUT essentially eliminates the end1/end2 check.
629 *
630 * This implementation is a bit of a kludge, in that a trailing $ is first
631 * taken as an ordinary character and then revised to be an anchor.
632 * The amount of lookahead needed to avoid this kludge is excessive.
633 */
634 static void
635 p_re(struct parse *p,
636 int end1, /* first terminating character */
637 int end2) /* second terminating character; ignored for EREs */
638 {
639 struct branchc bc;
640
641 bc.nbranch = 0;
642 if (end1 == OUT && end2 == OUT)
643 bc.outer = true;
644 else
645 bc.outer = false;
646 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
647 for (;;) {
648 bc.start = HERE();
649 bc.nchain = 0;
650 bc.terminate = false;
651 if (p->pre_parse != NULL)
652 p->pre_parse(p, &bc);
653 while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
654 !SEEEND()) {
655 bc.terminate = p->parse_expr(p, &bc);
656 ++bc.nchain;
657 }
658 if (p->post_parse != NULL)
659 p->post_parse(p, &bc);
660 (void) REQUIRE(HERE() != bc.start, REG_BADPAT);
661 if (!p->allowbranch)
662 break;
663 /*
664 * p_branch_do's return value indicates whether we should
665 * continue parsing or not. This is both for correctness and
666 * a slight optimization, because it will check if we've
667 * encountered an empty branch or the end of the string
668 * immediately following a branch delimiter.
669 */
670 if (!p_branch_do(p, &bc))
671 break;
672 }
673 #undef SEE_END
674 if (p->allowbranch)
675 p_branch_fix_tail(p, &bc);
676 assert(!MORE() || SEE(end1));
677 }
678
679 /*
680 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
681 */
682 static bool /* was the simple RE an unbackslashed $? */
683 p_simp_re(struct parse *p, struct branchc *bc)
684 {
685 int c;
686 int count;
687 int count2;
688 sopno pos;
689 int i;
690 wint_t wc;
691 sopno subno;
692 #define BACKSL (1<<CHAR_BIT)
693
694 pos = HERE(); /* repetition op, if any, covers from here */
695
696 assert(MORE()); /* caller should have ensured this */
697 c = GETNEXT();
698 if (c == '\\') {
699 (void) REQUIRE(MORE(), REG_EESCAPE);
700 c = BACKSL | GETNEXT();
701 }
702 switch (c) {
703 case '.':
709 case '[':
710 p_bracket(p);
711 break;
712 case BACKSL|'<':
713 EMIT(OBOW, 0);
714 break;
715 case BACKSL|'>':
716 EMIT(OEOW, 0);
717 break;
718 case BACKSL|'{':
719 SETERROR(REG_BADRPT);
720 break;
721 case BACKSL|'(':
722 p->g->nsub++;
723 subno = p->g->nsub;
724 if (subno < NPAREN)
725 p->pbegin[subno] = HERE();
726 EMIT(OLPAREN, subno);
727 /* the MORE here is an error heuristic */
728 if (MORE() && !SEETWO('\\', ')'))
729 p_re(p, '\\', ')');
730 if (subno < NPAREN) {
731 p->pend[subno] = HERE();
732 assert(p->pend[subno] != 0);
733 }
734 EMIT(ORPAREN, subno);
735 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
736 break;
737 case BACKSL|')': /* should not get here -- must be user */
738 SETERROR(REG_EPAREN);
739 break;
740 case BACKSL|'1':
741 case BACKSL|'2':
742 case BACKSL|'3':
743 case BACKSL|'4':
744 case BACKSL|'5':
745 case BACKSL|'6':
746 case BACKSL|'7':
747 case BACKSL|'8':
748 case BACKSL|'9':
749 i = (c&~BACKSL) - '0';
750 assert(i < NPAREN);
751 if (p->pend[i] != 0) {
752 assert(i <= p->g->nsub);
753 EMIT(OBACK_, i);
754 assert(p->pbegin[i] != 0);
755 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
756 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
757 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
758 EMIT(O_BACK, i);
759 } else
760 SETERROR(REG_ESUBREG);
761 p->g->backrefs = 1;
762 break;
763 case '*':
764 /*
765 * Ordinary if used as the first character beyond BOL anchor of
766 * a (sub-)expression, counts as a bad repetition operator if it
767 * appears otherwise.
768 */
769 (void) REQUIRE(bc->nchain == 0, REG_BADRPT);
770 /* FALLTHROUGH */
771 default:
772 if (p->error != 0)
773 return (false); /* Definitely not $... */
774 p->next--;
775 wc = WGETNEXT();
776 ordinary(p, wc);
777 break;
778 }
779
780 if (EAT('*')) { /* implemented as +? */
781 /* this case does not require the (y|) trick, noKLUDGE */
782 INSERT(OPLUS_, pos);
783 ASTERN(O_PLUS, pos);
784 INSERT(OQUEST_, pos);
785 ASTERN(O_QUEST, pos);
786 } else if (EATTWO('\\', '{')) {
787 count = p_count(p);
788 if (EAT(',')) {
789 if (MORE() && isdigit((uch)PEEK())) {
790 count2 = p_count(p);
791 (void) REQUIRE(count <= count2, REG_BADBR);
792 } else /* single number with comma */
793 count2 = INFINITY;
794 } else /* just a single number */
795 count2 = count;
796 repeat(p, pos, count, count2);
797 if (!EATTWO('\\', '}')) { /* error heuristics */
798 while (MORE() && !SEETWO('\\', '}'))
799 NEXT();
800 (void) REQUIRE(MORE(), REG_EBRACE);
801 SETERROR(REG_BADBR);
802 }
803 } else if (c == '$') /* $ (but not \$) ends it */
804 return (true);
805
806 return (false);
807 }
808
809 /*
810 * p_count - parse a repetition count
811 */
812 static int /* the value */
813 p_count(struct parse *p)
814 {
815 int count = 0;
816 int ndigits = 0;
817
818 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
819 count = count*10 + (GETNEXT() - '0');
820 ndigits++;
821 }
822
823 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
824 return (count);
825 }
826
1015 * p_b_coll_elem - parse a collating-element name and look it up
1016 */
1017 static wint_t /* value of collating element */
1018 p_b_coll_elem(struct parse *p,
1019 wint_t endc) /* name ended by endc,']' */
1020 {
1021 const char *sp = p->next;
1022 struct cname *cp;
1023 mbstate_t mbs;
1024 wchar_t wc;
1025 size_t clen, len;
1026
1027 while (MORE() && !SEETWO(endc, ']'))
1028 NEXT();
1029 if (!MORE()) {
1030 SETERROR(REG_EBRACK);
1031 return (0);
1032 }
1033 len = p->next - sp;
1034 for (cp = cnames; cp->name != NULL; cp++)
1035 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1036 return (cp->code); /* known name */
1037 (void) memset(&mbs, 0, sizeof (mbs));
1038 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1039 return (wc); /* single character */
1040 else if (clen == (size_t)-1 || clen == (size_t)-2)
1041 SETERROR(REG_ECHAR);
1042 else
1043 SETERROR(REG_ECOLLATE); /* neither */
1044 return (0);
1045 }
1046
1047 /*
1048 * othercase - return the case counterpart of an alphabetic
1049 */
1050 static wint_t /* if no counterpart, return ch */
1051 othercase(wint_t ch)
1052 {
1053 assert(iswalpha(ch));
1054 if (iswupper(ch))
1055 return (towlower(ch));
1558 (void) memset(&mbs, 0, sizeof (mbs));
1559 newstart = scan - 1;
1560 }
1561 clen = wcrtomb(buf, OPND(s), &mbs);
1562 if (clen == (size_t)-1)
1563 goto toohard;
1564 newlen += clen;
1565 break;
1566 case OPLUS_: /* things that don't break one */
1567 case OLPAREN:
1568 case ORPAREN:
1569 break;
1570 case OQUEST_: /* things that must be skipped */
1571 case OCH_:
1572 offset = altoffset(scan, offset);
1573 scan--;
1574 do {
1575 scan += OPND(s);
1576 s = *scan;
1577 /* assert() interferes w debug printouts */
1578 if (OP(s) != (sop)O_QUEST &&
1579 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1580 g->iflags |= BAD;
1581 return;
1582 }
1583 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1584 /* FALLTHROUGH */
1585 case OBOW: /* things that break a sequence */
1586 case OEOW:
1587 case OBOL:
1588 case OEOL:
1589 case O_QUEST:
1590 case O_CH:
1591 case OEND:
1592 if (newlen > (sopno)g->mlen) { /* ends one */
1593 start = newstart;
1594 g->mlen = newlen;
1595 if (offset > -1) {
1596 g->moffset += offset;
1597 offset = newlen;
1598 } else
1599 g->moffset = offset;
1600 } else {
1601 if (offset > -1)
1602 offset += newlen;
1603 }
1604 newlen = 0;
1605 break;
1606 case OANY:
1607 if (newlen > (sopno)g->mlen) { /* ends one */
1608 start = newstart;
1609 g->mlen = newlen;
1610 if (offset > -1) {
1611 g->moffset += offset;
1612 offset = newlen;
1613 } else
1614 g->moffset = offset;
1615 } else {
1616 if (offset > -1)
1617 offset += newlen;
1618 }
1619 if (offset > -1)
1620 offset++;
1621 newlen = 0;
1622 break;
1623 case OANYOF: /* may or may not invalidate offset */
1624 /* First, everything as OANY */
1625 if (newlen > (sopno)g->mlen) { /* ends one */
1626 start = newstart;
1627 g->mlen = newlen;
1628 if (offset > -1) {
1629 g->moffset += offset;
1630 offset = newlen;
1631 } else
1632 g->moffset = offset;
1633 } else {
1634 if (offset > -1)
1635 offset += newlen;
1636 }
1637 if (offset > -1)
1638 offset++;
1639 newlen = 0;
1640 break;
1641 toohard:
1642 default:
1643 /*
1644 * Anything here makes it impossible or too hard
1645 * to calculate the offset -- so we give up;
1646 * save the last known good offset, in case the
1647 * must sequence doesn't occur later.
1648 */
1649 if (newlen > (sopno)g->mlen) { /* ends one */
1650 start = newstart;
1651 g->mlen = newlen;
1652 if (offset > -1)
1653 g->moffset += offset;
1654 else
1655 g->moffset = offset;
1656 }
1657 offset = -1;
1658 newlen = 0;
1659 break;
1660 }
1661 } while (OP(s) != OEND);
1662
1663 if (g->mlen == 0) { /* there isn't one */
1664 g->moffset = -1;
1665 return;
1666 }
1667
1668 /* turn it into a character string */
1669 g->must = malloc((size_t)g->mlen + 1);
1689 /*
1690 * altoffset - choose biggest offset among multiple choices
1691 *
1692 * Compute, recursively if necessary, the largest offset among multiple
1693 * re paths.
1694 */
1695 static int
1696 altoffset(sop *scan, int offset)
1697 {
1698 int largest;
1699 int try;
1700 sop s;
1701
1702 /* If we gave up already on offsets, return */
1703 if (offset == -1)
1704 return (-1);
1705
1706 largest = 0;
1707 try = 0;
1708 s = *scan++;
1709 while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1710 switch (OP(s)) {
1711 case OOR1:
1712 if (try > largest)
1713 largest = try;
1714 try = 0;
1715 break;
1716 case OQUEST_:
1717 case OCH_:
1718 try = altoffset(scan, try);
1719 if (try == -1)
1720 return (-1);
1721 scan--;
1722 do {
1723 scan += OPND(s);
1724 s = *scan;
1725 if (OP(s) != (sop)O_QUEST &&
1726 OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1727 return (-1);
1728 } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1729 /*
1730 * We must skip to the next position, or we'll
1731 * leave altoffset() too early.
1732 */
1733 scan++;
1734 break;
1735 case OANYOF:
1736 case OCHAR:
1737 case OANY:
1738 try++;
1739 /*FALLTHRU*/
1740 case OBOW:
1741 case OEOW:
1742 case OLPAREN:
1743 case ORPAREN:
1744 case OOR2:
1745 break;
1746 default:
1747 try = -1;
1748 break;
|