1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright 2012 Milan Jurik. All rights reserved.
5 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 * Copyright (c) 1992, 1993, 1994
7 * The Regents of the University of California. All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * Henry Spencer.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37 #include "lint.h"
38 #include "file64.h"
39 #include <sys/types.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <limits.h>
44 #include <stdlib.h>
45 #include <regex.h>
46 #include <wchar.h>
47 #include <wctype.h>
48
49 #include "../locale/runetype.h"
50 #include "../locale/collate.h"
51
52 #include "utils.h"
53 #include "regex2.h"
54
55 #include "cname.h"
56 #include "../locale/mblocal.h"
57
58 /*
59 * parse structure, passed up and down to avoid global variables and
60 * other clumsinesses
61 */
62 struct parse {
63 const char *next; /* next character in RE */
64 const char *end; /* end of string (-> NUL normally) */
65 int error; /* has an error been seen? */
66 sop *strip; /* malloced strip */
67 sopno ssize; /* malloced strip size (allocated) */
68 sopno slen; /* malloced strip length (used) */
69 int ncsalloc; /* number of csets allocated */
70 struct re_guts *g;
71 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
73 sopno pend[NPAREN]; /* -> ) ([0] unused) */
74 };
75
76 /* ========= begin header generated by ./mkh ========= */
77 #ifdef __cplusplus
78 extern "C" {
79 #endif
80
81 /* === regcomp.c === */
82 static void p_ere(struct parse *p, int stop);
83 static void p_ere_exp(struct parse *p);
84 static void p_str(struct parse *p);
85 static void p_bre(struct parse *p, int end1, int end2);
86 static int p_simp_re(struct parse *p, int starordinary);
87 static int p_count(struct parse *p);
88 static void p_bracket(struct parse *p);
89 static void p_b_term(struct parse *p, cset *cs);
90 static void p_b_cclass(struct parse *p, cset *cs);
91 static void p_b_eclass(struct parse *p, cset *cs);
92 static wint_t p_b_symbol(struct parse *p);
93 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
94 static wint_t othercase(wint_t ch);
95 static void bothcases(struct parse *p, wint_t ch);
96 static void ordinary(struct parse *p, wint_t ch);
97 static void nonnewline(struct parse *p);
98 static void repeat(struct parse *p, sopno start, int from, int to);
99 static int seterr(struct parse *p, int e);
100 static cset *allocset(struct parse *p);
101 static void freeset(struct parse *p, cset *cs);
102 static void CHadd(struct parse *p, cset *cs, wint_t ch);
103 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
104 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
105 static wint_t singleton(cset *cs);
106 static sopno dupl(struct parse *p, sopno start, sopno finish);
107 static void doemit(struct parse *p, sop op, size_t opnd);
108 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
109 static void dofwd(struct parse *p, sopno pos, sop value);
110 static int enlarge(struct parse *p, sopno size);
111 static void stripsnug(struct parse *p, struct re_guts *g);
112 static void findmust(struct parse *p, struct re_guts *g);
113 static int altoffset(sop *scan, int offset);
114 static void computejumps(struct parse *p, struct re_guts *g);
115 static void computematchjumps(struct parse *p, struct re_guts *g);
116 static sopno pluscount(struct parse *p, struct re_guts *g);
117 static wint_t wgetnext(struct parse *p);
118
119 #ifdef __cplusplus
120 }
121 #endif
122 /* ========= end header generated by ./mkh ========= */
123
124 static char nuls[10]; /* place to point scanner in event of error */
125
126 /*
127 * macros for use with parse structure
128 * BEWARE: these know that the parse structure is named `p' !!!
129 */
130 #define PEEK() (*p->next)
131 #define PEEK2() (*(p->next+1))
132 #define MORE() (p->next < p->end)
133 #define MORE2() (p->next+1 < p->end)
134 #define SEE(c) (MORE() && PEEK() == (c))
135 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
136 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
137 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
138 #define NEXT() (p->next++)
139 #define NEXT2() (p->next += 2)
140 #define NEXTn(n) (p->next += (n))
141 #define GETNEXT() (*p->next++)
142 #define WGETNEXT() wgetnext(p)
143 #define SETERROR(e) ((void)seterr(p, (e)))
144 #define REQUIRE(co, e) ((co) || seterr(p, e))
145 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
146 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
147 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
148 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
149 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
150 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
151 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
152 #define HERE() (p->slen)
153 #define THERE() (p->slen - 1)
154 #define THERETHERE() (p->slen - 2)
155 #define DROP(n) (p->slen -= (n))
156
157 #ifndef NDEBUG
158 static int never = 0; /* for use in asserts; shuts lint up */
159 #else
160 #define never 0 /* some <assert.h>s have bugs too */
161 #endif
162
163 /*
164 * regcomp - interface for parser and compilation
165 */
166 int /* 0 success, otherwise REG_something */
167 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
168 int cflags)
169 {
170 struct parse pa;
171 struct re_guts *g;
172 struct parse *p = &pa;
173 int i;
174 size_t len;
175 size_t maxlen;
176 #ifdef REDEBUG
177 #define GOODFLAGS(f) (f)
178 #else
179 #define GOODFLAGS(f) ((f)&~REG_DUMP)
180 #endif
181
182 /* We had REG_INVARG, but we don't have that on Solaris. */
183 cflags = GOODFLAGS(cflags);
184 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
185 return (REG_EFATAL);
186
187 if (cflags®_PEND) {
188 if (preg->re_endp < pattern)
189 return (REG_EFATAL);
190 len = preg->re_endp - pattern;
191 } else
192 len = strlen(pattern);
193
194 /* do the mallocs early so failure handling is easy */
195 g = (struct re_guts *)malloc(sizeof (struct re_guts));
196 if (g == NULL)
197 return (REG_ESPACE);
198 /*
199 * Limit the pattern space to avoid a 32-bit overflow on buffer
200 * extension. Also avoid any signed overflow in case of conversion
201 * so make the real limit based on a 31-bit overflow.
202 *
203 * Likely not applicable on 64-bit systems but handle the case
204 * generically (who are we to stop people from using ~715MB+
205 * patterns?).
206 */
207 maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
208 if (len >= maxlen) {
209 free((char *)g);
210 return (REG_ESPACE);
211 }
212 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
213 assert(p->ssize >= len);
214
215 p->strip = (sop *)malloc(p->ssize * sizeof (sop));
216 p->slen = 0;
217 if (p->strip == NULL) {
218 free((char *)g);
219 return (REG_ESPACE);
220 }
221
222 /* set things up */
223 p->g = g;
224 p->next = pattern; /* convenience; we do not modify it */
225 p->end = p->next + len;
226 p->error = 0;
227 p->ncsalloc = 0;
228 for (i = 0; i < NPAREN; i++) {
229 p->pbegin[i] = 0;
230 p->pend[i] = 0;
231 }
232 g->sets = NULL;
233 g->ncsets = 0;
234 g->cflags = cflags;
235 g->iflags = 0;
236 g->nbol = 0;
237 g->neol = 0;
238 g->must = NULL;
239 g->moffset = -1;
240 g->charjump = NULL;
241 g->matchjump = NULL;
242 g->mlen = 0;
243 g->nsub = 0;
244 g->backrefs = 0;
245
246 /* do it */
247 EMIT(OEND, 0);
248 g->firststate = THERE();
249 if (cflags®_EXTENDED)
250 p_ere(p, OUT);
251 else if (cflags®_NOSPEC)
252 p_str(p);
253 else
254 p_bre(p, OUT, OUT);
255 EMIT(OEND, 0);
256 g->laststate = THERE();
257
258 /* tidy up loose ends and fill things in */
259 stripsnug(p, g);
260 findmust(p, g);
261 /*
262 * only use Boyer-Moore algorithm if the pattern is bigger
263 * than three characters
264 */
265 if (g->mlen > 3) {
266 computejumps(p, g);
267 computematchjumps(p, g);
268 if (g->matchjump == NULL && g->charjump != NULL) {
269 free(g->charjump);
270 g->charjump = NULL;
271 }
272 }
273 g->nplus = pluscount(p, g);
274 g->magic = MAGIC2;
275 preg->re_nsub = g->nsub;
276 preg->re_g = g;
277 preg->re_magic = MAGIC1;
278 #ifndef REDEBUG
279 /* not debugging, so can't rely on the assert() in regexec() */
280 if (g->iflags&BAD)
281 SETERROR(REG_EFATAL);
282 #endif
283
284 /* win or lose, we're done */
285 if (p->error != 0) /* lose */
286 regfree(preg);
287 return (p->error);
288 }
289
290 /*
291 * p_ere - ERE parser top level, concatenation and alternation
292 */
293 static void
294 p_ere(struct parse *p,
295 int stop) /* character this ERE should end at */
296 {
297 char c;
298 sopno prevback;
299 sopno prevfwd;
300 sopno conc;
301 int first = 1; /* is this the first alternative? */
302
303 for (;;) {
304 /* do a bunch of concatenated expressions */
305 conc = HERE();
306 while (MORE() && (c = PEEK()) != '|' && c != stop)
307 p_ere_exp(p);
308 /* require nonempty */
309 (void) REQUIRE(HERE() != conc, REG_BADPAT);
310
311 if (!EAT('|'))
312 break; /* NOTE BREAK OUT */
313
314 if (first) {
315 INSERT(OCH_, conc); /* offset is wrong */
316 prevfwd = conc;
317 prevback = conc;
318 first = 0;
319 }
320 ASTERN(OOR1, prevback);
321 prevback = THERE();
322 AHEAD(prevfwd); /* fix previous offset */
323 prevfwd = HERE();
324 EMIT(OOR2, 0); /* offset is very wrong */
325 }
326
327 if (!first) { /* tail-end fixups */
328 AHEAD(prevfwd);
329 ASTERN(O_CH, prevback);
330 }
331
332 assert(!MORE() || SEE(stop));
333 }
334
335 /*
336 * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
337 */
338 static void
339 p_ere_exp(struct parse *p)
340 {
341 char c;
342 wint_t wc;
343 sopno pos;
344 int count;
345 int count2;
346 sopno subno;
347 int wascaret = 0;
348
349 assert(MORE()); /* caller should have ensured this */
350 c = GETNEXT();
351
352 pos = HERE();
353 switch (c) {
354 case '(':
355 (void) REQUIRE(MORE(), REG_EPAREN);
356 p->g->nsub++;
357 subno = p->g->nsub;
358 if (subno < NPAREN)
359 p->pbegin[subno] = HERE();
360 EMIT(OLPAREN, subno);
361 if (!SEE(')'))
362 p_ere(p, ')');
363 if (subno < NPAREN) {
364 p->pend[subno] = HERE();
365 assert(p->pend[subno] != 0);
366 }
367 EMIT(ORPAREN, subno);
368 (void) MUSTEAT(')', REG_EPAREN);
369 break;
370 #ifndef POSIX_MISTAKE
371 case ')': /* happens only if no current unmatched ( */
372 /*
373 * You may ask, why the ifndef? Because I didn't notice
374 * this until slightly too late for 1003.2, and none of the
375 * other 1003.2 regular-expression reviewers noticed it at
376 * all. So an unmatched ) is legal POSIX, at least until
377 * we can get it fixed.
378 */
379 SETERROR(REG_EPAREN);
380 break;
381 #endif
382 case '^':
383 EMIT(OBOL, 0);
384 p->g->iflags |= USEBOL;
385 p->g->nbol++;
386 wascaret = 1;
387 break;
388 case '$':
389 EMIT(OEOL, 0);
390 p->g->iflags |= USEEOL;
391 p->g->neol++;
392 break;
393 case '|':
394 SETERROR(REG_BADPAT);
395 break;
396 case '*':
397 case '+':
398 case '?':
399 case '{':
400 SETERROR(REG_BADRPT);
401 break;
402 case '.':
403 if (p->g->cflags®_NEWLINE)
404 nonnewline(p);
405 else
406 EMIT(OANY, 0);
407 break;
408 case '[':
409 p_bracket(p);
410 break;
411 case '\\':
412 (void) REQUIRE(MORE(), REG_EESCAPE);
413 wc = WGETNEXT();
414 switch (wc) {
415 case '<':
416 EMIT(OBOW, 0);
417 break;
418 case '>':
419 EMIT(OEOW, 0);
420 break;
421 default:
422 ordinary(p, wc);
423 break;
424 }
425 break;
426 default:
427 if (p->error != 0)
428 return;
429 p->next--;
430 wc = WGETNEXT();
431 ordinary(p, wc);
432 break;
433 }
434
435 if (!MORE())
436 return;
437 c = PEEK();
438 /* we call { a repetition if followed by a digit */
439 if (!(c == '*' || c == '+' || c == '?' || c == '{'))
440 return; /* no repetition, we're done */
441 else if (c == '{')
442 (void) REQUIRE(MORE2() && \
443 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
444 NEXT();
445
446 (void) REQUIRE(!wascaret, REG_BADRPT);
447 switch (c) {
448 case '*': /* implemented as +? */
449 /* this case does not require the (y|) trick, noKLUDGE */
450 INSERT(OPLUS_, pos);
451 ASTERN(O_PLUS, pos);
452 INSERT(OQUEST_, pos);
453 ASTERN(O_QUEST, pos);
454 break;
455 case '+':
456 INSERT(OPLUS_, pos);
457 ASTERN(O_PLUS, pos);
458 break;
459 case '?':
460 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
461 INSERT(OCH_, pos); /* offset slightly wrong */
462 ASTERN(OOR1, pos); /* this one's right */
463 AHEAD(pos); /* fix the OCH_ */
464 EMIT(OOR2, 0); /* offset very wrong... */
465 AHEAD(THERE()); /* ...so fix it */
466 ASTERN(O_CH, THERETHERE());
467 break;
468 case '{':
469 count = p_count(p);
470 if (EAT(',')) {
471 if (isdigit((uch)PEEK())) {
472 count2 = p_count(p);
473 (void) REQUIRE(count <= count2, REG_BADBR);
474 } else /* single number with comma */
475 count2 = INFINITY;
476 } else /* just a single number */
477 count2 = count;
478 repeat(p, pos, count, count2);
479 if (!EAT('}')) { /* error heuristics */
480 while (MORE() && PEEK() != '}')
481 NEXT();
482 (void) REQUIRE(MORE(), REG_EBRACE);
483 SETERROR(REG_BADBR);
484 }
485 break;
486 }
487
488 if (!MORE())
489 return;
490 c = PEEK();
491 if (!(c == '*' || c == '+' || c == '?' ||
492 (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
493 return;
494 SETERROR(REG_BADRPT);
495 }
496
497 /*
498 * p_str - string (no metacharacters) "parser"
499 */
500 static void
501 p_str(struct parse *p)
502 {
503 (void) REQUIRE(MORE(), REG_BADPAT);
504 while (MORE())
505 ordinary(p, WGETNEXT());
506 }
507
508 /*
509 * p_bre - BRE parser top level, anchoring and concatenation
510 * Giving end1 as OUT essentially eliminates the end1/end2 check.
511 *
512 * This implementation is a bit of a kludge, in that a trailing $ is first
513 * taken as an ordinary character and then revised to be an anchor.
514 * The amount of lookahead needed to avoid this kludge is excessive.
515 */
516 static void
517 p_bre(struct parse *p,
518 int end1, /* first terminating character */
519 int end2) /* second terminating character */
520 {
521 sopno start = HERE();
522 int first = 1; /* first subexpression? */
523 int wasdollar = 0;
524
525 if (EAT('^')) {
526 EMIT(OBOL, 0);
527 p->g->iflags |= USEBOL;
528 p->g->nbol++;
529 }
530 while (MORE() && !SEETWO(end1, end2)) {
531 wasdollar = p_simp_re(p, first);
532 first = 0;
533 }
534 if (wasdollar) { /* oops, that was a trailing anchor */
535 DROP(1);
536 EMIT(OEOL, 0);
537 p->g->iflags |= USEEOL;
538 p->g->neol++;
539 }
540
541 (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */
542 }
543
544 /*
545 * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
546 */
547 static int /* was the simple RE an unbackslashed $? */
548 p_simp_re(struct parse *p,
549 int starordinary) /* is a leading * an ordinary character? */
550 {
551 int c;
552 int count;
553 int count2;
554 sopno pos;
555 int i;
556 wint_t wc;
557 sopno subno;
558 #define BACKSL (1<<CHAR_BIT)
559
560 pos = HERE(); /* repetition op, if any, covers from here */
561
562 assert(MORE()); /* caller should have ensured this */
563 c = GETNEXT();
564 if (c == '\\') {
565 (void) REQUIRE(MORE(), REG_EESCAPE);
566 c = BACKSL | GETNEXT();
567 }
568 switch (c) {
569 case '.':
570 if (p->g->cflags®_NEWLINE)
571 nonnewline(p);
572 else
573 EMIT(OANY, 0);
574 break;
575 case '[':
576 p_bracket(p);
577 break;
578 case BACKSL|'<':
579 EMIT(OBOW, 0);
580 break;
581 case BACKSL|'>':
582 EMIT(OEOW, 0);
583 break;
584 case BACKSL|'{':
585 SETERROR(REG_BADRPT);
586 break;
587 case BACKSL|'(':
588 p->g->nsub++;
589 subno = p->g->nsub;
590 if (subno < NPAREN)
591 p->pbegin[subno] = HERE();
592 EMIT(OLPAREN, subno);
593 /* the MORE here is an error heuristic */
594 if (MORE() && !SEETWO('\\', ')'))
595 p_bre(p, '\\', ')');
596 if (subno < NPAREN) {
597 p->pend[subno] = HERE();
598 assert(p->pend[subno] != 0);
599 }
600 EMIT(ORPAREN, subno);
601 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
602 break;
603 case BACKSL|')': /* should not get here -- must be user */
604 SETERROR(REG_EPAREN);
605 break;
606 case BACKSL|'1':
607 case BACKSL|'2':
608 case BACKSL|'3':
609 case BACKSL|'4':
610 case BACKSL|'5':
611 case BACKSL|'6':
612 case BACKSL|'7':
613 case BACKSL|'8':
614 case BACKSL|'9':
615 i = (c&~BACKSL) - '0';
616 assert(i < NPAREN);
617 if (p->pend[i] != 0) {
618 assert(i <= p->g->nsub);
619 EMIT(OBACK_, i);
620 assert(p->pbegin[i] != 0);
621 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
622 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
623 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
624 EMIT(O_BACK, i);
625 } else
626 SETERROR(REG_ESUBREG);
627 p->g->backrefs = 1;
628 break;
629 case '*':
630 (void) REQUIRE(starordinary, REG_BADRPT);
631 /* FALLTHROUGH */
632 default:
633 if (p->error != 0)
634 return (0); /* Definitely not $... */
635 p->next--;
636 wc = WGETNEXT();
637 ordinary(p, wc);
638 break;
639 }
640
641 if (EAT('*')) { /* implemented as +? */
642 /* this case does not require the (y|) trick, noKLUDGE */
643 INSERT(OPLUS_, pos);
644 ASTERN(O_PLUS, pos);
645 INSERT(OQUEST_, pos);
646 ASTERN(O_QUEST, pos);
647 } else if (EATTWO('\\', '{')) {
648 count = p_count(p);
649 if (EAT(',')) {
650 if (MORE() && isdigit((uch)PEEK())) {
651 count2 = p_count(p);
652 (void) REQUIRE(count <= count2, REG_BADBR);
653 } else /* single number with comma */
654 count2 = INFINITY;
655 } else /* just a single number */
656 count2 = count;
657 repeat(p, pos, count, count2);
658 if (!EATTWO('\\', '}')) { /* error heuristics */
659 while (MORE() && !SEETWO('\\', '}'))
660 NEXT();
661 (void) REQUIRE(MORE(), REG_EBRACE);
662 SETERROR(REG_BADBR);
663 }
664 } else if (c == '$') /* $ (but not \$) ends it */
665 return (1);
666
667 return (0);
668 }
669
670 /*
671 * p_count - parse a repetition count
672 */
673 static int /* the value */
674 p_count(struct parse *p)
675 {
676 int count = 0;
677 int ndigits = 0;
678
679 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
680 count = count*10 + (GETNEXT() - '0');
681 ndigits++;
682 }
683
684 (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
685 return (count);
686 }
687
688 /*
689 * p_bracket - parse a bracketed character list
690 */
691 static void
692 p_bracket(struct parse *p)
693 {
694 cset *cs;
695 wint_t ch;
696
697 /* Dept of Truly Sickening Special-Case Kludges */
698 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
699 EMIT(OBOW, 0);
700 NEXTn(6);
701 return;
702 }
703 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
704 EMIT(OEOW, 0);
705 NEXTn(6);
706 return;
707 }
708
709 if ((cs = allocset(p)) == NULL)
710 return;
711
712 if (p->g->cflags®_ICASE)
713 cs->icase = 1;
714 if (EAT('^'))
715 cs->invert = 1;
716 if (EAT(']'))
717 CHadd(p, cs, ']');
718 else if (EAT('-'))
719 CHadd(p, cs, '-');
720 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
721 p_b_term(p, cs);
722 if (EAT('-'))
723 CHadd(p, cs, '-');
724 (void) MUSTEAT(']', REG_EBRACK);
725
726 if (p->error != 0) /* don't mess things up further */
727 return;
728
729 if (cs->invert && p->g->cflags®_NEWLINE)
730 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
731
732 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
733 ordinary(p, ch);
734 freeset(p, cs);
735 } else
736 EMIT(OANYOF, (int)(cs - p->g->sets));
737 }
738
739 /*
740 * p_b_term - parse one term of a bracketed character list
741 */
742 static void
743 p_b_term(struct parse *p, cset *cs)
744 {
745 char c;
746 wint_t start, finish;
747 wint_t i;
748 locale_t loc = uselocale(NULL);
749
750 /* classify what we've got */
751 switch ((MORE()) ? PEEK() : '\0') {
752 case '[':
753 c = (MORE2()) ? PEEK2() : '\0';
754 break;
755 case '-':
756 SETERROR(REG_ERANGE);
757 return; /* NOTE RETURN */
758 default:
759 c = '\0';
760 break;
761 }
762
763 switch (c) {
764 case ':': /* character class */
765 NEXT2();
766 (void) REQUIRE(MORE(), REG_EBRACK);
767 c = PEEK();
768 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
769 p_b_cclass(p, cs);
770 (void) REQUIRE(MORE(), REG_EBRACK);
771 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
772 break;
773 case '=': /* equivalence class */
774 NEXT2();
775 (void) REQUIRE(MORE(), REG_EBRACK);
776 c = PEEK();
777 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
778 p_b_eclass(p, cs);
779 (void) REQUIRE(MORE(), REG_EBRACK);
780 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
781 break;
782 default: /* symbol, ordinary character, or range */
783 start = p_b_symbol(p);
784 if (SEE('-') && MORE2() && PEEK2() != ']') {
785 /* range */
786 NEXT();
787 if (EAT('-'))
788 finish = '-';
789 else
790 finish = p_b_symbol(p);
791 } else
792 finish = start;
793 if (start == finish)
794 CHadd(p, cs, start);
795 else {
796 if (loc->collate->lc_is_posix) {
797 (void) REQUIRE((uch)start <= (uch)finish,
798 REG_ERANGE);
799 CHaddrange(p, cs, start, finish);
800 } else {
801 (void) REQUIRE(_collate_range_cmp(start,
802 finish, loc) <= 0, REG_ERANGE);
803 for (i = 0; i <= UCHAR_MAX; i++) {
804 if (_collate_range_cmp(start, i, loc)
805 <= 0 &&
806 _collate_range_cmp(i, finish, loc)
807 <= 0)
808 CHadd(p, cs, i);
809 }
810 }
811 }
812 break;
813 }
814 }
815
816 /*
817 * p_b_cclass - parse a character-class name and deal with it
818 */
819 static void
820 p_b_cclass(struct parse *p, cset *cs)
821 {
822 const char *sp = p->next;
823 size_t len;
824 wctype_t wct;
825 char clname[16];
826
827 while (MORE() && isalpha((uch)PEEK()))
828 NEXT();
829 len = p->next - sp;
830 if (len >= sizeof (clname) - 1) {
831 SETERROR(REG_ECTYPE);
832 return;
833 }
834 (void) memcpy(clname, sp, len);
835 clname[len] = '\0';
836 if ((wct = wctype(clname)) == 0) {
837 SETERROR(REG_ECTYPE);
838 return;
839 }
840 CHaddtype(p, cs, wct);
841 }
842
843 /*
844 * p_b_eclass - parse an equivalence-class name and deal with it
845 *
846 * This implementation is incomplete. xxx
847 */
848 static void
849 p_b_eclass(struct parse *p, cset *cs)
850 {
851 wint_t c;
852
853 c = p_b_coll_elem(p, '=');
854 CHadd(p, cs, c);
855 }
856
857 /*
858 * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
859 */
860 static wint_t /* value of symbol */
861 p_b_symbol(struct parse *p)
862 {
863 wint_t value;
864
865 (void) REQUIRE(MORE(), REG_EBRACK);
866 if (!EATTWO('[', '.'))
867 return (WGETNEXT());
868
869 /* collating symbol */
870 value = p_b_coll_elem(p, '.');
871 (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
872 return (value);
873 }
874
875 /*
876 * p_b_coll_elem - parse a collating-element name and look it up
877 */
878 static wint_t /* value of collating element */
879 p_b_coll_elem(struct parse *p,
880 wint_t endc) /* name ended by endc,']' */
881 {
882 const char *sp = p->next;
883 struct cname *cp;
884 mbstate_t mbs;
885 wchar_t wc;
886 size_t clen, len;
887
888 while (MORE() && !SEETWO(endc, ']'))
889 NEXT();
890 if (!MORE()) {
891 SETERROR(REG_EBRACK);
892 return (0);
893 }
894 len = p->next - sp;
895 for (cp = cnames; cp->name != NULL; cp++)
896 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
897 return (cp->code); /* known name */
898 (void) memset(&mbs, 0, sizeof (mbs));
899 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
900 return (wc); /* single character */
901 else if (clen == (size_t)-1 || clen == (size_t)-2)
902 SETERROR(REG_ECHAR);
903 else
904 SETERROR(REG_ECOLLATE); /* neither */
905 return (0);
906 }
907
908 /*
909 * othercase - return the case counterpart of an alphabetic
910 */
911 static wint_t /* if no counterpart, return ch */
912 othercase(wint_t ch)
913 {
914 assert(iswalpha(ch));
915 if (iswupper(ch))
916 return (towlower(ch));
917 else if (iswlower(ch))
918 return (towupper(ch));
919 else /* peculiar, but could happen */
920 return (ch);
921 }
922
923 /*
924 * bothcases - emit a dualcase version of a two-case character
925 *
926 * Boy, is this implementation ever a kludge...
927 */
928 static void
929 bothcases(struct parse *p, wint_t ch)
930 {
931 const char *oldnext = p->next;
932 const char *oldend = p->end;
933 char bracket[3 + MB_LEN_MAX];
934 size_t n;
935 mbstate_t mbs;
936
937 assert(othercase(ch) != ch); /* p_bracket() would recurse */
938 p->next = bracket;
939 (void) memset(&mbs, 0, sizeof (mbs));
940 n = wcrtomb(bracket, ch, &mbs);
941 assert(n != (size_t)-1);
942 bracket[n] = ']';
943 bracket[n + 1] = '\0';
944 p->end = bracket+n+1;
945 p_bracket(p);
946 assert(p->next == p->end);
947 p->next = oldnext;
948 p->end = oldend;
949 }
950
951 /*
952 * ordinary - emit an ordinary character
953 */
954 static void
955 ordinary(struct parse *p, wint_t ch)
956 {
957 cset *cs;
958
959 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
960 bothcases(p, ch);
961 else if ((ch & OPDMASK) == ch)
962 EMIT(OCHAR, ch);
963 else {
964 /*
965 * Kludge: character is too big to fit into an OCHAR operand.
966 * Emit a singleton set.
967 */
968 if ((cs = allocset(p)) == NULL)
969 return;
970 CHadd(p, cs, ch);
971 EMIT(OANYOF, (int)(cs - p->g->sets));
972 }
973 }
974
975 /*
976 * nonnewline - emit REG_NEWLINE version of OANY
977 *
978 * Boy, is this implementation ever a kludge...
979 */
980 static void
981 nonnewline(struct parse *p)
982 {
983 const char *oldnext = p->next;
984 const char *oldend = p->end;
985 char bracket[4];
986
987 p->next = bracket;
988 p->end = bracket+3;
989 bracket[0] = '^';
990 bracket[1] = '\n';
991 bracket[2] = ']';
992 bracket[3] = '\0';
993 p_bracket(p);
994 assert(p->next == bracket+3);
995 p->next = oldnext;
996 p->end = oldend;
997 }
998
999 /*
1000 * repeat - generate code for a bounded repetition, recursively if needed
1001 */
1002 static void
1003 repeat(struct parse *p,
1004 sopno start, /* operand from here to end of strip */
1005 int from, /* repeated from this number */
1006 int to) /* to this number of times (maybe INFINITY) */
1007 {
1008 sopno finish = HERE();
1009 #define N 2
1010 #define INF 3
1011 #define REP(f, t) ((f)*8 + (t))
1012 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1013 sopno copy;
1014
1015 if (p->error != 0) /* head off possible runaway recursion */
1016 return;
1017
1018 assert(from <= to);
1019
1020 switch (REP(MAP(from), MAP(to))) {
1021 case REP(0, 0): /* must be user doing this */
1022 DROP(finish-start); /* drop the operand */
1023 break;
1024 case REP(0, 1): /* as x{1,1}? */
1025 case REP(0, N): /* as x{1,n}? */
1026 case REP(0, INF): /* as x{1,}? */
1027 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1028 INSERT(OCH_, start); /* offset is wrong... */
1029 repeat(p, start+1, 1, to);
1030 ASTERN(OOR1, start);
1031 AHEAD(start); /* ... fix it */
1032 EMIT(OOR2, 0);
1033 AHEAD(THERE());
1034 ASTERN(O_CH, THERETHERE());
1035 break;
1036 case REP(1, 1): /* trivial case */
1037 /* done */
1038 break;
1039 case REP(1, N): /* as x?x{1,n-1} */
1040 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1041 INSERT(OCH_, start);
1042 ASTERN(OOR1, start);
1043 AHEAD(start);
1044 EMIT(OOR2, 0); /* offset very wrong... */
1045 AHEAD(THERE()); /* ...so fix it */
1046 ASTERN(O_CH, THERETHERE());
1047 copy = dupl(p, start+1, finish+1);
1048 assert(copy == finish+4);
1049 repeat(p, copy, 1, to-1);
1050 break;
1051 case REP(1, INF): /* as x+ */
1052 INSERT(OPLUS_, start);
1053 ASTERN(O_PLUS, start);
1054 break;
1055 case REP(N, N): /* as xx{m-1,n-1} */
1056 copy = dupl(p, start, finish);
1057 repeat(p, copy, from-1, to-1);
1058 break;
1059 case REP(N, INF): /* as xx{n-1,INF} */
1060 copy = dupl(p, start, finish);
1061 repeat(p, copy, from-1, to);
1062 break;
1063 default: /* "can't happen" */
1064 SETERROR(REG_EFATAL); /* just in case */
1065 break;
1066 }
1067 }
1068
1069 /*
1070 * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1071 * character from the parse struct, signals a REG_ILLSEQ error if the
1072 * character can't be converted. Returns the number of bytes consumed.
1073 */
1074 static wint_t
1075 wgetnext(struct parse *p)
1076 {
1077 mbstate_t mbs;
1078 wchar_t wc;
1079 size_t n;
1080
1081 (void) memset(&mbs, 0, sizeof (mbs));
1082 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1083 if (n == (size_t)-1 || n == (size_t)-2) {
1084 SETERROR(REG_ECHAR);
1085 return (0);
1086 }
1087 if (n == 0)
1088 n = 1;
1089 p->next += n;
1090 return (wc);
1091 }
1092
1093 /*
1094 * seterr - set an error condition
1095 */
1096 static int /* useless but makes type checking happy */
1097 seterr(struct parse *p, int e)
1098 {
1099 if (p->error == 0) /* keep earliest error condition */
1100 p->error = e;
1101 p->next = nuls; /* try to bring things to a halt */
1102 p->end = nuls;
1103 return (0); /* make the return value well-defined */
1104 }
1105
1106 /*
1107 * allocset - allocate a set of characters for []
1108 */
1109 static cset *
1110 allocset(struct parse *p)
1111 {
1112 cset *cs, *ncs;
1113
1114 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1115 if (ncs == NULL) {
1116 SETERROR(REG_ESPACE);
1117 return (NULL);
1118 }
1119 p->g->sets = ncs;
1120 cs = &p->g->sets[p->g->ncsets++];
1121 (void) memset(cs, 0, sizeof (*cs));
1122
1123 return (cs);
1124 }
1125
1126 /*
1127 * freeset - free a now-unused set
1128 */
1129 static void
1130 freeset(struct parse *p, cset *cs)
1131 {
1132 cset *top = &p->g->sets[p->g->ncsets];
1133
1134 free(cs->wides);
1135 free(cs->ranges);
1136 free(cs->types);
1137 (void) memset(cs, 0, sizeof (*cs));
1138 if (cs == top-1) /* recover only the easy case */
1139 p->g->ncsets--;
1140 }
1141
1142 /*
1143 * singleton - Determine whether a set contains only one character,
1144 * returning it if so, otherwise returning OUT.
1145 */
1146 static wint_t
1147 singleton(cset *cs)
1148 {
1149 wint_t i, s, n;
1150
1151 for (i = n = 0; i < NC; i++)
1152 if (CHIN(cs, i)) {
1153 n++;
1154 s = i;
1155 }
1156 if (n == 1)
1157 return (s);
1158 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1159 cs->icase == 0)
1160 return (cs->wides[0]);
1161 /* Don't bother handling the other cases. */
1162 return (OUT);
1163 }
1164
1165 /*
1166 * CHadd - add character to character set.
1167 */
1168 static void
1169 CHadd(struct parse *p, cset *cs, wint_t ch)
1170 {
1171 wint_t nch, *newwides;
1172 assert(ch >= 0);
1173 if (ch < NC)
1174 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1175 else {
1176 newwides = realloc(cs->wides, (cs->nwides + 1) *
1177 sizeof (*cs->wides));
1178 if (newwides == NULL) {
1179 SETERROR(REG_ESPACE);
1180 return;
1181 }
1182 cs->wides = newwides;
1183 cs->wides[cs->nwides++] = ch;
1184 }
1185 if (cs->icase) {
1186 if ((nch = towlower(ch)) < NC)
1187 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1188 if ((nch = towupper(ch)) < NC)
1189 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1190 }
1191 }
1192
1193 /*
1194 * CHaddrange - add all characters in the range [min,max] to a character set.
1195 */
1196 static void
1197 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1198 {
1199 crange *newranges;
1200
1201 for (; min < NC && min <= max; min++)
1202 CHadd(p, cs, min);
1203 if (min >= max)
1204 return;
1205 newranges = realloc(cs->ranges, (cs->nranges + 1) *
1206 sizeof (*cs->ranges));
1207 if (newranges == NULL) {
1208 SETERROR(REG_ESPACE);
1209 return;
1210 }
1211 cs->ranges = newranges;
1212 cs->ranges[cs->nranges].min = min;
1213 cs->ranges[cs->nranges].max = max;
1214 cs->nranges++;
1215 }
1216
1217 /*
1218 * CHaddtype - add all characters of a certain type to a character set.
1219 */
1220 static void
1221 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1222 {
1223 wint_t i;
1224 wctype_t *newtypes;
1225
1226 for (i = 0; i < NC; i++)
1227 if (iswctype(i, wct))
1228 CHadd(p, cs, i);
1229 newtypes = realloc(cs->types, (cs->ntypes + 1) *
1230 sizeof (*cs->types));
1231 if (newtypes == NULL) {
1232 SETERROR(REG_ESPACE);
1233 return;
1234 }
1235 cs->types = newtypes;
1236 cs->types[cs->ntypes++] = wct;
1237 }
1238
1239 /*
1240 * dupl - emit a duplicate of a bunch of sops
1241 */
1242 static sopno /* start of duplicate */
1243 dupl(struct parse *p,
1244 sopno start, /* from here */
1245 sopno finish) /* to this less one */
1246 {
1247 sopno ret = HERE();
1248 sopno len = finish - start;
1249
1250 assert(finish >= start);
1251 if (len == 0)
1252 return (ret);
1253 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1254 return (ret);
1255 assert(p->ssize >= p->slen + len);
1256 (void) memcpy((char *)(p->strip + p->slen),
1257 (char *)(p->strip + start), (size_t)len*sizeof (sop));
1258 p->slen += len;
1259 return (ret);
1260 }
1261
1262 /*
1263 * doemit - emit a strip operator
1264 *
1265 * It might seem better to implement this as a macro with a function as
1266 * hard-case backup, but it's just too big and messy unless there are
1267 * some changes to the data structures. Maybe later.
1268 */
1269 static void
1270 doemit(struct parse *p, sop op, size_t opnd)
1271 {
1272 /* avoid making error situations worse */
1273 if (p->error != 0)
1274 return;
1275
1276 /* deal with oversize operands ("can't happen", more or less) */
1277 assert(opnd < 1<<OPSHIFT);
1278
1279 /* deal with undersized strip */
1280 if (p->slen >= p->ssize)
1281 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1282 return;
1283
1284 /* finally, it's all reduced to the easy case */
1285 p->strip[p->slen++] = SOP(op, opnd);
1286 }
1287
1288 /*
1289 * doinsert - insert a sop into the strip
1290 */
1291 static void
1292 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1293 {
1294 sopno sn;
1295 sop s;
1296 int i;
1297
1298 /* avoid making error situations worse */
1299 if (p->error != 0)
1300 return;
1301
1302 sn = HERE();
1303 EMIT(op, opnd); /* do checks, ensure space */
1304 assert(HERE() == sn+1);
1305 s = p->strip[sn];
1306
1307 /* adjust paren pointers */
1308 assert(pos > 0);
1309 for (i = 1; i < NPAREN; i++) {
1310 if (p->pbegin[i] >= pos) {
1311 p->pbegin[i]++;
1312 }
1313 if (p->pend[i] >= pos) {
1314 p->pend[i]++;
1315 }
1316 }
1317
1318 (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1319 (HERE()-pos-1)*sizeof (sop));
1320 p->strip[pos] = s;
1321 }
1322
1323 /*
1324 * dofwd - complete a forward reference
1325 */
1326 static void
1327 dofwd(struct parse *p, sopno pos, sop value)
1328 {
1329 /* avoid making error situations worse */
1330 if (p->error != 0)
1331 return;
1332
1333 assert(value < 1<<OPSHIFT);
1334 p->strip[pos] = OP(p->strip[pos]) | value;
1335 }
1336
1337 /*
1338 * enlarge - enlarge the strip
1339 */
1340 static int
1341 enlarge(struct parse *p, sopno size)
1342 {
1343 sop *sp;
1344
1345 if (p->ssize >= size)
1346 return (1);
1347
1348 sp = (sop *)realloc(p->strip, size*sizeof (sop));
1349 if (sp == NULL) {
1350 SETERROR(REG_ESPACE);
1351 return (0);
1352 }
1353 p->strip = sp;
1354 p->ssize = size;
1355 return (1);
1356 }
1357
1358 /*
1359 * stripsnug - compact the strip
1360 */
1361 static void
1362 stripsnug(struct parse *p, struct re_guts *g)
1363 {
1364 g->nstates = p->slen;
1365 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1366 if (g->strip == NULL) {
1367 SETERROR(REG_ESPACE);
1368 g->strip = p->strip;
1369 }
1370 }
1371
1372 /*
1373 * findmust - fill in must and mlen with longest mandatory literal string
1374 *
1375 * This algorithm could do fancy things like analyzing the operands of |
1376 * for common subsequences. Someday. This code is simple and finds most
1377 * of the interesting cases.
1378 *
1379 * Note that must and mlen got initialized during setup.
1380 */
1381 static void
1382 findmust(struct parse *p, struct re_guts *g)
1383 {
1384 sop *scan;
1385 sop *start = NULL;
1386 sop *newstart = NULL;
1387 sopno newlen;
1388 sop s;
1389 char *cp;
1390 int offset;
1391 char buf[MB_LEN_MAX];
1392 size_t clen;
1393 mbstate_t mbs;
1394 locale_t loc = uselocale(NULL);
1395
1396 /* avoid making error situations worse */
1397 if (p->error != 0)
1398 return;
1399
1400 /*
1401 * It's not generally safe to do a ``char'' substring search on
1402 * multibyte character strings, but it's safe for at least
1403 * UTF-8 (see RFC 3629).
1404 */
1405 if (MB_CUR_MAX > 1 &&
1406 strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1407 return;
1408
1409 /* find the longest OCHAR sequence in strip */
1410 newlen = 0;
1411 offset = 0;
1412 g->moffset = 0;
1413 scan = g->strip + 1;
1414 do {
1415 s = *scan++;
1416 switch (OP(s)) {
1417 case OCHAR: /* sequence member */
1418 if (newlen == 0) { /* new sequence */
1419 (void) memset(&mbs, 0, sizeof (mbs));
1420 newstart = scan - 1;
1421 }
1422 clen = wcrtomb(buf, OPND(s), &mbs);
1423 if (clen == (size_t)-1)
1424 goto toohard;
1425 newlen += clen;
1426 break;
1427 case OPLUS_: /* things that don't break one */
1428 case OLPAREN:
1429 case ORPAREN:
1430 break;
1431 case OQUEST_: /* things that must be skipped */
1432 case OCH_:
1433 offset = altoffset(scan, offset);
1434 scan--;
1435 do {
1436 scan += OPND(s);
1437 s = *scan;
1438 /* assert() interferes w debug printouts */
1439 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1440 OP(s) != OOR2) {
1441 g->iflags |= BAD;
1442 return;
1443 }
1444 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1445 /* FALLTHROUGH */
1446 case OBOW: /* things that break a sequence */
1447 case OEOW:
1448 case OBOL:
1449 case OEOL:
1450 case O_QUEST:
1451 case O_CH:
1452 case OEND:
1453 if (newlen > g->mlen) { /* ends one */
1454 start = newstart;
1455 g->mlen = newlen;
1456 if (offset > -1) {
1457 g->moffset += offset;
1458 offset = newlen;
1459 } else
1460 g->moffset = offset;
1461 } else {
1462 if (offset > -1)
1463 offset += newlen;
1464 }
1465 newlen = 0;
1466 break;
1467 case OANY:
1468 if (newlen > g->mlen) { /* ends one */
1469 start = newstart;
1470 g->mlen = newlen;
1471 if (offset > -1) {
1472 g->moffset += offset;
1473 offset = newlen;
1474 } else
1475 g->moffset = offset;
1476 } else {
1477 if (offset > -1)
1478 offset += newlen;
1479 }
1480 if (offset > -1)
1481 offset++;
1482 newlen = 0;
1483 break;
1484 case OANYOF: /* may or may not invalidate offset */
1485 /* First, everything as OANY */
1486 if (newlen > g->mlen) { /* ends one */
1487 start = newstart;
1488 g->mlen = newlen;
1489 if (offset > -1) {
1490 g->moffset += offset;
1491 offset = newlen;
1492 } else
1493 g->moffset = offset;
1494 } else {
1495 if (offset > -1)
1496 offset += newlen;
1497 }
1498 if (offset > -1)
1499 offset++;
1500 newlen = 0;
1501 break;
1502 toohard:
1503 default:
1504 /*
1505 * Anything here makes it impossible or too hard
1506 * to calculate the offset -- so we give up;
1507 * save the last known good offset, in case the
1508 * must sequence doesn't occur later.
1509 */
1510 if (newlen > g->mlen) { /* ends one */
1511 start = newstart;
1512 g->mlen = newlen;
1513 if (offset > -1)
1514 g->moffset += offset;
1515 else
1516 g->moffset = offset;
1517 }
1518 offset = -1;
1519 newlen = 0;
1520 break;
1521 }
1522 } while (OP(s) != OEND);
1523
1524 if (g->mlen == 0) { /* there isn't one */
1525 g->moffset = -1;
1526 return;
1527 }
1528
1529 /* turn it into a character string */
1530 g->must = malloc((size_t)g->mlen + 1);
1531 if (g->must == NULL) { /* argh; just forget it */
1532 g->mlen = 0;
1533 g->moffset = -1;
1534 return;
1535 }
1536 cp = g->must;
1537 scan = start;
1538 (void) memset(&mbs, 0, sizeof (mbs));
1539 while (cp < g->must + g->mlen) {
1540 while (OP(s = *scan++) != OCHAR)
1541 continue;
1542 clen = wcrtomb(cp, OPND(s), &mbs);
1543 assert(clen != (size_t)-1);
1544 cp += clen;
1545 }
1546 assert(cp == g->must + g->mlen);
1547 *cp++ = '\0'; /* just on general principles */
1548 }
1549
1550 /*
1551 * altoffset - choose biggest offset among multiple choices
1552 *
1553 * Compute, recursively if necessary, the largest offset among multiple
1554 * re paths.
1555 */
1556 static int
1557 altoffset(sop *scan, int offset)
1558 {
1559 int largest;
1560 int try;
1561 sop s;
1562
1563 /* If we gave up already on offsets, return */
1564 if (offset == -1)
1565 return (-1);
1566
1567 largest = 0;
1568 try = 0;
1569 s = *scan++;
1570 while (OP(s) != O_QUEST && OP(s) != O_CH) {
1571 switch (OP(s)) {
1572 case OOR1:
1573 if (try > largest)
1574 largest = try;
1575 try = 0;
1576 break;
1577 case OQUEST_:
1578 case OCH_:
1579 try = altoffset(scan, try);
1580 if (try == -1)
1581 return (-1);
1582 scan--;
1583 do {
1584 scan += OPND(s);
1585 s = *scan;
1586 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1587 OP(s) != OOR2)
1588 return (-1);
1589 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1590 /*
1591 * We must skip to the next position, or we'll
1592 * leave altoffset() too early.
1593 */
1594 scan++;
1595 break;
1596 case OANYOF:
1597 case OCHAR:
1598 case OANY:
1599 try++;
1600 /*FALLTHRU*/
1601 case OBOW:
1602 case OEOW:
1603 case OLPAREN:
1604 case ORPAREN:
1605 case OOR2:
1606 break;
1607 default:
1608 try = -1;
1609 break;
1610 }
1611 if (try == -1)
1612 return (-1);
1613 s = *scan++;
1614 }
1615
1616 if (try > largest)
1617 largest = try;
1618
1619 return (largest+offset);
1620 }
1621
1622 /*
1623 * computejumps - compute char jumps for BM scan
1624 *
1625 * This algorithm assumes g->must exists and is has size greater than
1626 * zero. It's based on the algorithm found on Computer Algorithms by
1627 * Sara Baase.
1628 *
1629 * A char jump is the number of characters one needs to jump based on
1630 * the value of the character from the text that was mismatched.
1631 */
1632 static void
1633 computejumps(struct parse *p, struct re_guts *g)
1634 {
1635 int ch;
1636 int mindex;
1637
1638 /* Avoid making errors worse */
1639 if (p->error != 0)
1640 return;
1641
1642 g->charjump = (int *)malloc((NC + 1) * sizeof (int));
1643 if (g->charjump == NULL) /* Not a fatal error */
1644 return;
1645 /* Adjust for signed chars, if necessary */
1646 g->charjump = &g->charjump[-(CHAR_MIN)];
1647
1648 /*
1649 * If the character does not exist in the pattern, the jump
1650 * is equal to the number of characters in the pattern.
1651 */
1652 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1653 g->charjump[ch] = g->mlen;
1654
1655 /*
1656 * If the character does exist, compute the jump that would
1657 * take us to the last character in the pattern equal to it
1658 * (notice that we match right to left, so that last character
1659 * is the first one that would be matched).
1660 */
1661 for (mindex = 0; mindex < g->mlen; mindex++)
1662 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1663 }
1664
1665 /*
1666 * computematchjumps - compute match jumps for BM scan
1667 *
1668 * This algorithm assumes g->must exists and is has size greater than
1669 * zero. It's based on the algorithm found on Computer Algorithms by
1670 * Sara Baase.
1671 *
1672 * A match jump is the number of characters one needs to advance based
1673 * on the already-matched suffix.
1674 * Notice that all values here are minus (g->mlen-1), because of the way
1675 * the search algorithm works.
1676 */
1677 static void
1678 computematchjumps(struct parse *p, struct re_guts *g)
1679 {
1680 int mindex; /* General "must" iterator */
1681 int suffix; /* Keeps track of matching suffix */
1682 int ssuffix; /* Keeps track of suffixes' suffix */
1683 int *pmatches;
1684 /*
1685 * pmatches[k] points to the next i
1686 * such that i+1...mlen is a substring
1687 * of k+1...k+mlen-i-1
1688 */
1689
1690 /* Avoid making errors worse */
1691 if (p->error != 0)
1692 return;
1693
1694 pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1695 if (pmatches == NULL) {
1696 g->matchjump = NULL;
1697 return;
1698 }
1699
1700 g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1701 if (g->matchjump == NULL) { /* Not a fatal error */
1702 free(pmatches);
1703 return;
1704 }
1705
1706 /* Set maximum possible jump for each character in the pattern */
1707 for (mindex = 0; mindex < g->mlen; mindex++)
1708 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1709
1710 /* Compute pmatches[] */
1711 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1712 mindex--, suffix--) {
1713 pmatches[mindex] = suffix;
1714
1715 /*
1716 * If a mismatch is found, interrupting the substring,
1717 * compute the matchjump for that position. If no
1718 * mismatch is found, then a text substring mismatched
1719 * against the suffix will also mismatch against the
1720 * substring.
1721 */
1722 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1723 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1724 g->mlen - mindex - 1);
1725 suffix = pmatches[suffix];
1726 }
1727 }
1728
1729 /*
1730 * Compute the matchjump up to the last substring found to jump
1731 * to the beginning of the largest must pattern prefix matching
1732 * it's own suffix.
1733 */
1734 for (mindex = 0; mindex <= suffix; mindex++)
1735 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1736 g->mlen + suffix - mindex);
1737
1738 ssuffix = pmatches[suffix];
1739 while (suffix < g->mlen) {
1740 while (suffix <= ssuffix && suffix < g->mlen) {
1741 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1742 g->mlen + ssuffix - suffix);
1743 suffix++;
1744 }
1745 if (suffix < g->mlen)
1746 ssuffix = pmatches[ssuffix];
1747 }
1748
1749 free(pmatches);
1750 }
1751
1752 /*
1753 * pluscount - count + nesting
1754 */
1755 static sopno /* nesting depth */
1756 pluscount(struct parse *p, struct re_guts *g)
1757 {
1758 sop *scan;
1759 sop s;
1760 sopno plusnest = 0;
1761 sopno maxnest = 0;
1762
1763 if (p->error != 0)
1764 return (0); /* there may not be an OEND */
1765
1766 scan = g->strip + 1;
1767 do {
1768 s = *scan++;
1769 switch (OP(s)) {
1770 case OPLUS_:
1771 plusnest++;
1772 break;
1773 case O_PLUS:
1774 if (plusnest > maxnest)
1775 maxnest = plusnest;
1776 plusnest--;
1777 break;
1778 }
1779 } while (OP(s) != OEND);
1780 if (plusnest != 0)
1781 g->iflags |= BAD;
1782 return (maxnest);
1783 }
|
1 /*
2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 tre_regcomp.c - TRE POSIX compatible regex compilation functions.
31 */
32
33 #include <string.h>
34 #include <errno.h>
35 #include <stdlib.h>
36
37 #include "tre.h"
38 #include "tre-internal.h"
39
40 int
41 tre_regncomp_l(regex_t *preg, const char *regex, size_t n, int cflags,
42 locale_t loc)
43 {
44 int ret;
45 tre_char_t *wregex;
46 size_t wlen;
47
48 wregex = malloc(sizeof(tre_char_t) * (n + 1));
49 if (wregex == NULL)
50 return REG_ESPACE;
51
52 /* If the current locale uses the standard single byte encoding of
53 characters, we don't do a multibyte string conversion. If we did,
54 many applications which use the default locale would break since
55 the default "C" locale uses the 7-bit ASCII character set, and
56 all characters with the eighth bit set would be considered invalid. */
57 if (TRE_MB_CUR_MAX_L(loc) == 1)
58 {
59 unsigned int i;
60 const unsigned char *str = (const unsigned char *)regex;
61 tre_char_t *wstr = wregex;
62
63 for (i = 0; i < n; i++)
64 *(wstr++) = *(str++);
65 wlen = n;
66 }
67 else
68 {
69 size_t consumed;
70 tre_char_t *wcptr = wregex;
71 mbstate_t state;
72 memset(&state, '\0', sizeof(state));
73 while (n > 0)
74 {
75 consumed = tre_mbrtowc_l(wcptr, regex, n, &state, loc);
76
77 switch (consumed)
78 {
79 case 0:
80 if (*regex == '\0')
81 consumed = 1;
82 else
83 {
84 free(wregex);
85 return REG_BADPAT;
86 }
87 break;
88 case (size_t)-1:
89 case (size_t)-2:
90 DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno)));
91 free(wregex);
92 return REG_ILLSEQ;
93 }
94 regex += consumed;
95 n -= consumed;
96 wcptr++;
97 }
98 wlen = wcptr - wregex;
99 }
100
101 wregex[wlen] = L'\0';
102 ret = tre_compile(preg, wregex, wlen, cflags, loc);
103 free(wregex);
104
105 return ret;
106 }
107
108 int
109 tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
110 {
111 return tre_regncomp_l(preg, regex, n, cflags, uselocale((locale_t)0));
112 }
113
114 int
115 tre_regcomp_l(regex_t *preg, const char *regex, int cflags, locale_t loc)
116 {
117 size_t len;
118
119 if (cflags & REG_PEND)
120 {
121 if ((const char *)(preg->re_endp) < regex)
122 return REG_INVARG;
123 len = (const char *)(preg->re_endp) - regex;
124 }
125 else
126 len = strlen(regex);
127 return tre_regncomp_l(preg, regex, len, cflags, loc);
128 }
129
130 int
131 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD regex,
132 int cflags)
133 {
134 return tre_regcomp_l(preg, regex, cflags, uselocale((locale_t)0));
135 }
136
137 void
138 regfree(regex_t *preg)
139 {
140 tre_free(preg);
141 }
|