Print this page
9083 replace regex implementation with tre
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libc/port/regex/regcomp.c
+++ new/usr/src/lib/libc/port/regex/regcomp.c
1 1 /*
2 - * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 - * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 - * Copyright 2012 Milan Jurik. All rights reserved.
5 - * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 - * Copyright (c) 1992, 1993, 1994
7 - * The Regents of the University of California. All rights reserved.
2 + * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
3 + * All rights reserved.
8 4 *
9 - * This code is derived from software contributed to Berkeley by
10 - * Henry Spencer.
11 - *
12 5 * Redistribution and use in source and binary forms, with or without
13 6 * modification, are permitted provided that the following conditions
14 7 * are met:
8 + *
15 9 * 1. Redistributions of source code must retain the above copyright
16 10 * notice, this list of conditions and the following disclaimer.
11 + *
17 12 * 2. Redistributions in binary form must reproduce the above copyright
18 13 * notice, this list of conditions and the following disclaimer in the
19 14 * documentation and/or other materials provided with the distribution.
20 - * 3. Neither the name of the University nor the names of its contributors
21 - * may be used to endorse or promote products derived from this software
22 - * without specific prior written permission.
23 15 *
24 - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 - * SUCH DAMAGE.
16 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
17 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 27 */
36 28
37 -#include "lint.h"
38 -#include "file64.h"
39 -#include <sys/types.h>
40 -#include <stdio.h>
29 +/*
30 + tre_regcomp.c - TRE POSIX compatible regex compilation functions.
31 +*/
32 +
41 33 #include <string.h>
42 -#include <ctype.h>
43 -#include <limits.h>
34 +#include <errno.h>
44 35 #include <stdlib.h>
45 -#include <regex.h>
46 -#include <wchar.h>
47 -#include <wctype.h>
48 36
49 -#include "../locale/runetype.h"
50 -#include "../locale/collate.h"
37 +#include "tre.h"
38 +#include "tre-internal.h"
51 39
52 -#include "utils.h"
53 -#include "regex2.h"
54 -
55 -#include "cname.h"
56 -#include "../locale/mblocal.h"
57 -
58 -/*
59 - * parse structure, passed up and down to avoid global variables and
60 - * other clumsinesses
61 - */
62 -struct parse {
63 - const char *next; /* next character in RE */
64 - const char *end; /* end of string (-> NUL normally) */
65 - int error; /* has an error been seen? */
66 - sop *strip; /* malloced strip */
67 - sopno ssize; /* malloced strip size (allocated) */
68 - sopno slen; /* malloced strip length (used) */
69 - int ncsalloc; /* number of csets allocated */
70 - struct re_guts *g;
71 -#define NPAREN 10 /* we need to remember () 1-9 for back refs */
72 - sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
73 - sopno pend[NPAREN]; /* -> ) ([0] unused) */
74 -};
75 -
76 -/* ========= begin header generated by ./mkh ========= */
77 -#ifdef __cplusplus
78 -extern "C" {
79 -#endif
80 -
81 -/* === regcomp.c === */
82 -static void p_ere(struct parse *p, int stop);
83 -static void p_ere_exp(struct parse *p);
84 -static void p_str(struct parse *p);
85 -static void p_bre(struct parse *p, int end1, int end2);
86 -static int p_simp_re(struct parse *p, int starordinary);
87 -static int p_count(struct parse *p);
88 -static void p_bracket(struct parse *p);
89 -static void p_b_term(struct parse *p, cset *cs);
90 -static void p_b_cclass(struct parse *p, cset *cs);
91 -static void p_b_eclass(struct parse *p, cset *cs);
92 -static wint_t p_b_symbol(struct parse *p);
93 -static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
94 -static wint_t othercase(wint_t ch);
95 -static void bothcases(struct parse *p, wint_t ch);
96 -static void ordinary(struct parse *p, wint_t ch);
97 -static void nonnewline(struct parse *p);
98 -static void repeat(struct parse *p, sopno start, int from, int to);
99 -static int seterr(struct parse *p, int e);
100 -static cset *allocset(struct parse *p);
101 -static void freeset(struct parse *p, cset *cs);
102 -static void CHadd(struct parse *p, cset *cs, wint_t ch);
103 -static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
104 -static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
105 -static wint_t singleton(cset *cs);
106 -static sopno dupl(struct parse *p, sopno start, sopno finish);
107 -static void doemit(struct parse *p, sop op, size_t opnd);
108 -static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
109 -static void dofwd(struct parse *p, sopno pos, sop value);
110 -static int enlarge(struct parse *p, sopno size);
111 -static void stripsnug(struct parse *p, struct re_guts *g);
112 -static void findmust(struct parse *p, struct re_guts *g);
113 -static int altoffset(sop *scan, int offset);
114 -static void computejumps(struct parse *p, struct re_guts *g);
115 -static void computematchjumps(struct parse *p, struct re_guts *g);
116 -static sopno pluscount(struct parse *p, struct re_guts *g);
117 -static wint_t wgetnext(struct parse *p);
118 -
119 -#ifdef __cplusplus
120 -}
121 -#endif
122 -/* ========= end header generated by ./mkh ========= */
123 -
124 -static char nuls[10]; /* place to point scanner in event of error */
125 -
126 -/*
127 - * macros for use with parse structure
128 - * BEWARE: these know that the parse structure is named `p' !!!
129 - */
130 -#define PEEK() (*p->next)
131 -#define PEEK2() (*(p->next+1))
132 -#define MORE() (p->next < p->end)
133 -#define MORE2() (p->next+1 < p->end)
134 -#define SEE(c) (MORE() && PEEK() == (c))
135 -#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
136 -#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
137 -#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
138 -#define NEXT() (p->next++)
139 -#define NEXT2() (p->next += 2)
140 -#define NEXTn(n) (p->next += (n))
141 -#define GETNEXT() (*p->next++)
142 -#define WGETNEXT() wgetnext(p)
143 -#define SETERROR(e) ((void)seterr(p, (e)))
144 -#define REQUIRE(co, e) ((co) || seterr(p, e))
145 -#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
146 -#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
147 -#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
148 -#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
149 -#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
150 -#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
151 -#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
152 -#define HERE() (p->slen)
153 -#define THERE() (p->slen - 1)
154 -#define THERETHERE() (p->slen - 2)
155 -#define DROP(n) (p->slen -= (n))
156 -
157 -#ifndef NDEBUG
158 -static int never = 0; /* for use in asserts; shuts lint up */
159 -#else
160 -#define never 0 /* some <assert.h>s have bugs too */
161 -#endif
162 -
163 -/*
164 - * regcomp - interface for parser and compilation
165 - */
166 -int /* 0 success, otherwise REG_something */
167 -regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
168 - int cflags)
40 +int
41 +tre_regncomp_l(regex_t *preg, const char *regex, size_t n, int cflags,
42 + locale_t loc)
169 43 {
170 - struct parse pa;
171 - struct re_guts *g;
172 - struct parse *p = &pa;
173 - int i;
174 - size_t len;
175 - size_t maxlen;
176 -#ifdef REDEBUG
177 -#define GOODFLAGS(f) (f)
178 -#else
179 -#define GOODFLAGS(f) ((f)&~REG_DUMP)
180 -#endif
44 + int ret;
45 + tre_char_t *wregex;
46 + size_t wlen;
181 47
182 - /* We had REG_INVARG, but we don't have that on Solaris. */
183 - cflags = GOODFLAGS(cflags);
184 - if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
185 - return (REG_EFATAL);
48 + wregex = malloc(sizeof(tre_char_t) * (n + 1));
49 + if (wregex == NULL)
50 + return REG_ESPACE;
186 51
187 - if (cflags®_PEND) {
188 - if (preg->re_endp < pattern)
189 - return (REG_EFATAL);
190 - len = preg->re_endp - pattern;
191 - } else
192 - len = strlen(pattern);
52 + /* If the current locale uses the standard single byte encoding of
53 + characters, we don't do a multibyte string conversion. If we did,
54 + many applications which use the default locale would break since
55 + the default "C" locale uses the 7-bit ASCII character set, and
56 + all characters with the eighth bit set would be considered invalid. */
57 + if (TRE_MB_CUR_MAX_L(loc) == 1)
58 + {
59 + unsigned int i;
60 + const unsigned char *str = (const unsigned char *)regex;
61 + tre_char_t *wstr = wregex;
193 62
194 - /* do the mallocs early so failure handling is easy */
195 - g = (struct re_guts *)malloc(sizeof (struct re_guts));
196 - if (g == NULL)
197 - return (REG_ESPACE);
198 - /*
199 - * Limit the pattern space to avoid a 32-bit overflow on buffer
200 - * extension. Also avoid any signed overflow in case of conversion
201 - * so make the real limit based on a 31-bit overflow.
202 - *
203 - * Likely not applicable on 64-bit systems but handle the case
204 - * generically (who are we to stop people from using ~715MB+
205 - * patterns?).
206 - */
207 - maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
208 - if (len >= maxlen) {
209 - free((char *)g);
210 - return (REG_ESPACE);
211 - }
212 - p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
213 - assert(p->ssize >= len);
63 + for (i = 0; i < n; i++)
64 + *(wstr++) = *(str++);
65 + wlen = n;
66 + }
67 + else
68 + {
69 + size_t consumed;
70 + tre_char_t *wcptr = wregex;
71 + mbstate_t state;
72 + memset(&state, '\0', sizeof(state));
73 + while (n > 0)
74 + {
75 + consumed = tre_mbrtowc_l(wcptr, regex, n, &state, loc);
214 76
215 - p->strip = (sop *)malloc(p->ssize * sizeof (sop));
216 - p->slen = 0;
217 - if (p->strip == NULL) {
218 - free((char *)g);
219 - return (REG_ESPACE);
220 - }
221 -
222 - /* set things up */
223 - p->g = g;
224 - p->next = pattern; /* convenience; we do not modify it */
225 - p->end = p->next + len;
226 - p->error = 0;
227 - p->ncsalloc = 0;
228 - for (i = 0; i < NPAREN; i++) {
229 - p->pbegin[i] = 0;
230 - p->pend[i] = 0;
231 - }
232 - g->sets = NULL;
233 - g->ncsets = 0;
234 - g->cflags = cflags;
235 - g->iflags = 0;
236 - g->nbol = 0;
237 - g->neol = 0;
238 - g->must = NULL;
239 - g->moffset = -1;
240 - g->charjump = NULL;
241 - g->matchjump = NULL;
242 - g->mlen = 0;
243 - g->nsub = 0;
244 - g->backrefs = 0;
245 -
246 - /* do it */
247 - EMIT(OEND, 0);
248 - g->firststate = THERE();
249 - if (cflags®_EXTENDED)
250 - p_ere(p, OUT);
251 - else if (cflags®_NOSPEC)
252 - p_str(p);
253 - else
254 - p_bre(p, OUT, OUT);
255 - EMIT(OEND, 0);
256 - g->laststate = THERE();
257 -
258 - /* tidy up loose ends and fill things in */
259 - stripsnug(p, g);
260 - findmust(p, g);
261 - /*
262 - * only use Boyer-Moore algorithm if the pattern is bigger
263 - * than three characters
264 - */
265 - if (g->mlen > 3) {
266 - computejumps(p, g);
267 - computematchjumps(p, g);
268 - if (g->matchjump == NULL && g->charjump != NULL) {
269 - free(g->charjump);
270 - g->charjump = NULL;
77 + switch (consumed)
78 + {
79 + case 0:
80 + if (*regex == '\0')
81 + consumed = 1;
82 + else
83 + {
84 + free(wregex);
85 + return REG_BADPAT;
271 86 }
87 + break;
88 + case (size_t)-1:
89 + case (size_t)-2:
90 + DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno)));
91 + free(wregex);
92 + return REG_ILLSEQ;
93 + }
94 + regex += consumed;
95 + n -= consumed;
96 + wcptr++;
272 97 }
273 - g->nplus = pluscount(p, g);
274 - g->magic = MAGIC2;
275 - preg->re_nsub = g->nsub;
276 - preg->re_g = g;
277 - preg->re_magic = MAGIC1;
278 -#ifndef REDEBUG
279 - /* not debugging, so can't rely on the assert() in regexec() */
280 - if (g->iflags&BAD)
281 - SETERROR(REG_EFATAL);
282 -#endif
98 + wlen = wcptr - wregex;
99 + }
283 100
284 - /* win or lose, we're done */
285 - if (p->error != 0) /* lose */
286 - regfree(preg);
287 - return (p->error);
288 -}
101 + wregex[wlen] = L'\0';
102 + ret = tre_compile(preg, wregex, wlen, cflags, loc);
103 + free(wregex);
289 104
290 -/*
291 - * p_ere - ERE parser top level, concatenation and alternation
292 - */
293 -static void
294 -p_ere(struct parse *p,
295 - int stop) /* character this ERE should end at */
296 -{
297 - char c;
298 - sopno prevback;
299 - sopno prevfwd;
300 - sopno conc;
301 - int first = 1; /* is this the first alternative? */
302 -
303 - for (;;) {
304 - /* do a bunch of concatenated expressions */
305 - conc = HERE();
306 - while (MORE() && (c = PEEK()) != '|' && c != stop)
307 - p_ere_exp(p);
308 - /* require nonempty */
309 - (void) REQUIRE(HERE() != conc, REG_BADPAT);
310 -
311 - if (!EAT('|'))
312 - break; /* NOTE BREAK OUT */
313 -
314 - if (first) {
315 - INSERT(OCH_, conc); /* offset is wrong */
316 - prevfwd = conc;
317 - prevback = conc;
318 - first = 0;
319 - }
320 - ASTERN(OOR1, prevback);
321 - prevback = THERE();
322 - AHEAD(prevfwd); /* fix previous offset */
323 - prevfwd = HERE();
324 - EMIT(OOR2, 0); /* offset is very wrong */
325 - }
326 -
327 - if (!first) { /* tail-end fixups */
328 - AHEAD(prevfwd);
329 - ASTERN(O_CH, prevback);
330 - }
331 -
332 - assert(!MORE() || SEE(stop));
105 + return ret;
333 106 }
334 107
335 -/*
336 - * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
337 - */
338 -static void
339 -p_ere_exp(struct parse *p)
108 +int
109 +tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
340 110 {
341 - char c;
342 - wint_t wc;
343 - sopno pos;
344 - int count;
345 - int count2;
346 - sopno subno;
347 - int wascaret = 0;
348 -
349 - assert(MORE()); /* caller should have ensured this */
350 - c = GETNEXT();
351 -
352 - pos = HERE();
353 - switch (c) {
354 - case '(':
355 - (void) REQUIRE(MORE(), REG_EPAREN);
356 - p->g->nsub++;
357 - subno = p->g->nsub;
358 - if (subno < NPAREN)
359 - p->pbegin[subno] = HERE();
360 - EMIT(OLPAREN, subno);
361 - if (!SEE(')'))
362 - p_ere(p, ')');
363 - if (subno < NPAREN) {
364 - p->pend[subno] = HERE();
365 - assert(p->pend[subno] != 0);
366 - }
367 - EMIT(ORPAREN, subno);
368 - (void) MUSTEAT(')', REG_EPAREN);
369 - break;
370 -#ifndef POSIX_MISTAKE
371 - case ')': /* happens only if no current unmatched ( */
372 - /*
373 - * You may ask, why the ifndef? Because I didn't notice
374 - * this until slightly too late for 1003.2, and none of the
375 - * other 1003.2 regular-expression reviewers noticed it at
376 - * all. So an unmatched ) is legal POSIX, at least until
377 - * we can get it fixed.
378 - */
379 - SETERROR(REG_EPAREN);
380 - break;
381 -#endif
382 - case '^':
383 - EMIT(OBOL, 0);
384 - p->g->iflags |= USEBOL;
385 - p->g->nbol++;
386 - wascaret = 1;
387 - break;
388 - case '$':
389 - EMIT(OEOL, 0);
390 - p->g->iflags |= USEEOL;
391 - p->g->neol++;
392 - break;
393 - case '|':
394 - SETERROR(REG_BADPAT);
395 - break;
396 - case '*':
397 - case '+':
398 - case '?':
399 - case '{':
400 - SETERROR(REG_BADRPT);
401 - break;
402 - case '.':
403 - if (p->g->cflags®_NEWLINE)
404 - nonnewline(p);
405 - else
406 - EMIT(OANY, 0);
407 - break;
408 - case '[':
409 - p_bracket(p);
410 - break;
411 - case '\\':
412 - (void) REQUIRE(MORE(), REG_EESCAPE);
413 - wc = WGETNEXT();
414 - switch (wc) {
415 - case '<':
416 - EMIT(OBOW, 0);
417 - break;
418 - case '>':
419 - EMIT(OEOW, 0);
420 - break;
421 - default:
422 - ordinary(p, wc);
423 - break;
424 - }
425 - break;
426 - default:
427 - if (p->error != 0)
428 - return;
429 - p->next--;
430 - wc = WGETNEXT();
431 - ordinary(p, wc);
432 - break;
433 - }
434 -
435 - if (!MORE())
436 - return;
437 - c = PEEK();
438 - /* we call { a repetition if followed by a digit */
439 - if (!(c == '*' || c == '+' || c == '?' || c == '{'))
440 - return; /* no repetition, we're done */
441 - else if (c == '{')
442 - (void) REQUIRE(MORE2() && \
443 - (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
444 - NEXT();
445 -
446 - (void) REQUIRE(!wascaret, REG_BADRPT);
447 - switch (c) {
448 - case '*': /* implemented as +? */
449 - /* this case does not require the (y|) trick, noKLUDGE */
450 - INSERT(OPLUS_, pos);
451 - ASTERN(O_PLUS, pos);
452 - INSERT(OQUEST_, pos);
453 - ASTERN(O_QUEST, pos);
454 - break;
455 - case '+':
456 - INSERT(OPLUS_, pos);
457 - ASTERN(O_PLUS, pos);
458 - break;
459 - case '?':
460 - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
461 - INSERT(OCH_, pos); /* offset slightly wrong */
462 - ASTERN(OOR1, pos); /* this one's right */
463 - AHEAD(pos); /* fix the OCH_ */
464 - EMIT(OOR2, 0); /* offset very wrong... */
465 - AHEAD(THERE()); /* ...so fix it */
466 - ASTERN(O_CH, THERETHERE());
467 - break;
468 - case '{':
469 - count = p_count(p);
470 - if (EAT(',')) {
471 - if (isdigit((uch)PEEK())) {
472 - count2 = p_count(p);
473 - (void) REQUIRE(count <= count2, REG_BADBR);
474 - } else /* single number with comma */
475 - count2 = INFINITY;
476 - } else /* just a single number */
477 - count2 = count;
478 - repeat(p, pos, count, count2);
479 - if (!EAT('}')) { /* error heuristics */
480 - while (MORE() && PEEK() != '}')
481 - NEXT();
482 - (void) REQUIRE(MORE(), REG_EBRACE);
483 - SETERROR(REG_BADBR);
484 - }
485 - break;
486 - }
487 -
488 - if (!MORE())
489 - return;
490 - c = PEEK();
491 - if (!(c == '*' || c == '+' || c == '?' ||
492 - (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
493 - return;
494 - SETERROR(REG_BADRPT);
111 + return tre_regncomp_l(preg, regex, n, cflags, uselocale((locale_t)0));
495 112 }
496 113
497 -/*
498 - * p_str - string (no metacharacters) "parser"
499 - */
500 -static void
501 -p_str(struct parse *p)
114 +int
115 +tre_regcomp_l(regex_t *preg, const char *regex, int cflags, locale_t loc)
502 116 {
503 - (void) REQUIRE(MORE(), REG_BADPAT);
504 - while (MORE())
505 - ordinary(p, WGETNEXT());
506 -}
117 + size_t len;
507 118
508 -/*
509 - * p_bre - BRE parser top level, anchoring and concatenation
510 - * Giving end1 as OUT essentially eliminates the end1/end2 check.
511 - *
512 - * This implementation is a bit of a kludge, in that a trailing $ is first
513 - * taken as an ordinary character and then revised to be an anchor.
514 - * The amount of lookahead needed to avoid this kludge is excessive.
515 - */
516 -static void
517 -p_bre(struct parse *p,
518 - int end1, /* first terminating character */
519 - int end2) /* second terminating character */
520 -{
521 - sopno start = HERE();
522 - int first = 1; /* first subexpression? */
523 - int wasdollar = 0;
524 -
525 - if (EAT('^')) {
526 - EMIT(OBOL, 0);
527 - p->g->iflags |= USEBOL;
528 - p->g->nbol++;
529 - }
530 - while (MORE() && !SEETWO(end1, end2)) {
531 - wasdollar = p_simp_re(p, first);
532 - first = 0;
533 - }
534 - if (wasdollar) { /* oops, that was a trailing anchor */
535 - DROP(1);
536 - EMIT(OEOL, 0);
537 - p->g->iflags |= USEEOL;
538 - p->g->neol++;
539 - }
540 -
541 - (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */
119 + if (cflags & REG_PEND)
120 + {
121 + if ((const char *)(preg->re_endp) < regex)
122 + return REG_INVARG;
123 + len = (const char *)(preg->re_endp) - regex;
124 + }
125 + else
126 + len = strlen(regex);
127 + return tre_regncomp_l(preg, regex, len, cflags, loc);
542 128 }
543 129
544 -/*
545 - * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
546 - */
547 -static int /* was the simple RE an unbackslashed $? */
548 -p_simp_re(struct parse *p,
549 - int starordinary) /* is a leading * an ordinary character? */
130 +int
131 +regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD regex,
132 + int cflags)
550 133 {
551 - int c;
552 - int count;
553 - int count2;
554 - sopno pos;
555 - int i;
556 - wint_t wc;
557 - sopno subno;
558 -#define BACKSL (1<<CHAR_BIT)
559 -
560 - pos = HERE(); /* repetition op, if any, covers from here */
561 -
562 - assert(MORE()); /* caller should have ensured this */
563 - c = GETNEXT();
564 - if (c == '\\') {
565 - (void) REQUIRE(MORE(), REG_EESCAPE);
566 - c = BACKSL | GETNEXT();
567 - }
568 - switch (c) {
569 - case '.':
570 - if (p->g->cflags®_NEWLINE)
571 - nonnewline(p);
572 - else
573 - EMIT(OANY, 0);
574 - break;
575 - case '[':
576 - p_bracket(p);
577 - break;
578 - case BACKSL|'<':
579 - EMIT(OBOW, 0);
580 - break;
581 - case BACKSL|'>':
582 - EMIT(OEOW, 0);
583 - break;
584 - case BACKSL|'{':
585 - SETERROR(REG_BADRPT);
586 - break;
587 - case BACKSL|'(':
588 - p->g->nsub++;
589 - subno = p->g->nsub;
590 - if (subno < NPAREN)
591 - p->pbegin[subno] = HERE();
592 - EMIT(OLPAREN, subno);
593 - /* the MORE here is an error heuristic */
594 - if (MORE() && !SEETWO('\\', ')'))
595 - p_bre(p, '\\', ')');
596 - if (subno < NPAREN) {
597 - p->pend[subno] = HERE();
598 - assert(p->pend[subno] != 0);
599 - }
600 - EMIT(ORPAREN, subno);
601 - (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
602 - break;
603 - case BACKSL|')': /* should not get here -- must be user */
604 - SETERROR(REG_EPAREN);
605 - break;
606 - case BACKSL|'1':
607 - case BACKSL|'2':
608 - case BACKSL|'3':
609 - case BACKSL|'4':
610 - case BACKSL|'5':
611 - case BACKSL|'6':
612 - case BACKSL|'7':
613 - case BACKSL|'8':
614 - case BACKSL|'9':
615 - i = (c&~BACKSL) - '0';
616 - assert(i < NPAREN);
617 - if (p->pend[i] != 0) {
618 - assert(i <= p->g->nsub);
619 - EMIT(OBACK_, i);
620 - assert(p->pbegin[i] != 0);
621 - assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
622 - assert(OP(p->strip[p->pend[i]]) == ORPAREN);
623 - (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
624 - EMIT(O_BACK, i);
625 - } else
626 - SETERROR(REG_ESUBREG);
627 - p->g->backrefs = 1;
628 - break;
629 - case '*':
630 - (void) REQUIRE(starordinary, REG_BADRPT);
631 - /* FALLTHROUGH */
632 - default:
633 - if (p->error != 0)
634 - return (0); /* Definitely not $... */
635 - p->next--;
636 - wc = WGETNEXT();
637 - ordinary(p, wc);
638 - break;
639 - }
640 -
641 - if (EAT('*')) { /* implemented as +? */
642 - /* this case does not require the (y|) trick, noKLUDGE */
643 - INSERT(OPLUS_, pos);
644 - ASTERN(O_PLUS, pos);
645 - INSERT(OQUEST_, pos);
646 - ASTERN(O_QUEST, pos);
647 - } else if (EATTWO('\\', '{')) {
648 - count = p_count(p);
649 - if (EAT(',')) {
650 - if (MORE() && isdigit((uch)PEEK())) {
651 - count2 = p_count(p);
652 - (void) REQUIRE(count <= count2, REG_BADBR);
653 - } else /* single number with comma */
654 - count2 = INFINITY;
655 - } else /* just a single number */
656 - count2 = count;
657 - repeat(p, pos, count, count2);
658 - if (!EATTWO('\\', '}')) { /* error heuristics */
659 - while (MORE() && !SEETWO('\\', '}'))
660 - NEXT();
661 - (void) REQUIRE(MORE(), REG_EBRACE);
662 - SETERROR(REG_BADBR);
663 - }
664 - } else if (c == '$') /* $ (but not \$) ends it */
665 - return (1);
666 -
667 - return (0);
134 + return tre_regcomp_l(preg, regex, cflags, uselocale((locale_t)0));
668 135 }
669 136
670 -/*
671 - * p_count - parse a repetition count
672 - */
673 -static int /* the value */
674 -p_count(struct parse *p)
137 +void
138 +regfree(regex_t *preg)
675 139 {
676 - int count = 0;
677 - int ndigits = 0;
678 -
679 - while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
680 - count = count*10 + (GETNEXT() - '0');
681 - ndigits++;
682 - }
683 -
684 - (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
685 - return (count);
686 -}
687 -
688 -/*
689 - * p_bracket - parse a bracketed character list
690 - */
691 -static void
692 -p_bracket(struct parse *p)
693 -{
694 - cset *cs;
695 - wint_t ch;
696 -
697 - /* Dept of Truly Sickening Special-Case Kludges */
698 - if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
699 - EMIT(OBOW, 0);
700 - NEXTn(6);
701 - return;
702 - }
703 - if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
704 - EMIT(OEOW, 0);
705 - NEXTn(6);
706 - return;
707 - }
708 -
709 - if ((cs = allocset(p)) == NULL)
710 - return;
711 -
712 - if (p->g->cflags®_ICASE)
713 - cs->icase = 1;
714 - if (EAT('^'))
715 - cs->invert = 1;
716 - if (EAT(']'))
717 - CHadd(p, cs, ']');
718 - else if (EAT('-'))
719 - CHadd(p, cs, '-');
720 - while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
721 - p_b_term(p, cs);
722 - if (EAT('-'))
723 - CHadd(p, cs, '-');
724 - (void) MUSTEAT(']', REG_EBRACK);
725 -
726 - if (p->error != 0) /* don't mess things up further */
727 - return;
728 -
729 - if (cs->invert && p->g->cflags®_NEWLINE)
730 - cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
731 -
732 - if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
733 - ordinary(p, ch);
734 - freeset(p, cs);
735 - } else
736 - EMIT(OANYOF, (int)(cs - p->g->sets));
737 -}
738 -
739 -/*
740 - * p_b_term - parse one term of a bracketed character list
741 - */
742 -static void
743 -p_b_term(struct parse *p, cset *cs)
744 -{
745 - char c;
746 - wint_t start, finish;
747 - wint_t i;
748 - locale_t loc = uselocale(NULL);
749 -
750 - /* classify what we've got */
751 - switch ((MORE()) ? PEEK() : '\0') {
752 - case '[':
753 - c = (MORE2()) ? PEEK2() : '\0';
754 - break;
755 - case '-':
756 - SETERROR(REG_ERANGE);
757 - return; /* NOTE RETURN */
758 - default:
759 - c = '\0';
760 - break;
761 - }
762 -
763 - switch (c) {
764 - case ':': /* character class */
765 - NEXT2();
766 - (void) REQUIRE(MORE(), REG_EBRACK);
767 - c = PEEK();
768 - (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
769 - p_b_cclass(p, cs);
770 - (void) REQUIRE(MORE(), REG_EBRACK);
771 - (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
772 - break;
773 - case '=': /* equivalence class */
774 - NEXT2();
775 - (void) REQUIRE(MORE(), REG_EBRACK);
776 - c = PEEK();
777 - (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
778 - p_b_eclass(p, cs);
779 - (void) REQUIRE(MORE(), REG_EBRACK);
780 - (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
781 - break;
782 - default: /* symbol, ordinary character, or range */
783 - start = p_b_symbol(p);
784 - if (SEE('-') && MORE2() && PEEK2() != ']') {
785 - /* range */
786 - NEXT();
787 - if (EAT('-'))
788 - finish = '-';
789 - else
790 - finish = p_b_symbol(p);
791 - } else
792 - finish = start;
793 - if (start == finish)
794 - CHadd(p, cs, start);
795 - else {
796 - if (loc->collate->lc_is_posix) {
797 - (void) REQUIRE((uch)start <= (uch)finish,
798 - REG_ERANGE);
799 - CHaddrange(p, cs, start, finish);
800 - } else {
801 - (void) REQUIRE(_collate_range_cmp(start,
802 - finish, loc) <= 0, REG_ERANGE);
803 - for (i = 0; i <= UCHAR_MAX; i++) {
804 - if (_collate_range_cmp(start, i, loc)
805 - <= 0 &&
806 - _collate_range_cmp(i, finish, loc)
807 - <= 0)
808 - CHadd(p, cs, i);
809 - }
810 - }
811 - }
812 - break;
813 - }
814 -}
815 -
816 -/*
817 - * p_b_cclass - parse a character-class name and deal with it
818 - */
819 -static void
820 -p_b_cclass(struct parse *p, cset *cs)
821 -{
822 - const char *sp = p->next;
823 - size_t len;
824 - wctype_t wct;
825 - char clname[16];
826 -
827 - while (MORE() && isalpha((uch)PEEK()))
828 - NEXT();
829 - len = p->next - sp;
830 - if (len >= sizeof (clname) - 1) {
831 - SETERROR(REG_ECTYPE);
832 - return;
833 - }
834 - (void) memcpy(clname, sp, len);
835 - clname[len] = '\0';
836 - if ((wct = wctype(clname)) == 0) {
837 - SETERROR(REG_ECTYPE);
838 - return;
839 - }
840 - CHaddtype(p, cs, wct);
841 -}
842 -
843 -/*
844 - * p_b_eclass - parse an equivalence-class name and deal with it
845 - *
846 - * This implementation is incomplete. xxx
847 - */
848 -static void
849 -p_b_eclass(struct parse *p, cset *cs)
850 -{
851 - wint_t c;
852 -
853 - c = p_b_coll_elem(p, '=');
854 - CHadd(p, cs, c);
855 -}
856 -
857 -/*
858 - * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
859 - */
860 -static wint_t /* value of symbol */
861 -p_b_symbol(struct parse *p)
862 -{
863 - wint_t value;
864 -
865 - (void) REQUIRE(MORE(), REG_EBRACK);
866 - if (!EATTWO('[', '.'))
867 - return (WGETNEXT());
868 -
869 - /* collating symbol */
870 - value = p_b_coll_elem(p, '.');
871 - (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
872 - return (value);
873 -}
874 -
875 -/*
876 - * p_b_coll_elem - parse a collating-element name and look it up
877 - */
878 -static wint_t /* value of collating element */
879 -p_b_coll_elem(struct parse *p,
880 - wint_t endc) /* name ended by endc,']' */
881 -{
882 - const char *sp = p->next;
883 - struct cname *cp;
884 - mbstate_t mbs;
885 - wchar_t wc;
886 - size_t clen, len;
887 -
888 - while (MORE() && !SEETWO(endc, ']'))
889 - NEXT();
890 - if (!MORE()) {
891 - SETERROR(REG_EBRACK);
892 - return (0);
893 - }
894 - len = p->next - sp;
895 - for (cp = cnames; cp->name != NULL; cp++)
896 - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
897 - return (cp->code); /* known name */
898 - (void) memset(&mbs, 0, sizeof (mbs));
899 - if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
900 - return (wc); /* single character */
901 - else if (clen == (size_t)-1 || clen == (size_t)-2)
902 - SETERROR(REG_ECHAR);
903 - else
904 - SETERROR(REG_ECOLLATE); /* neither */
905 - return (0);
906 -}
907 -
908 -/*
909 - * othercase - return the case counterpart of an alphabetic
910 - */
911 -static wint_t /* if no counterpart, return ch */
912 -othercase(wint_t ch)
913 -{
914 - assert(iswalpha(ch));
915 - if (iswupper(ch))
916 - return (towlower(ch));
917 - else if (iswlower(ch))
918 - return (towupper(ch));
919 - else /* peculiar, but could happen */
920 - return (ch);
921 -}
922 -
923 -/*
924 - * bothcases - emit a dualcase version of a two-case character
925 - *
926 - * Boy, is this implementation ever a kludge...
927 - */
928 -static void
929 -bothcases(struct parse *p, wint_t ch)
930 -{
931 - const char *oldnext = p->next;
932 - const char *oldend = p->end;
933 - char bracket[3 + MB_LEN_MAX];
934 - size_t n;
935 - mbstate_t mbs;
936 -
937 - assert(othercase(ch) != ch); /* p_bracket() would recurse */
938 - p->next = bracket;
939 - (void) memset(&mbs, 0, sizeof (mbs));
940 - n = wcrtomb(bracket, ch, &mbs);
941 - assert(n != (size_t)-1);
942 - bracket[n] = ']';
943 - bracket[n + 1] = '\0';
944 - p->end = bracket+n+1;
945 - p_bracket(p);
946 - assert(p->next == p->end);
947 - p->next = oldnext;
948 - p->end = oldend;
949 -}
950 -
951 -/*
952 - * ordinary - emit an ordinary character
953 - */
954 -static void
955 -ordinary(struct parse *p, wint_t ch)
956 -{
957 - cset *cs;
958 -
959 - if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
960 - bothcases(p, ch);
961 - else if ((ch & OPDMASK) == ch)
962 - EMIT(OCHAR, ch);
963 - else {
964 - /*
965 - * Kludge: character is too big to fit into an OCHAR operand.
966 - * Emit a singleton set.
967 - */
968 - if ((cs = allocset(p)) == NULL)
969 - return;
970 - CHadd(p, cs, ch);
971 - EMIT(OANYOF, (int)(cs - p->g->sets));
972 - }
973 -}
974 -
975 -/*
976 - * nonnewline - emit REG_NEWLINE version of OANY
977 - *
978 - * Boy, is this implementation ever a kludge...
979 - */
980 -static void
981 -nonnewline(struct parse *p)
982 -{
983 - const char *oldnext = p->next;
984 - const char *oldend = p->end;
985 - char bracket[4];
986 -
987 - p->next = bracket;
988 - p->end = bracket+3;
989 - bracket[0] = '^';
990 - bracket[1] = '\n';
991 - bracket[2] = ']';
992 - bracket[3] = '\0';
993 - p_bracket(p);
994 - assert(p->next == bracket+3);
995 - p->next = oldnext;
996 - p->end = oldend;
997 -}
998 -
999 -/*
1000 - * repeat - generate code for a bounded repetition, recursively if needed
1001 - */
1002 -static void
1003 -repeat(struct parse *p,
1004 - sopno start, /* operand from here to end of strip */
1005 - int from, /* repeated from this number */
1006 - int to) /* to this number of times (maybe INFINITY) */
1007 -{
1008 - sopno finish = HERE();
1009 -#define N 2
1010 -#define INF 3
1011 -#define REP(f, t) ((f)*8 + (t))
1012 -#define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1013 - sopno copy;
1014 -
1015 - if (p->error != 0) /* head off possible runaway recursion */
1016 - return;
1017 -
1018 - assert(from <= to);
1019 -
1020 - switch (REP(MAP(from), MAP(to))) {
1021 - case REP(0, 0): /* must be user doing this */
1022 - DROP(finish-start); /* drop the operand */
1023 - break;
1024 - case REP(0, 1): /* as x{1,1}? */
1025 - case REP(0, N): /* as x{1,n}? */
1026 - case REP(0, INF): /* as x{1,}? */
1027 - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1028 - INSERT(OCH_, start); /* offset is wrong... */
1029 - repeat(p, start+1, 1, to);
1030 - ASTERN(OOR1, start);
1031 - AHEAD(start); /* ... fix it */
1032 - EMIT(OOR2, 0);
1033 - AHEAD(THERE());
1034 - ASTERN(O_CH, THERETHERE());
1035 - break;
1036 - case REP(1, 1): /* trivial case */
1037 - /* done */
1038 - break;
1039 - case REP(1, N): /* as x?x{1,n-1} */
1040 - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1041 - INSERT(OCH_, start);
1042 - ASTERN(OOR1, start);
1043 - AHEAD(start);
1044 - EMIT(OOR2, 0); /* offset very wrong... */
1045 - AHEAD(THERE()); /* ...so fix it */
1046 - ASTERN(O_CH, THERETHERE());
1047 - copy = dupl(p, start+1, finish+1);
1048 - assert(copy == finish+4);
1049 - repeat(p, copy, 1, to-1);
1050 - break;
1051 - case REP(1, INF): /* as x+ */
1052 - INSERT(OPLUS_, start);
1053 - ASTERN(O_PLUS, start);
1054 - break;
1055 - case REP(N, N): /* as xx{m-1,n-1} */
1056 - copy = dupl(p, start, finish);
1057 - repeat(p, copy, from-1, to-1);
1058 - break;
1059 - case REP(N, INF): /* as xx{n-1,INF} */
1060 - copy = dupl(p, start, finish);
1061 - repeat(p, copy, from-1, to);
1062 - break;
1063 - default: /* "can't happen" */
1064 - SETERROR(REG_EFATAL); /* just in case */
1065 - break;
1066 - }
1067 -}
1068 -
1069 -/*
1070 - * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1071 - * character from the parse struct, signals a REG_ILLSEQ error if the
1072 - * character can't be converted. Returns the number of bytes consumed.
1073 - */
1074 -static wint_t
1075 -wgetnext(struct parse *p)
1076 -{
1077 - mbstate_t mbs;
1078 - wchar_t wc;
1079 - size_t n;
1080 -
1081 - (void) memset(&mbs, 0, sizeof (mbs));
1082 - n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1083 - if (n == (size_t)-1 || n == (size_t)-2) {
1084 - SETERROR(REG_ECHAR);
1085 - return (0);
1086 - }
1087 - if (n == 0)
1088 - n = 1;
1089 - p->next += n;
1090 - return (wc);
1091 -}
1092 -
1093 -/*
1094 - * seterr - set an error condition
1095 - */
1096 -static int /* useless but makes type checking happy */
1097 -seterr(struct parse *p, int e)
1098 -{
1099 - if (p->error == 0) /* keep earliest error condition */
1100 - p->error = e;
1101 - p->next = nuls; /* try to bring things to a halt */
1102 - p->end = nuls;
1103 - return (0); /* make the return value well-defined */
1104 -}
1105 -
1106 -/*
1107 - * allocset - allocate a set of characters for []
1108 - */
1109 -static cset *
1110 -allocset(struct parse *p)
1111 -{
1112 - cset *cs, *ncs;
1113 -
1114 - ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1115 - if (ncs == NULL) {
1116 - SETERROR(REG_ESPACE);
1117 - return (NULL);
1118 - }
1119 - p->g->sets = ncs;
1120 - cs = &p->g->sets[p->g->ncsets++];
1121 - (void) memset(cs, 0, sizeof (*cs));
1122 -
1123 - return (cs);
1124 -}
1125 -
1126 -/*
1127 - * freeset - free a now-unused set
1128 - */
1129 -static void
1130 -freeset(struct parse *p, cset *cs)
1131 -{
1132 - cset *top = &p->g->sets[p->g->ncsets];
1133 -
1134 - free(cs->wides);
1135 - free(cs->ranges);
1136 - free(cs->types);
1137 - (void) memset(cs, 0, sizeof (*cs));
1138 - if (cs == top-1) /* recover only the easy case */
1139 - p->g->ncsets--;
1140 -}
1141 -
1142 -/*
1143 - * singleton - Determine whether a set contains only one character,
1144 - * returning it if so, otherwise returning OUT.
1145 - */
1146 -static wint_t
1147 -singleton(cset *cs)
1148 -{
1149 - wint_t i, s, n;
1150 -
1151 - for (i = n = 0; i < NC; i++)
1152 - if (CHIN(cs, i)) {
1153 - n++;
1154 - s = i;
1155 - }
1156 - if (n == 1)
1157 - return (s);
1158 - if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1159 - cs->icase == 0)
1160 - return (cs->wides[0]);
1161 - /* Don't bother handling the other cases. */
1162 - return (OUT);
1163 -}
1164 -
1165 -/*
1166 - * CHadd - add character to character set.
1167 - */
1168 -static void
1169 -CHadd(struct parse *p, cset *cs, wint_t ch)
1170 -{
1171 - wint_t nch, *newwides;
1172 - assert(ch >= 0);
1173 - if (ch < NC)
1174 - cs->bmp[ch >> 3] |= 1 << (ch & 7);
1175 - else {
1176 - newwides = realloc(cs->wides, (cs->nwides + 1) *
1177 - sizeof (*cs->wides));
1178 - if (newwides == NULL) {
1179 - SETERROR(REG_ESPACE);
1180 - return;
1181 - }
1182 - cs->wides = newwides;
1183 - cs->wides[cs->nwides++] = ch;
1184 - }
1185 - if (cs->icase) {
1186 - if ((nch = towlower(ch)) < NC)
1187 - cs->bmp[nch >> 3] |= 1 << (nch & 7);
1188 - if ((nch = towupper(ch)) < NC)
1189 - cs->bmp[nch >> 3] |= 1 << (nch & 7);
1190 - }
1191 -}
1192 -
1193 -/*
1194 - * CHaddrange - add all characters in the range [min,max] to a character set.
1195 - */
1196 -static void
1197 -CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1198 -{
1199 - crange *newranges;
1200 -
1201 - for (; min < NC && min <= max; min++)
1202 - CHadd(p, cs, min);
1203 - if (min >= max)
1204 - return;
1205 - newranges = realloc(cs->ranges, (cs->nranges + 1) *
1206 - sizeof (*cs->ranges));
1207 - if (newranges == NULL) {
1208 - SETERROR(REG_ESPACE);
1209 - return;
1210 - }
1211 - cs->ranges = newranges;
1212 - cs->ranges[cs->nranges].min = min;
1213 - cs->ranges[cs->nranges].max = max;
1214 - cs->nranges++;
1215 -}
1216 -
1217 -/*
1218 - * CHaddtype - add all characters of a certain type to a character set.
1219 - */
1220 -static void
1221 -CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1222 -{
1223 - wint_t i;
1224 - wctype_t *newtypes;
1225 -
1226 - for (i = 0; i < NC; i++)
1227 - if (iswctype(i, wct))
1228 - CHadd(p, cs, i);
1229 - newtypes = realloc(cs->types, (cs->ntypes + 1) *
1230 - sizeof (*cs->types));
1231 - if (newtypes == NULL) {
1232 - SETERROR(REG_ESPACE);
1233 - return;
1234 - }
1235 - cs->types = newtypes;
1236 - cs->types[cs->ntypes++] = wct;
1237 -}
1238 -
1239 -/*
1240 - * dupl - emit a duplicate of a bunch of sops
1241 - */
1242 -static sopno /* start of duplicate */
1243 -dupl(struct parse *p,
1244 - sopno start, /* from here */
1245 - sopno finish) /* to this less one */
1246 -{
1247 - sopno ret = HERE();
1248 - sopno len = finish - start;
1249 -
1250 - assert(finish >= start);
1251 - if (len == 0)
1252 - return (ret);
1253 - if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1254 - return (ret);
1255 - assert(p->ssize >= p->slen + len);
1256 - (void) memcpy((char *)(p->strip + p->slen),
1257 - (char *)(p->strip + start), (size_t)len*sizeof (sop));
1258 - p->slen += len;
1259 - return (ret);
1260 -}
1261 -
1262 -/*
1263 - * doemit - emit a strip operator
1264 - *
1265 - * It might seem better to implement this as a macro with a function as
1266 - * hard-case backup, but it's just too big and messy unless there are
1267 - * some changes to the data structures. Maybe later.
1268 - */
1269 -static void
1270 -doemit(struct parse *p, sop op, size_t opnd)
1271 -{
1272 - /* avoid making error situations worse */
1273 - if (p->error != 0)
1274 - return;
1275 -
1276 - /* deal with oversize operands ("can't happen", more or less) */
1277 - assert(opnd < 1<<OPSHIFT);
1278 -
1279 - /* deal with undersized strip */
1280 - if (p->slen >= p->ssize)
1281 - if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1282 - return;
1283 -
1284 - /* finally, it's all reduced to the easy case */
1285 - p->strip[p->slen++] = SOP(op, opnd);
1286 -}
1287 -
1288 -/*
1289 - * doinsert - insert a sop into the strip
1290 - */
1291 -static void
1292 -doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1293 -{
1294 - sopno sn;
1295 - sop s;
1296 - int i;
1297 -
1298 - /* avoid making error situations worse */
1299 - if (p->error != 0)
1300 - return;
1301 -
1302 - sn = HERE();
1303 - EMIT(op, opnd); /* do checks, ensure space */
1304 - assert(HERE() == sn+1);
1305 - s = p->strip[sn];
1306 -
1307 - /* adjust paren pointers */
1308 - assert(pos > 0);
1309 - for (i = 1; i < NPAREN; i++) {
1310 - if (p->pbegin[i] >= pos) {
1311 - p->pbegin[i]++;
1312 - }
1313 - if (p->pend[i] >= pos) {
1314 - p->pend[i]++;
1315 - }
1316 - }
1317 -
1318 - (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1319 - (HERE()-pos-1)*sizeof (sop));
1320 - p->strip[pos] = s;
1321 -}
1322 -
1323 -/*
1324 - * dofwd - complete a forward reference
1325 - */
1326 -static void
1327 -dofwd(struct parse *p, sopno pos, sop value)
1328 -{
1329 - /* avoid making error situations worse */
1330 - if (p->error != 0)
1331 - return;
1332 -
1333 - assert(value < 1<<OPSHIFT);
1334 - p->strip[pos] = OP(p->strip[pos]) | value;
1335 -}
1336 -
1337 -/*
1338 - * enlarge - enlarge the strip
1339 - */
1340 -static int
1341 -enlarge(struct parse *p, sopno size)
1342 -{
1343 - sop *sp;
1344 -
1345 - if (p->ssize >= size)
1346 - return (1);
1347 -
1348 - sp = (sop *)realloc(p->strip, size*sizeof (sop));
1349 - if (sp == NULL) {
1350 - SETERROR(REG_ESPACE);
1351 - return (0);
1352 - }
1353 - p->strip = sp;
1354 - p->ssize = size;
1355 - return (1);
1356 -}
1357 -
1358 -/*
1359 - * stripsnug - compact the strip
1360 - */
1361 -static void
1362 -stripsnug(struct parse *p, struct re_guts *g)
1363 -{
1364 - g->nstates = p->slen;
1365 - g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1366 - if (g->strip == NULL) {
1367 - SETERROR(REG_ESPACE);
1368 - g->strip = p->strip;
1369 - }
1370 -}
1371 -
1372 -/*
1373 - * findmust - fill in must and mlen with longest mandatory literal string
1374 - *
1375 - * This algorithm could do fancy things like analyzing the operands of |
1376 - * for common subsequences. Someday. This code is simple and finds most
1377 - * of the interesting cases.
1378 - *
1379 - * Note that must and mlen got initialized during setup.
1380 - */
1381 -static void
1382 -findmust(struct parse *p, struct re_guts *g)
1383 -{
1384 - sop *scan;
1385 - sop *start = NULL;
1386 - sop *newstart = NULL;
1387 - sopno newlen;
1388 - sop s;
1389 - char *cp;
1390 - int offset;
1391 - char buf[MB_LEN_MAX];
1392 - size_t clen;
1393 - mbstate_t mbs;
1394 - locale_t loc = uselocale(NULL);
1395 -
1396 - /* avoid making error situations worse */
1397 - if (p->error != 0)
1398 - return;
1399 -
1400 - /*
1401 - * It's not generally safe to do a ``char'' substring search on
1402 - * multibyte character strings, but it's safe for at least
1403 - * UTF-8 (see RFC 3629).
1404 - */
1405 - if (MB_CUR_MAX > 1 &&
1406 - strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1407 - return;
1408 -
1409 - /* find the longest OCHAR sequence in strip */
1410 - newlen = 0;
1411 - offset = 0;
1412 - g->moffset = 0;
1413 - scan = g->strip + 1;
1414 - do {
1415 - s = *scan++;
1416 - switch (OP(s)) {
1417 - case OCHAR: /* sequence member */
1418 - if (newlen == 0) { /* new sequence */
1419 - (void) memset(&mbs, 0, sizeof (mbs));
1420 - newstart = scan - 1;
1421 - }
1422 - clen = wcrtomb(buf, OPND(s), &mbs);
1423 - if (clen == (size_t)-1)
1424 - goto toohard;
1425 - newlen += clen;
1426 - break;
1427 - case OPLUS_: /* things that don't break one */
1428 - case OLPAREN:
1429 - case ORPAREN:
1430 - break;
1431 - case OQUEST_: /* things that must be skipped */
1432 - case OCH_:
1433 - offset = altoffset(scan, offset);
1434 - scan--;
1435 - do {
1436 - scan += OPND(s);
1437 - s = *scan;
1438 - /* assert() interferes w debug printouts */
1439 - if (OP(s) != O_QUEST && OP(s) != O_CH &&
1440 - OP(s) != OOR2) {
1441 - g->iflags |= BAD;
1442 - return;
1443 - }
1444 - } while (OP(s) != O_QUEST && OP(s) != O_CH);
1445 - /* FALLTHROUGH */
1446 - case OBOW: /* things that break a sequence */
1447 - case OEOW:
1448 - case OBOL:
1449 - case OEOL:
1450 - case O_QUEST:
1451 - case O_CH:
1452 - case OEND:
1453 - if (newlen > g->mlen) { /* ends one */
1454 - start = newstart;
1455 - g->mlen = newlen;
1456 - if (offset > -1) {
1457 - g->moffset += offset;
1458 - offset = newlen;
1459 - } else
1460 - g->moffset = offset;
1461 - } else {
1462 - if (offset > -1)
1463 - offset += newlen;
1464 - }
1465 - newlen = 0;
1466 - break;
1467 - case OANY:
1468 - if (newlen > g->mlen) { /* ends one */
1469 - start = newstart;
1470 - g->mlen = newlen;
1471 - if (offset > -1) {
1472 - g->moffset += offset;
1473 - offset = newlen;
1474 - } else
1475 - g->moffset = offset;
1476 - } else {
1477 - if (offset > -1)
1478 - offset += newlen;
1479 - }
1480 - if (offset > -1)
1481 - offset++;
1482 - newlen = 0;
1483 - break;
1484 - case OANYOF: /* may or may not invalidate offset */
1485 - /* First, everything as OANY */
1486 - if (newlen > g->mlen) { /* ends one */
1487 - start = newstart;
1488 - g->mlen = newlen;
1489 - if (offset > -1) {
1490 - g->moffset += offset;
1491 - offset = newlen;
1492 - } else
1493 - g->moffset = offset;
1494 - } else {
1495 - if (offset > -1)
1496 - offset += newlen;
1497 - }
1498 - if (offset > -1)
1499 - offset++;
1500 - newlen = 0;
1501 - break;
1502 - toohard:
1503 - default:
1504 - /*
1505 - * Anything here makes it impossible or too hard
1506 - * to calculate the offset -- so we give up;
1507 - * save the last known good offset, in case the
1508 - * must sequence doesn't occur later.
1509 - */
1510 - if (newlen > g->mlen) { /* ends one */
1511 - start = newstart;
1512 - g->mlen = newlen;
1513 - if (offset > -1)
1514 - g->moffset += offset;
1515 - else
1516 - g->moffset = offset;
1517 - }
1518 - offset = -1;
1519 - newlen = 0;
1520 - break;
1521 - }
1522 - } while (OP(s) != OEND);
1523 -
1524 - if (g->mlen == 0) { /* there isn't one */
1525 - g->moffset = -1;
1526 - return;
1527 - }
1528 -
1529 - /* turn it into a character string */
1530 - g->must = malloc((size_t)g->mlen + 1);
1531 - if (g->must == NULL) { /* argh; just forget it */
1532 - g->mlen = 0;
1533 - g->moffset = -1;
1534 - return;
1535 - }
1536 - cp = g->must;
1537 - scan = start;
1538 - (void) memset(&mbs, 0, sizeof (mbs));
1539 - while (cp < g->must + g->mlen) {
1540 - while (OP(s = *scan++) != OCHAR)
1541 - continue;
1542 - clen = wcrtomb(cp, OPND(s), &mbs);
1543 - assert(clen != (size_t)-1);
1544 - cp += clen;
1545 - }
1546 - assert(cp == g->must + g->mlen);
1547 - *cp++ = '\0'; /* just on general principles */
1548 -}
1549 -
1550 -/*
1551 - * altoffset - choose biggest offset among multiple choices
1552 - *
1553 - * Compute, recursively if necessary, the largest offset among multiple
1554 - * re paths.
1555 - */
1556 -static int
1557 -altoffset(sop *scan, int offset)
1558 -{
1559 - int largest;
1560 - int try;
1561 - sop s;
1562 -
1563 - /* If we gave up already on offsets, return */
1564 - if (offset == -1)
1565 - return (-1);
1566 -
1567 - largest = 0;
1568 - try = 0;
1569 - s = *scan++;
1570 - while (OP(s) != O_QUEST && OP(s) != O_CH) {
1571 - switch (OP(s)) {
1572 - case OOR1:
1573 - if (try > largest)
1574 - largest = try;
1575 - try = 0;
1576 - break;
1577 - case OQUEST_:
1578 - case OCH_:
1579 - try = altoffset(scan, try);
1580 - if (try == -1)
1581 - return (-1);
1582 - scan--;
1583 - do {
1584 - scan += OPND(s);
1585 - s = *scan;
1586 - if (OP(s) != O_QUEST && OP(s) != O_CH &&
1587 - OP(s) != OOR2)
1588 - return (-1);
1589 - } while (OP(s) != O_QUEST && OP(s) != O_CH);
1590 - /*
1591 - * We must skip to the next position, or we'll
1592 - * leave altoffset() too early.
1593 - */
1594 - scan++;
1595 - break;
1596 - case OANYOF:
1597 - case OCHAR:
1598 - case OANY:
1599 - try++;
1600 - /*FALLTHRU*/
1601 - case OBOW:
1602 - case OEOW:
1603 - case OLPAREN:
1604 - case ORPAREN:
1605 - case OOR2:
1606 - break;
1607 - default:
1608 - try = -1;
1609 - break;
1610 - }
1611 - if (try == -1)
1612 - return (-1);
1613 - s = *scan++;
1614 - }
1615 -
1616 - if (try > largest)
1617 - largest = try;
1618 -
1619 - return (largest+offset);
1620 -}
1621 -
1622 -/*
1623 - * computejumps - compute char jumps for BM scan
1624 - *
1625 - * This algorithm assumes g->must exists and is has size greater than
1626 - * zero. It's based on the algorithm found on Computer Algorithms by
1627 - * Sara Baase.
1628 - *
1629 - * A char jump is the number of characters one needs to jump based on
1630 - * the value of the character from the text that was mismatched.
1631 - */
1632 -static void
1633 -computejumps(struct parse *p, struct re_guts *g)
1634 -{
1635 - int ch;
1636 - int mindex;
1637 -
1638 - /* Avoid making errors worse */
1639 - if (p->error != 0)
1640 - return;
1641 -
1642 - g->charjump = (int *)malloc((NC + 1) * sizeof (int));
1643 - if (g->charjump == NULL) /* Not a fatal error */
1644 - return;
1645 - /* Adjust for signed chars, if necessary */
1646 - g->charjump = &g->charjump[-(CHAR_MIN)];
1647 -
1648 - /*
1649 - * If the character does not exist in the pattern, the jump
1650 - * is equal to the number of characters in the pattern.
1651 - */
1652 - for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1653 - g->charjump[ch] = g->mlen;
1654 -
1655 - /*
1656 - * If the character does exist, compute the jump that would
1657 - * take us to the last character in the pattern equal to it
1658 - * (notice that we match right to left, so that last character
1659 - * is the first one that would be matched).
1660 - */
1661 - for (mindex = 0; mindex < g->mlen; mindex++)
1662 - g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1663 -}
1664 -
1665 -/*
1666 - * computematchjumps - compute match jumps for BM scan
1667 - *
1668 - * This algorithm assumes g->must exists and is has size greater than
1669 - * zero. It's based on the algorithm found on Computer Algorithms by
1670 - * Sara Baase.
1671 - *
1672 - * A match jump is the number of characters one needs to advance based
1673 - * on the already-matched suffix.
1674 - * Notice that all values here are minus (g->mlen-1), because of the way
1675 - * the search algorithm works.
1676 - */
1677 -static void
1678 -computematchjumps(struct parse *p, struct re_guts *g)
1679 -{
1680 - int mindex; /* General "must" iterator */
1681 - int suffix; /* Keeps track of matching suffix */
1682 - int ssuffix; /* Keeps track of suffixes' suffix */
1683 - int *pmatches;
1684 - /*
1685 - * pmatches[k] points to the next i
1686 - * such that i+1...mlen is a substring
1687 - * of k+1...k+mlen-i-1
1688 - */
1689 -
1690 - /* Avoid making errors worse */
1691 - if (p->error != 0)
1692 - return;
1693 -
1694 - pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1695 - if (pmatches == NULL) {
1696 - g->matchjump = NULL;
1697 - return;
1698 - }
1699 -
1700 - g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1701 - if (g->matchjump == NULL) { /* Not a fatal error */
1702 - free(pmatches);
1703 - return;
1704 - }
1705 -
1706 - /* Set maximum possible jump for each character in the pattern */
1707 - for (mindex = 0; mindex < g->mlen; mindex++)
1708 - g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1709 -
1710 - /* Compute pmatches[] */
1711 - for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1712 - mindex--, suffix--) {
1713 - pmatches[mindex] = suffix;
1714 -
1715 - /*
1716 - * If a mismatch is found, interrupting the substring,
1717 - * compute the matchjump for that position. If no
1718 - * mismatch is found, then a text substring mismatched
1719 - * against the suffix will also mismatch against the
1720 - * substring.
1721 - */
1722 - while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1723 - g->matchjump[suffix] = MIN(g->matchjump[suffix],
1724 - g->mlen - mindex - 1);
1725 - suffix = pmatches[suffix];
1726 - }
1727 - }
1728 -
1729 - /*
1730 - * Compute the matchjump up to the last substring found to jump
1731 - * to the beginning of the largest must pattern prefix matching
1732 - * it's own suffix.
1733 - */
1734 - for (mindex = 0; mindex <= suffix; mindex++)
1735 - g->matchjump[mindex] = MIN(g->matchjump[mindex],
1736 - g->mlen + suffix - mindex);
1737 -
1738 - ssuffix = pmatches[suffix];
1739 - while (suffix < g->mlen) {
1740 - while (suffix <= ssuffix && suffix < g->mlen) {
1741 - g->matchjump[suffix] = MIN(g->matchjump[suffix],
1742 - g->mlen + ssuffix - suffix);
1743 - suffix++;
1744 - }
1745 - if (suffix < g->mlen)
1746 - ssuffix = pmatches[ssuffix];
1747 - }
1748 -
1749 - free(pmatches);
1750 -}
1751 -
1752 -/*
1753 - * pluscount - count + nesting
1754 - */
1755 -static sopno /* nesting depth */
1756 -pluscount(struct parse *p, struct re_guts *g)
1757 -{
1758 - sop *scan;
1759 - sop s;
1760 - sopno plusnest = 0;
1761 - sopno maxnest = 0;
1762 -
1763 - if (p->error != 0)
1764 - return (0); /* there may not be an OEND */
1765 -
1766 - scan = g->strip + 1;
1767 - do {
1768 - s = *scan++;
1769 - switch (OP(s)) {
1770 - case OPLUS_:
1771 - plusnest++;
1772 - break;
1773 - case O_PLUS:
1774 - if (plusnest > maxnest)
1775 - maxnest = plusnest;
1776 - plusnest--;
1777 - break;
1778 - }
1779 - } while (OP(s) != OEND);
1780 - if (plusnest != 0)
1781 - g->iflags |= BAD;
1782 - return (maxnest);
140 + tre_free(preg);
1783 141 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX