Print this page
9083 replace regex implementation with tre

*** 1,1783 **** /* ! * Copyright 2013 Garrett D'Amore <garrett@damore.org> ! * Copyright 2010 Nexenta Systems, Inc. All rights reserved. ! * Copyright 2012 Milan Jurik. All rights reserved. ! * Copyright (c) 1992, 1993, 1994 Henry Spencer. ! * Copyright (c) 1992, 1993, 1994 ! * The Regents of the University of California. All rights reserved. * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * ! * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ! * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ! * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ! * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ! * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ! * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ! * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ! * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ! * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ! * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ! * SUCH DAMAGE. */ ! #include "lint.h" ! #include "file64.h" ! #include <sys/types.h> ! #include <stdio.h> #include <string.h> ! #include <ctype.h> ! #include <limits.h> #include <stdlib.h> - #include <regex.h> - #include <wchar.h> - #include <wctype.h> ! #include "../locale/runetype.h" ! #include "../locale/collate.h" ! #include "utils.h" ! #include "regex2.h" ! ! #include "cname.h" ! #include "../locale/mblocal.h" ! ! /* ! * parse structure, passed up and down to avoid global variables and ! * other clumsinesses ! */ ! struct parse { ! const char *next; /* next character in RE */ ! const char *end; /* end of string (-> NUL normally) */ ! int error; /* has an error been seen? */ ! sop *strip; /* malloced strip */ ! sopno ssize; /* malloced strip size (allocated) */ ! sopno slen; /* malloced strip length (used) */ ! int ncsalloc; /* number of csets allocated */ ! struct re_guts *g; ! #define NPAREN 10 /* we need to remember () 1-9 for back refs */ ! sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ ! sopno pend[NPAREN]; /* -> ) ([0] unused) */ ! }; ! ! /* ========= begin header generated by ./mkh ========= */ ! #ifdef __cplusplus ! extern "C" { ! #endif ! ! /* === regcomp.c === */ ! static void p_ere(struct parse *p, int stop); ! static void p_ere_exp(struct parse *p); ! static void p_str(struct parse *p); ! static void p_bre(struct parse *p, int end1, int end2); ! static int p_simp_re(struct parse *p, int starordinary); ! static int p_count(struct parse *p); ! static void p_bracket(struct parse *p); ! static void p_b_term(struct parse *p, cset *cs); ! static void p_b_cclass(struct parse *p, cset *cs); ! static void p_b_eclass(struct parse *p, cset *cs); ! static wint_t p_b_symbol(struct parse *p); ! static wint_t p_b_coll_elem(struct parse *p, wint_t endc); ! static wint_t othercase(wint_t ch); ! static void bothcases(struct parse *p, wint_t ch); ! static void ordinary(struct parse *p, wint_t ch); ! static void nonnewline(struct parse *p); ! static void repeat(struct parse *p, sopno start, int from, int to); ! static int seterr(struct parse *p, int e); ! static cset *allocset(struct parse *p); ! static void freeset(struct parse *p, cset *cs); ! static void CHadd(struct parse *p, cset *cs, wint_t ch); ! static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); ! static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); ! static wint_t singleton(cset *cs); ! static sopno dupl(struct parse *p, sopno start, sopno finish); ! static void doemit(struct parse *p, sop op, size_t opnd); ! static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); ! static void dofwd(struct parse *p, sopno pos, sop value); ! static int enlarge(struct parse *p, sopno size); ! static void stripsnug(struct parse *p, struct re_guts *g); ! static void findmust(struct parse *p, struct re_guts *g); ! static int altoffset(sop *scan, int offset); ! static void computejumps(struct parse *p, struct re_guts *g); ! static void computematchjumps(struct parse *p, struct re_guts *g); ! static sopno pluscount(struct parse *p, struct re_guts *g); ! static wint_t wgetnext(struct parse *p); ! ! #ifdef __cplusplus ! } ! #endif ! /* ========= end header generated by ./mkh ========= */ ! ! static char nuls[10]; /* place to point scanner in event of error */ ! ! /* ! * macros for use with parse structure ! * BEWARE: these know that the parse structure is named `p' !!! ! */ ! #define PEEK() (*p->next) ! #define PEEK2() (*(p->next+1)) ! #define MORE() (p->next < p->end) ! #define MORE2() (p->next+1 < p->end) ! #define SEE(c) (MORE() && PEEK() == (c)) ! #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) ! #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) ! #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) ! #define NEXT() (p->next++) ! #define NEXT2() (p->next += 2) ! #define NEXTn(n) (p->next += (n)) ! #define GETNEXT() (*p->next++) ! #define WGETNEXT() wgetnext(p) ! #define SETERROR(e) ((void)seterr(p, (e))) ! #define REQUIRE(co, e) ((co) || seterr(p, e)) ! #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) ! #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) ! #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) ! #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) ! #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) ! #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) ! #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) ! #define HERE() (p->slen) ! #define THERE() (p->slen - 1) ! #define THERETHERE() (p->slen - 2) ! #define DROP(n) (p->slen -= (n)) ! ! #ifndef NDEBUG ! static int never = 0; /* for use in asserts; shuts lint up */ ! #else ! #define never 0 /* some <assert.h>s have bugs too */ ! #endif ! ! /* ! * regcomp - interface for parser and compilation ! */ ! int /* 0 success, otherwise REG_something */ ! regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern, ! int cflags) { ! struct parse pa; ! struct re_guts *g; ! struct parse *p = &pa; ! int i; ! size_t len; ! size_t maxlen; ! #ifdef REDEBUG ! #define GOODFLAGS(f) (f) ! #else ! #define GOODFLAGS(f) ((f)&~REG_DUMP) ! #endif ! /* We had REG_INVARG, but we don't have that on Solaris. */ ! cflags = GOODFLAGS(cflags); ! if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC)) ! return (REG_EFATAL); ! if (cflags&REG_PEND) { ! if (preg->re_endp < pattern) ! return (REG_EFATAL); ! len = preg->re_endp - pattern; ! } else ! len = strlen(pattern); ! /* do the mallocs early so failure handling is easy */ ! g = (struct re_guts *)malloc(sizeof (struct re_guts)); ! if (g == NULL) ! return (REG_ESPACE); ! /* ! * Limit the pattern space to avoid a 32-bit overflow on buffer ! * extension. Also avoid any signed overflow in case of conversion ! * so make the real limit based on a 31-bit overflow. ! * ! * Likely not applicable on 64-bit systems but handle the case ! * generically (who are we to stop people from using ~715MB+ ! * patterns?). ! */ ! maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3; ! if (len >= maxlen) { ! free((char *)g); ! return (REG_ESPACE); } - p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ - assert(p->ssize >= len); - - p->strip = (sop *)malloc(p->ssize * sizeof (sop)); - p->slen = 0; - if (p->strip == NULL) { - free((char *)g); - return (REG_ESPACE); - } - - /* set things up */ - p->g = g; - p->next = pattern; /* convenience; we do not modify it */ - p->end = p->next + len; - p->error = 0; - p->ncsalloc = 0; - for (i = 0; i < NPAREN; i++) { - p->pbegin[i] = 0; - p->pend[i] = 0; - } - g->sets = NULL; - g->ncsets = 0; - g->cflags = cflags; - g->iflags = 0; - g->nbol = 0; - g->neol = 0; - g->must = NULL; - g->moffset = -1; - g->charjump = NULL; - g->matchjump = NULL; - g->mlen = 0; - g->nsub = 0; - g->backrefs = 0; - - /* do it */ - EMIT(OEND, 0); - g->firststate = THERE(); - if (cflags&REG_EXTENDED) - p_ere(p, OUT); - else if (cflags&REG_NOSPEC) - p_str(p); else ! p_bre(p, OUT, OUT); ! EMIT(OEND, 0); ! g->laststate = THERE(); ! /* tidy up loose ends and fill things in */ ! stripsnug(p, g); ! findmust(p, g); ! /* ! * only use Boyer-Moore algorithm if the pattern is bigger ! * than three characters ! */ ! if (g->mlen > 3) { ! computejumps(p, g); ! computematchjumps(p, g); ! if (g->matchjump == NULL && g->charjump != NULL) { ! free(g->charjump); ! g->charjump = NULL; ! } ! } ! g->nplus = pluscount(p, g); ! g->magic = MAGIC2; ! preg->re_nsub = g->nsub; ! preg->re_g = g; ! preg->re_magic = MAGIC1; ! #ifndef REDEBUG ! /* not debugging, so can't rely on the assert() in regexec() */ ! if (g->iflags&BAD) ! SETERROR(REG_EFATAL); ! #endif ! ! /* win or lose, we're done */ ! if (p->error != 0) /* lose */ ! regfree(preg); ! return (p->error); ! } ! ! /* ! * p_ere - ERE parser top level, concatenation and alternation ! */ ! static void ! p_ere(struct parse *p, ! int stop) /* character this ERE should end at */ ! { ! char c; ! sopno prevback; ! sopno prevfwd; ! sopno conc; ! int first = 1; /* is this the first alternative? */ ! ! for (;;) { ! /* do a bunch of concatenated expressions */ ! conc = HERE(); ! while (MORE() && (c = PEEK()) != '|' && c != stop) ! p_ere_exp(p); ! /* require nonempty */ ! (void) REQUIRE(HERE() != conc, REG_BADPAT); ! ! if (!EAT('|')) ! break; /* NOTE BREAK OUT */ ! ! if (first) { ! INSERT(OCH_, conc); /* offset is wrong */ ! prevfwd = conc; ! prevback = conc; ! first = 0; ! } ! ASTERN(OOR1, prevback); ! prevback = THERE(); ! AHEAD(prevfwd); /* fix previous offset */ ! prevfwd = HERE(); ! EMIT(OOR2, 0); /* offset is very wrong */ ! } ! ! if (!first) { /* tail-end fixups */ ! AHEAD(prevfwd); ! ASTERN(O_CH, prevback); ! } ! ! assert(!MORE() || SEE(stop)); ! } ! ! /* ! * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op ! */ ! static void ! p_ere_exp(struct parse *p) ! { ! char c; ! wint_t wc; ! sopno pos; ! int count; ! int count2; ! sopno subno; ! int wascaret = 0; ! ! assert(MORE()); /* caller should have ensured this */ ! c = GETNEXT(); ! ! pos = HERE(); ! switch (c) { ! case '(': ! (void) REQUIRE(MORE(), REG_EPAREN); ! p->g->nsub++; ! subno = p->g->nsub; ! if (subno < NPAREN) ! p->pbegin[subno] = HERE(); ! EMIT(OLPAREN, subno); ! if (!SEE(')')) ! p_ere(p, ')'); ! if (subno < NPAREN) { ! p->pend[subno] = HERE(); ! assert(p->pend[subno] != 0); ! } ! EMIT(ORPAREN, subno); ! (void) MUSTEAT(')', REG_EPAREN); ! break; ! #ifndef POSIX_MISTAKE ! case ')': /* happens only if no current unmatched ( */ ! /* ! * You may ask, why the ifndef? Because I didn't notice ! * this until slightly too late for 1003.2, and none of the ! * other 1003.2 regular-expression reviewers noticed it at ! * all. So an unmatched ) is legal POSIX, at least until ! * we can get it fixed. ! */ ! SETERROR(REG_EPAREN); ! break; ! #endif ! case '^': ! EMIT(OBOL, 0); ! p->g->iflags |= USEBOL; ! p->g->nbol++; ! wascaret = 1; ! break; ! case '$': ! EMIT(OEOL, 0); ! p->g->iflags |= USEEOL; ! p->g->neol++; ! break; ! case '|': ! SETERROR(REG_BADPAT); ! break; ! case '*': ! case '+': ! case '?': ! case '{': ! SETERROR(REG_BADRPT); ! break; ! case '.': ! if (p->g->cflags&REG_NEWLINE) ! nonnewline(p); else ! EMIT(OANY, 0); ! break; ! case '[': ! p_bracket(p); ! break; ! case '\\': ! (void) REQUIRE(MORE(), REG_EESCAPE); ! wc = WGETNEXT(); ! switch (wc) { ! case '<': ! EMIT(OBOW, 0); ! break; ! case '>': ! EMIT(OEOW, 0); ! break; ! default: ! ordinary(p, wc); ! break; } break; ! default: ! if (p->error != 0) ! return; ! p->next--; ! wc = WGETNEXT(); ! ordinary(p, wc); ! break; } ! ! if (!MORE()) ! return; ! c = PEEK(); ! /* we call { a repetition if followed by a digit */ ! if (!(c == '*' || c == '+' || c == '?' || c == '{')) ! return; /* no repetition, we're done */ ! else if (c == '{') ! (void) REQUIRE(MORE2() && \ ! (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); ! NEXT(); ! ! (void) REQUIRE(!wascaret, REG_BADRPT); ! switch (c) { ! case '*': /* implemented as +? */ ! /* this case does not require the (y|) trick, noKLUDGE */ ! INSERT(OPLUS_, pos); ! ASTERN(O_PLUS, pos); ! INSERT(OQUEST_, pos); ! ASTERN(O_QUEST, pos); ! break; ! case '+': ! INSERT(OPLUS_, pos); ! ASTERN(O_PLUS, pos); ! break; ! case '?': ! /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ ! INSERT(OCH_, pos); /* offset slightly wrong */ ! ASTERN(OOR1, pos); /* this one's right */ ! AHEAD(pos); /* fix the OCH_ */ ! EMIT(OOR2, 0); /* offset very wrong... */ ! AHEAD(THERE()); /* ...so fix it */ ! ASTERN(O_CH, THERETHERE()); ! break; ! case '{': ! count = p_count(p); ! if (EAT(',')) { ! if (isdigit((uch)PEEK())) { ! count2 = p_count(p); ! (void) REQUIRE(count <= count2, REG_BADBR); ! } else /* single number with comma */ ! count2 = INFINITY; ! } else /* just a single number */ ! count2 = count; ! repeat(p, pos, count, count2); ! if (!EAT('}')) { /* error heuristics */ ! while (MORE() && PEEK() != '}') ! NEXT(); ! (void) REQUIRE(MORE(), REG_EBRACE); ! SETERROR(REG_BADBR); } ! break; } ! if (!MORE()) ! return; ! c = PEEK(); ! if (!(c == '*' || c == '+' || c == '?' || ! (c == '{' && MORE2() && isdigit((uch)PEEK2())))) ! return; ! SETERROR(REG_BADRPT); ! } ! /* ! * p_str - string (no metacharacters) "parser" ! */ ! static void ! p_str(struct parse *p) ! { ! (void) REQUIRE(MORE(), REG_BADPAT); ! while (MORE()) ! ordinary(p, WGETNEXT()); } ! /* ! * p_bre - BRE parser top level, anchoring and concatenation ! * Giving end1 as OUT essentially eliminates the end1/end2 check. ! * ! * This implementation is a bit of a kludge, in that a trailing $ is first ! * taken as an ordinary character and then revised to be an anchor. ! * The amount of lookahead needed to avoid this kludge is excessive. ! */ ! static void ! p_bre(struct parse *p, ! int end1, /* first terminating character */ ! int end2) /* second terminating character */ { ! sopno start = HERE(); ! int first = 1; /* first subexpression? */ ! int wasdollar = 0; ! ! if (EAT('^')) { ! EMIT(OBOL, 0); ! p->g->iflags |= USEBOL; ! p->g->nbol++; ! } ! while (MORE() && !SEETWO(end1, end2)) { ! wasdollar = p_simp_re(p, first); ! first = 0; ! } ! if (wasdollar) { /* oops, that was a trailing anchor */ ! DROP(1); ! EMIT(OEOL, 0); ! p->g->iflags |= USEEOL; ! p->g->neol++; ! } ! ! (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */ } ! /* ! * p_simp_re - parse a simple RE, an atom possibly followed by a repetition ! */ ! static int /* was the simple RE an unbackslashed $? */ ! p_simp_re(struct parse *p, ! int starordinary) /* is a leading * an ordinary character? */ { - int c; - int count; - int count2; - sopno pos; - int i; - wint_t wc; - sopno subno; - #define BACKSL (1<<CHAR_BIT) - - pos = HERE(); /* repetition op, if any, covers from here */ - - assert(MORE()); /* caller should have ensured this */ - c = GETNEXT(); - if (c == '\\') { - (void) REQUIRE(MORE(), REG_EESCAPE); - c = BACKSL | GETNEXT(); - } - switch (c) { - case '.': - if (p->g->cflags&REG_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; - case BACKSL|'<': - EMIT(OBOW, 0); - break; - case BACKSL|'>': - EMIT(OEOW, 0); - break; - case BACKSL|'{': - SETERROR(REG_BADRPT); - break; - case BACKSL|'(': - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - /* the MORE here is an error heuristic */ - if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); - if (subno < NPAREN) { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); - } - EMIT(ORPAREN, subno); - (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN); - break; - case BACKSL|')': /* should not get here -- must be user */ - SETERROR(REG_EPAREN); - break; - case BACKSL|'1': - case BACKSL|'2': - case BACKSL|'3': - case BACKSL|'4': - case BACKSL|'5': - case BACKSL|'6': - case BACKSL|'7': - case BACKSL|'8': - case BACKSL|'9': - i = (c&~BACKSL) - '0'; - assert(i < NPAREN); - if (p->pend[i] != 0) { - assert(i <= p->g->nsub); - EMIT(OBACK_, i); - assert(p->pbegin[i] != 0); - assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); - assert(OP(p->strip[p->pend[i]]) == ORPAREN); - (void) dupl(p, p->pbegin[i]+1, p->pend[i]); - EMIT(O_BACK, i); - } else - SETERROR(REG_ESUBREG); - p->g->backrefs = 1; - break; - case '*': - (void) REQUIRE(starordinary, REG_BADRPT); - /* FALLTHROUGH */ - default: - if (p->error != 0) - return (0); /* Definitely not $... */ - p->next--; - wc = WGETNEXT(); - ordinary(p, wc); - break; - } - - if (EAT('*')) { /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - } else if (EATTWO('\\', '{')) { - count = p_count(p); - if (EAT(',')) { - if (MORE() && isdigit((uch)PEEK())) { - count2 = p_count(p); - (void) REQUIRE(count <= count2, REG_BADBR); - } else /* single number with comma */ - count2 = INFINITY; - } else /* just a single number */ - count2 = count; - repeat(p, pos, count, count2); - if (!EATTWO('\\', '}')) { /* error heuristics */ - while (MORE() && !SEETWO('\\', '}')) - NEXT(); - (void) REQUIRE(MORE(), REG_EBRACE); - SETERROR(REG_BADBR); - } - } else if (c == '$') /* $ (but not \$) ends it */ - return (1); - - return (0); - } - - /* - * p_count - parse a repetition count - */ - static int /* the value */ - p_count(struct parse *p) - { - int count = 0; - int ndigits = 0; - - while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { - count = count*10 + (GETNEXT() - '0'); - ndigits++; - } - - (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); - return (count); - } - - /* - * p_bracket - parse a bracketed character list - */ - static void - p_bracket(struct parse *p) - { - cset *cs; - wint_t ch; - - /* Dept of Truly Sickening Special-Case Kludges */ - if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { - EMIT(OBOW, 0); - NEXTn(6); - return; - } - if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { - EMIT(OEOW, 0); - NEXTn(6); - return; - } - - if ((cs = allocset(p)) == NULL) - return; - - if (p->g->cflags&REG_ICASE) - cs->icase = 1; - if (EAT('^')) - cs->invert = 1; - if (EAT(']')) - CHadd(p, cs, ']'); - else if (EAT('-')) - CHadd(p, cs, '-'); - while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) - p_b_term(p, cs); - if (EAT('-')) - CHadd(p, cs, '-'); - (void) MUSTEAT(']', REG_EBRACK); - - if (p->error != 0) /* don't mess things up further */ - return; - - if (cs->invert && p->g->cflags&REG_NEWLINE) - cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); - - if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ - ordinary(p, ch); - freeset(p, cs); - } else - EMIT(OANYOF, (int)(cs - p->g->sets)); - } - - /* - * p_b_term - parse one term of a bracketed character list - */ - static void - p_b_term(struct parse *p, cset *cs) - { - char c; - wint_t start, finish; - wint_t i; - locale_t loc = uselocale(NULL); - - /* classify what we've got */ - switch ((MORE()) ? PEEK() : '\0') { - case '[': - c = (MORE2()) ? PEEK2() : '\0'; - break; - case '-': - SETERROR(REG_ERANGE); - return; /* NOTE RETURN */ - default: - c = '\0'; - break; - } - - switch (c) { - case ':': /* character class */ - NEXT2(); - (void) REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE); - p_b_cclass(p, cs); - (void) REQUIRE(MORE(), REG_EBRACK); - (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE); - break; - case '=': /* equivalence class */ - NEXT2(); - (void) REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE); - p_b_eclass(p, cs); - (void) REQUIRE(MORE(), REG_EBRACK); - (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); - break; - default: /* symbol, ordinary character, or range */ - start = p_b_symbol(p); - if (SEE('-') && MORE2() && PEEK2() != ']') { - /* range */ - NEXT(); - if (EAT('-')) - finish = '-'; - else - finish = p_b_symbol(p); - } else - finish = start; - if (start == finish) - CHadd(p, cs, start); - else { - if (loc->collate->lc_is_posix) { - (void) REQUIRE((uch)start <= (uch)finish, - REG_ERANGE); - CHaddrange(p, cs, start, finish); - } else { - (void) REQUIRE(_collate_range_cmp(start, - finish, loc) <= 0, REG_ERANGE); - for (i = 0; i <= UCHAR_MAX; i++) { - if (_collate_range_cmp(start, i, loc) - <= 0 && - _collate_range_cmp(i, finish, loc) - <= 0) - CHadd(p, cs, i); - } - } - } - break; - } - } - - /* - * p_b_cclass - parse a character-class name and deal with it - */ - static void - p_b_cclass(struct parse *p, cset *cs) - { - const char *sp = p->next; size_t len; - wctype_t wct; - char clname[16]; ! while (MORE() && isalpha((uch)PEEK())) ! NEXT(); ! len = p->next - sp; ! if (len >= sizeof (clname) - 1) { ! SETERROR(REG_ECTYPE); ! return; } - (void) memcpy(clname, sp, len); - clname[len] = '\0'; - if ((wct = wctype(clname)) == 0) { - SETERROR(REG_ECTYPE); - return; - } - CHaddtype(p, cs, wct); - } - - /* - * p_b_eclass - parse an equivalence-class name and deal with it - * - * This implementation is incomplete. xxx - */ - static void - p_b_eclass(struct parse *p, cset *cs) - { - wint_t c; - - c = p_b_coll_elem(p, '='); - CHadd(p, cs, c); - } - - /* - * p_b_symbol - parse a character or [..]ed multicharacter collating symbol - */ - static wint_t /* value of symbol */ - p_b_symbol(struct parse *p) - { - wint_t value; - - (void) REQUIRE(MORE(), REG_EBRACK); - if (!EATTWO('[', '.')) - return (WGETNEXT()); - - /* collating symbol */ - value = p_b_coll_elem(p, '.'); - (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); - return (value); - } - - /* - * p_b_coll_elem - parse a collating-element name and look it up - */ - static wint_t /* value of collating element */ - p_b_coll_elem(struct parse *p, - wint_t endc) /* name ended by endc,']' */ - { - const char *sp = p->next; - struct cname *cp; - mbstate_t mbs; - wchar_t wc; - size_t clen, len; - - while (MORE() && !SEETWO(endc, ']')) - NEXT(); - if (!MORE()) { - SETERROR(REG_EBRACK); - return (0); - } - len = p->next - sp; - for (cp = cnames; cp->name != NULL; cp++) - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') - return (cp->code); /* known name */ - (void) memset(&mbs, 0, sizeof (mbs)); - if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) - return (wc); /* single character */ - else if (clen == (size_t)-1 || clen == (size_t)-2) - SETERROR(REG_ECHAR); else ! SETERROR(REG_ECOLLATE); /* neither */ ! return (0); } ! /* ! * othercase - return the case counterpart of an alphabetic ! */ ! static wint_t /* if no counterpart, return ch */ ! othercase(wint_t ch) { ! assert(iswalpha(ch)); ! if (iswupper(ch)) ! return (towlower(ch)); ! else if (iswlower(ch)) ! return (towupper(ch)); ! else /* peculiar, but could happen */ ! return (ch); } ! /* ! * bothcases - emit a dualcase version of a two-case character ! * ! * Boy, is this implementation ever a kludge... ! */ ! static void ! bothcases(struct parse *p, wint_t ch) { ! const char *oldnext = p->next; ! const char *oldend = p->end; ! char bracket[3 + MB_LEN_MAX]; ! size_t n; ! mbstate_t mbs; ! ! assert(othercase(ch) != ch); /* p_bracket() would recurse */ ! p->next = bracket; ! (void) memset(&mbs, 0, sizeof (mbs)); ! n = wcrtomb(bracket, ch, &mbs); ! assert(n != (size_t)-1); ! bracket[n] = ']'; ! bracket[n + 1] = '\0'; ! p->end = bracket+n+1; ! p_bracket(p); ! assert(p->next == p->end); ! p->next = oldnext; ! p->end = oldend; ! } ! ! /* ! * ordinary - emit an ordinary character ! */ ! static void ! ordinary(struct parse *p, wint_t ch) ! { ! cset *cs; ! ! if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch) ! bothcases(p, ch); ! else if ((ch & OPDMASK) == ch) ! EMIT(OCHAR, ch); ! else { ! /* ! * Kludge: character is too big to fit into an OCHAR operand. ! * Emit a singleton set. ! */ ! if ((cs = allocset(p)) == NULL) ! return; ! CHadd(p, cs, ch); ! EMIT(OANYOF, (int)(cs - p->g->sets)); ! } ! } ! ! /* ! * nonnewline - emit REG_NEWLINE version of OANY ! * ! * Boy, is this implementation ever a kludge... ! */ ! static void ! nonnewline(struct parse *p) ! { ! const char *oldnext = p->next; ! const char *oldend = p->end; ! char bracket[4]; ! ! p->next = bracket; ! p->end = bracket+3; ! bracket[0] = '^'; ! bracket[1] = '\n'; ! bracket[2] = ']'; ! bracket[3] = '\0'; ! p_bracket(p); ! assert(p->next == bracket+3); ! p->next = oldnext; ! p->end = oldend; ! } ! ! /* ! * repeat - generate code for a bounded repetition, recursively if needed ! */ ! static void ! repeat(struct parse *p, ! sopno start, /* operand from here to end of strip */ ! int from, /* repeated from this number */ ! int to) /* to this number of times (maybe INFINITY) */ ! { ! sopno finish = HERE(); ! #define N 2 ! #define INF 3 ! #define REP(f, t) ((f)*8 + (t)) ! #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) ! sopno copy; ! ! if (p->error != 0) /* head off possible runaway recursion */ ! return; ! ! assert(from <= to); ! ! switch (REP(MAP(from), MAP(to))) { ! case REP(0, 0): /* must be user doing this */ ! DROP(finish-start); /* drop the operand */ ! break; ! case REP(0, 1): /* as x{1,1}? */ ! case REP(0, N): /* as x{1,n}? */ ! case REP(0, INF): /* as x{1,}? */ ! /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ ! INSERT(OCH_, start); /* offset is wrong... */ ! repeat(p, start+1, 1, to); ! ASTERN(OOR1, start); ! AHEAD(start); /* ... fix it */ ! EMIT(OOR2, 0); ! AHEAD(THERE()); ! ASTERN(O_CH, THERETHERE()); ! break; ! case REP(1, 1): /* trivial case */ ! /* done */ ! break; ! case REP(1, N): /* as x?x{1,n-1} */ ! /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ ! INSERT(OCH_, start); ! ASTERN(OOR1, start); ! AHEAD(start); ! EMIT(OOR2, 0); /* offset very wrong... */ ! AHEAD(THERE()); /* ...so fix it */ ! ASTERN(O_CH, THERETHERE()); ! copy = dupl(p, start+1, finish+1); ! assert(copy == finish+4); ! repeat(p, copy, 1, to-1); ! break; ! case REP(1, INF): /* as x+ */ ! INSERT(OPLUS_, start); ! ASTERN(O_PLUS, start); ! break; ! case REP(N, N): /* as xx{m-1,n-1} */ ! copy = dupl(p, start, finish); ! repeat(p, copy, from-1, to-1); ! break; ! case REP(N, INF): /* as xx{n-1,INF} */ ! copy = dupl(p, start, finish); ! repeat(p, copy, from-1, to); ! break; ! default: /* "can't happen" */ ! SETERROR(REG_EFATAL); /* just in case */ ! break; ! } ! } ! ! /* ! * wgetnext - helper function for WGETNEXT() macro. Gets the next wide ! * character from the parse struct, signals a REG_ILLSEQ error if the ! * character can't be converted. Returns the number of bytes consumed. ! */ ! static wint_t ! wgetnext(struct parse *p) ! { ! mbstate_t mbs; ! wchar_t wc; ! size_t n; ! ! (void) memset(&mbs, 0, sizeof (mbs)); ! n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); ! if (n == (size_t)-1 || n == (size_t)-2) { ! SETERROR(REG_ECHAR); ! return (0); ! } ! if (n == 0) ! n = 1; ! p->next += n; ! return (wc); ! } ! ! /* ! * seterr - set an error condition ! */ ! static int /* useless but makes type checking happy */ ! seterr(struct parse *p, int e) ! { ! if (p->error == 0) /* keep earliest error condition */ ! p->error = e; ! p->next = nuls; /* try to bring things to a halt */ ! p->end = nuls; ! return (0); /* make the return value well-defined */ ! } ! ! /* ! * allocset - allocate a set of characters for [] ! */ ! static cset * ! allocset(struct parse *p) ! { ! cset *cs, *ncs; ! ! ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs)); ! if (ncs == NULL) { ! SETERROR(REG_ESPACE); ! return (NULL); ! } ! p->g->sets = ncs; ! cs = &p->g->sets[p->g->ncsets++]; ! (void) memset(cs, 0, sizeof (*cs)); ! ! return (cs); ! } ! ! /* ! * freeset - free a now-unused set ! */ ! static void ! freeset(struct parse *p, cset *cs) ! { ! cset *top = &p->g->sets[p->g->ncsets]; ! ! free(cs->wides); ! free(cs->ranges); ! free(cs->types); ! (void) memset(cs, 0, sizeof (*cs)); ! if (cs == top-1) /* recover only the easy case */ ! p->g->ncsets--; ! } ! ! /* ! * singleton - Determine whether a set contains only one character, ! * returning it if so, otherwise returning OUT. ! */ ! static wint_t ! singleton(cset *cs) ! { ! wint_t i, s, n; ! ! for (i = n = 0; i < NC; i++) ! if (CHIN(cs, i)) { ! n++; ! s = i; ! } ! if (n == 1) ! return (s); ! if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && ! cs->icase == 0) ! return (cs->wides[0]); ! /* Don't bother handling the other cases. */ ! return (OUT); ! } ! ! /* ! * CHadd - add character to character set. ! */ ! static void ! CHadd(struct parse *p, cset *cs, wint_t ch) ! { ! wint_t nch, *newwides; ! assert(ch >= 0); ! if (ch < NC) ! cs->bmp[ch >> 3] |= 1 << (ch & 7); ! else { ! newwides = realloc(cs->wides, (cs->nwides + 1) * ! sizeof (*cs->wides)); ! if (newwides == NULL) { ! SETERROR(REG_ESPACE); ! return; ! } ! cs->wides = newwides; ! cs->wides[cs->nwides++] = ch; ! } ! if (cs->icase) { ! if ((nch = towlower(ch)) < NC) ! cs->bmp[nch >> 3] |= 1 << (nch & 7); ! if ((nch = towupper(ch)) < NC) ! cs->bmp[nch >> 3] |= 1 << (nch & 7); ! } ! } ! ! /* ! * CHaddrange - add all characters in the range [min,max] to a character set. ! */ ! static void ! CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) ! { ! crange *newranges; ! ! for (; min < NC && min <= max; min++) ! CHadd(p, cs, min); ! if (min >= max) ! return; ! newranges = realloc(cs->ranges, (cs->nranges + 1) * ! sizeof (*cs->ranges)); ! if (newranges == NULL) { ! SETERROR(REG_ESPACE); ! return; ! } ! cs->ranges = newranges; ! cs->ranges[cs->nranges].min = min; ! cs->ranges[cs->nranges].max = max; ! cs->nranges++; ! } ! ! /* ! * CHaddtype - add all characters of a certain type to a character set. ! */ ! static void ! CHaddtype(struct parse *p, cset *cs, wctype_t wct) ! { ! wint_t i; ! wctype_t *newtypes; ! ! for (i = 0; i < NC; i++) ! if (iswctype(i, wct)) ! CHadd(p, cs, i); ! newtypes = realloc(cs->types, (cs->ntypes + 1) * ! sizeof (*cs->types)); ! if (newtypes == NULL) { ! SETERROR(REG_ESPACE); ! return; ! } ! cs->types = newtypes; ! cs->types[cs->ntypes++] = wct; ! } ! ! /* ! * dupl - emit a duplicate of a bunch of sops ! */ ! static sopno /* start of duplicate */ ! dupl(struct parse *p, ! sopno start, /* from here */ ! sopno finish) /* to this less one */ ! { ! sopno ret = HERE(); ! sopno len = finish - start; ! ! assert(finish >= start); ! if (len == 0) ! return (ret); ! if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ ! return (ret); ! assert(p->ssize >= p->slen + len); ! (void) memcpy((char *)(p->strip + p->slen), ! (char *)(p->strip + start), (size_t)len*sizeof (sop)); ! p->slen += len; ! return (ret); ! } ! ! /* ! * doemit - emit a strip operator ! * ! * It might seem better to implement this as a macro with a function as ! * hard-case backup, but it's just too big and messy unless there are ! * some changes to the data structures. Maybe later. ! */ ! static void ! doemit(struct parse *p, sop op, size_t opnd) ! { ! /* avoid making error situations worse */ ! if (p->error != 0) ! return; ! ! /* deal with oversize operands ("can't happen", more or less) */ ! assert(opnd < 1<<OPSHIFT); ! ! /* deal with undersized strip */ ! if (p->slen >= p->ssize) ! if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ ! return; ! ! /* finally, it's all reduced to the easy case */ ! p->strip[p->slen++] = SOP(op, opnd); ! } ! ! /* ! * doinsert - insert a sop into the strip ! */ ! static void ! doinsert(struct parse *p, sop op, size_t opnd, sopno pos) ! { ! sopno sn; ! sop s; ! int i; ! ! /* avoid making error situations worse */ ! if (p->error != 0) ! return; ! ! sn = HERE(); ! EMIT(op, opnd); /* do checks, ensure space */ ! assert(HERE() == sn+1); ! s = p->strip[sn]; ! ! /* adjust paren pointers */ ! assert(pos > 0); ! for (i = 1; i < NPAREN; i++) { ! if (p->pbegin[i] >= pos) { ! p->pbegin[i]++; ! } ! if (p->pend[i] >= pos) { ! p->pend[i]++; ! } ! } ! ! (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], ! (HERE()-pos-1)*sizeof (sop)); ! p->strip[pos] = s; ! } ! ! /* ! * dofwd - complete a forward reference ! */ ! static void ! dofwd(struct parse *p, sopno pos, sop value) ! { ! /* avoid making error situations worse */ ! if (p->error != 0) ! return; ! ! assert(value < 1<<OPSHIFT); ! p->strip[pos] = OP(p->strip[pos]) | value; ! } ! ! /* ! * enlarge - enlarge the strip ! */ ! static int ! enlarge(struct parse *p, sopno size) ! { ! sop *sp; ! ! if (p->ssize >= size) ! return (1); ! ! sp = (sop *)realloc(p->strip, size*sizeof (sop)); ! if (sp == NULL) { ! SETERROR(REG_ESPACE); ! return (0); ! } ! p->strip = sp; ! p->ssize = size; ! return (1); ! } ! ! /* ! * stripsnug - compact the strip ! */ ! static void ! stripsnug(struct parse *p, struct re_guts *g) ! { ! g->nstates = p->slen; ! g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop)); ! if (g->strip == NULL) { ! SETERROR(REG_ESPACE); ! g->strip = p->strip; ! } ! } ! ! /* ! * findmust - fill in must and mlen with longest mandatory literal string ! * ! * This algorithm could do fancy things like analyzing the operands of | ! * for common subsequences. Someday. This code is simple and finds most ! * of the interesting cases. ! * ! * Note that must and mlen got initialized during setup. ! */ ! static void ! findmust(struct parse *p, struct re_guts *g) ! { ! sop *scan; ! sop *start = NULL; ! sop *newstart = NULL; ! sopno newlen; ! sop s; ! char *cp; ! int offset; ! char buf[MB_LEN_MAX]; ! size_t clen; ! mbstate_t mbs; ! locale_t loc = uselocale(NULL); ! ! /* avoid making error situations worse */ ! if (p->error != 0) ! return; ! ! /* ! * It's not generally safe to do a ``char'' substring search on ! * multibyte character strings, but it's safe for at least ! * UTF-8 (see RFC 3629). ! */ ! if (MB_CUR_MAX > 1 && ! strcmp(loc->runelocale->__encoding, "UTF-8") != 0) ! return; ! ! /* find the longest OCHAR sequence in strip */ ! newlen = 0; ! offset = 0; ! g->moffset = 0; ! scan = g->strip + 1; ! do { ! s = *scan++; ! switch (OP(s)) { ! case OCHAR: /* sequence member */ ! if (newlen == 0) { /* new sequence */ ! (void) memset(&mbs, 0, sizeof (mbs)); ! newstart = scan - 1; ! } ! clen = wcrtomb(buf, OPND(s), &mbs); ! if (clen == (size_t)-1) ! goto toohard; ! newlen += clen; ! break; ! case OPLUS_: /* things that don't break one */ ! case OLPAREN: ! case ORPAREN: ! break; ! case OQUEST_: /* things that must be skipped */ ! case OCH_: ! offset = altoffset(scan, offset); ! scan--; ! do { ! scan += OPND(s); ! s = *scan; ! /* assert() interferes w debug printouts */ ! if (OP(s) != O_QUEST && OP(s) != O_CH && ! OP(s) != OOR2) { ! g->iflags |= BAD; ! return; ! } ! } while (OP(s) != O_QUEST && OP(s) != O_CH); ! /* FALLTHROUGH */ ! case OBOW: /* things that break a sequence */ ! case OEOW: ! case OBOL: ! case OEOL: ! case O_QUEST: ! case O_CH: ! case OEND: ! if (newlen > g->mlen) { /* ends one */ ! start = newstart; ! g->mlen = newlen; ! if (offset > -1) { ! g->moffset += offset; ! offset = newlen; ! } else ! g->moffset = offset; ! } else { ! if (offset > -1) ! offset += newlen; ! } ! newlen = 0; ! break; ! case OANY: ! if (newlen > g->mlen) { /* ends one */ ! start = newstart; ! g->mlen = newlen; ! if (offset > -1) { ! g->moffset += offset; ! offset = newlen; ! } else ! g->moffset = offset; ! } else { ! if (offset > -1) ! offset += newlen; ! } ! if (offset > -1) ! offset++; ! newlen = 0; ! break; ! case OANYOF: /* may or may not invalidate offset */ ! /* First, everything as OANY */ ! if (newlen > g->mlen) { /* ends one */ ! start = newstart; ! g->mlen = newlen; ! if (offset > -1) { ! g->moffset += offset; ! offset = newlen; ! } else ! g->moffset = offset; ! } else { ! if (offset > -1) ! offset += newlen; ! } ! if (offset > -1) ! offset++; ! newlen = 0; ! break; ! toohard: ! default: ! /* ! * Anything here makes it impossible or too hard ! * to calculate the offset -- so we give up; ! * save the last known good offset, in case the ! * must sequence doesn't occur later. ! */ ! if (newlen > g->mlen) { /* ends one */ ! start = newstart; ! g->mlen = newlen; ! if (offset > -1) ! g->moffset += offset; ! else ! g->moffset = offset; ! } ! offset = -1; ! newlen = 0; ! break; ! } ! } while (OP(s) != OEND); ! ! if (g->mlen == 0) { /* there isn't one */ ! g->moffset = -1; ! return; ! } ! ! /* turn it into a character string */ ! g->must = malloc((size_t)g->mlen + 1); ! if (g->must == NULL) { /* argh; just forget it */ ! g->mlen = 0; ! g->moffset = -1; ! return; ! } ! cp = g->must; ! scan = start; ! (void) memset(&mbs, 0, sizeof (mbs)); ! while (cp < g->must + g->mlen) { ! while (OP(s = *scan++) != OCHAR) ! continue; ! clen = wcrtomb(cp, OPND(s), &mbs); ! assert(clen != (size_t)-1); ! cp += clen; ! } ! assert(cp == g->must + g->mlen); ! *cp++ = '\0'; /* just on general principles */ ! } ! ! /* ! * altoffset - choose biggest offset among multiple choices ! * ! * Compute, recursively if necessary, the largest offset among multiple ! * re paths. ! */ ! static int ! altoffset(sop *scan, int offset) ! { ! int largest; ! int try; ! sop s; ! ! /* If we gave up already on offsets, return */ ! if (offset == -1) ! return (-1); ! ! largest = 0; ! try = 0; ! s = *scan++; ! while (OP(s) != O_QUEST && OP(s) != O_CH) { ! switch (OP(s)) { ! case OOR1: ! if (try > largest) ! largest = try; ! try = 0; ! break; ! case OQUEST_: ! case OCH_: ! try = altoffset(scan, try); ! if (try == -1) ! return (-1); ! scan--; ! do { ! scan += OPND(s); ! s = *scan; ! if (OP(s) != O_QUEST && OP(s) != O_CH && ! OP(s) != OOR2) ! return (-1); ! } while (OP(s) != O_QUEST && OP(s) != O_CH); ! /* ! * We must skip to the next position, or we'll ! * leave altoffset() too early. ! */ ! scan++; ! break; ! case OANYOF: ! case OCHAR: ! case OANY: ! try++; ! /*FALLTHRU*/ ! case OBOW: ! case OEOW: ! case OLPAREN: ! case ORPAREN: ! case OOR2: ! break; ! default: ! try = -1; ! break; ! } ! if (try == -1) ! return (-1); ! s = *scan++; ! } ! ! if (try > largest) ! largest = try; ! ! return (largest+offset); ! } ! ! /* ! * computejumps - compute char jumps for BM scan ! * ! * This algorithm assumes g->must exists and is has size greater than ! * zero. It's based on the algorithm found on Computer Algorithms by ! * Sara Baase. ! * ! * A char jump is the number of characters one needs to jump based on ! * the value of the character from the text that was mismatched. ! */ ! static void ! computejumps(struct parse *p, struct re_guts *g) ! { ! int ch; ! int mindex; ! ! /* Avoid making errors worse */ ! if (p->error != 0) ! return; ! ! g->charjump = (int *)malloc((NC + 1) * sizeof (int)); ! if (g->charjump == NULL) /* Not a fatal error */ ! return; ! /* Adjust for signed chars, if necessary */ ! g->charjump = &g->charjump[-(CHAR_MIN)]; ! ! /* ! * If the character does not exist in the pattern, the jump ! * is equal to the number of characters in the pattern. ! */ ! for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) ! g->charjump[ch] = g->mlen; ! ! /* ! * If the character does exist, compute the jump that would ! * take us to the last character in the pattern equal to it ! * (notice that we match right to left, so that last character ! * is the first one that would be matched). ! */ ! for (mindex = 0; mindex < g->mlen; mindex++) ! g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; ! } ! ! /* ! * computematchjumps - compute match jumps for BM scan ! * ! * This algorithm assumes g->must exists and is has size greater than ! * zero. It's based on the algorithm found on Computer Algorithms by ! * Sara Baase. ! * ! * A match jump is the number of characters one needs to advance based ! * on the already-matched suffix. ! * Notice that all values here are minus (g->mlen-1), because of the way ! * the search algorithm works. ! */ ! static void ! computematchjumps(struct parse *p, struct re_guts *g) ! { ! int mindex; /* General "must" iterator */ ! int suffix; /* Keeps track of matching suffix */ ! int ssuffix; /* Keeps track of suffixes' suffix */ ! int *pmatches; ! /* ! * pmatches[k] points to the next i ! * such that i+1...mlen is a substring ! * of k+1...k+mlen-i-1 ! */ ! ! /* Avoid making errors worse */ ! if (p->error != 0) ! return; ! ! pmatches = (int *)malloc(g->mlen * sizeof (unsigned int)); ! if (pmatches == NULL) { ! g->matchjump = NULL; ! return; ! } ! ! g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int)); ! if (g->matchjump == NULL) { /* Not a fatal error */ ! free(pmatches); ! return; ! } ! ! /* Set maximum possible jump for each character in the pattern */ ! for (mindex = 0; mindex < g->mlen; mindex++) ! g->matchjump[mindex] = 2*g->mlen - mindex - 1; ! ! /* Compute pmatches[] */ ! for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; ! mindex--, suffix--) { ! pmatches[mindex] = suffix; ! ! /* ! * If a mismatch is found, interrupting the substring, ! * compute the matchjump for that position. If no ! * mismatch is found, then a text substring mismatched ! * against the suffix will also mismatch against the ! * substring. ! */ ! while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) { ! g->matchjump[suffix] = MIN(g->matchjump[suffix], ! g->mlen - mindex - 1); ! suffix = pmatches[suffix]; ! } ! } ! ! /* ! * Compute the matchjump up to the last substring found to jump ! * to the beginning of the largest must pattern prefix matching ! * it's own suffix. ! */ ! for (mindex = 0; mindex <= suffix; mindex++) ! g->matchjump[mindex] = MIN(g->matchjump[mindex], ! g->mlen + suffix - mindex); ! ! ssuffix = pmatches[suffix]; ! while (suffix < g->mlen) { ! while (suffix <= ssuffix && suffix < g->mlen) { ! g->matchjump[suffix] = MIN(g->matchjump[suffix], ! g->mlen + ssuffix - suffix); ! suffix++; ! } ! if (suffix < g->mlen) ! ssuffix = pmatches[ssuffix]; ! } ! ! free(pmatches); ! } ! ! /* ! * pluscount - count + nesting ! */ ! static sopno /* nesting depth */ ! pluscount(struct parse *p, struct re_guts *g) ! { ! sop *scan; ! sop s; ! sopno plusnest = 0; ! sopno maxnest = 0; ! ! if (p->error != 0) ! return (0); /* there may not be an OEND */ ! ! scan = g->strip + 1; ! do { ! s = *scan++; ! switch (OP(s)) { ! case OPLUS_: ! plusnest++; ! break; ! case O_PLUS: ! if (plusnest > maxnest) ! maxnest = plusnest; ! plusnest--; ! break; ! } ! } while (OP(s) != OEND); ! if (plusnest != 0) ! g->iflags |= BAD; ! return (maxnest); } --- 1,141 ---- /* ! * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> ! * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. + * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * ! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ! * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ! * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ ! /* ! tre_regcomp.c - TRE POSIX compatible regex compilation functions. ! */ ! #include <string.h> ! #include <errno.h> #include <stdlib.h> ! #include "tre.h" ! #include "tre-internal.h" ! int ! tre_regncomp_l(regex_t *preg, const char *regex, size_t n, int cflags, ! locale_t loc) { ! int ret; ! tre_char_t *wregex; ! size_t wlen; ! wregex = malloc(sizeof(tre_char_t) * (n + 1)); ! if (wregex == NULL) ! return REG_ESPACE; ! /* If the current locale uses the standard single byte encoding of ! characters, we don't do a multibyte string conversion. If we did, ! many applications which use the default locale would break since ! the default "C" locale uses the 7-bit ASCII character set, and ! all characters with the eighth bit set would be considered invalid. */ ! if (TRE_MB_CUR_MAX_L(loc) == 1) ! { ! unsigned int i; ! const unsigned char *str = (const unsigned char *)regex; ! tre_char_t *wstr = wregex; ! for (i = 0; i < n; i++) ! *(wstr++) = *(str++); ! wlen = n; } else ! { ! size_t consumed; ! tre_char_t *wcptr = wregex; ! mbstate_t state; ! memset(&state, '\0', sizeof(state)); ! while (n > 0) ! { ! consumed = tre_mbrtowc_l(wcptr, regex, n, &state, loc); ! switch (consumed) ! { ! case 0: ! if (*regex == '\0') ! consumed = 1; else ! { ! free(wregex); ! return REG_BADPAT; } break; ! case (size_t)-1: ! case (size_t)-2: ! DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno))); ! free(wregex); ! return REG_ILLSEQ; } ! regex += consumed; ! n -= consumed; ! wcptr++; } ! wlen = wcptr - wregex; } ! wregex[wlen] = L'\0'; ! ret = tre_compile(preg, wregex, wlen, cflags, loc); ! free(wregex); ! return ret; } ! int ! tre_regncomp(regex_t *preg, const char *regex, size_t n, int cflags) { ! return tre_regncomp_l(preg, regex, n, cflags, uselocale((locale_t)0)); } ! int ! tre_regcomp_l(regex_t *preg, const char *regex, int cflags, locale_t loc) { size_t len; ! if (cflags & REG_PEND) ! { ! if ((const char *)(preg->re_endp) < regex) ! return REG_INVARG; ! len = (const char *)(preg->re_endp) - regex; } else ! len = strlen(regex); ! return tre_regncomp_l(preg, regex, len, cflags, loc); } ! int ! regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD regex, ! int cflags) { ! return tre_regcomp_l(preg, regex, cflags, uselocale((locale_t)0)); } ! void ! regfree(regex_t *preg) { ! tre_free(preg); }