Print this page
8993 sync regcomp(3C) with upstream

*** 39,50 **** #include <sys/types.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include <limits.h> - #include <stdlib.h> #include <regex.h> #include <wchar.h> #include <wctype.h> #include "../locale/runetype.h" #include "../locale/collate.h" --- 39,51 ---- #include <sys/types.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include <limits.h> #include <regex.h> + #include <stdlib.h> + #include <stdbool.h> #include <wchar.h> #include <wctype.h> #include "../locale/runetype.h" #include "../locale/collate.h"
*** 54,63 **** --- 55,82 ---- #include "cname.h" #include "../locale/mblocal.h" /* + * Branching context, used to keep track of branch state for all of the branch- + * aware functions. In addition to keeping track of branch positions for the + * p_branch_* functions, we use this to simplify some clumsiness in BREs for + * detection of whether ^ is acting as an anchor or being used erroneously and + * also for whether we're in a sub-expression or not. + */ + struct branchc { + sopno start; + sopno back; + sopno fwd; + + int nbranch; + int nchain; + bool outer; + bool terminate; + }; + + /* * parse structure, passed up and down to avoid global variables and * other clumsinesses */ struct parse { const char *next; /* next character in RE */
*** 69,91 **** int ncsalloc; /* number of csets allocated */ struct re_guts *g; #define NPAREN 10 /* we need to remember () 1-9 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ sopno pend[NPAREN]; /* -> ) ([0] unused) */ }; /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === regcomp.c === */ ! static void p_ere(struct parse *p, int stop); ! static void p_ere_exp(struct parse *p); static void p_str(struct parse *p); ! static void p_bre(struct parse *p, int end1, int end2); ! static int p_simp_re(struct parse *p, int starordinary); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void p_b_term(struct parse *p, cset *cs); static void p_b_cclass(struct parse *p, cset *cs); static void p_b_eclass(struct parse *p, cset *cs); --- 88,121 ---- int ncsalloc; /* number of csets allocated */ struct re_guts *g; #define NPAREN 10 /* we need to remember () 1-9 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ sopno pend[NPAREN]; /* -> ) ([0] unused) */ + bool allowbranch; /* can this expression branch? */ + bool bre; /* convenience; is this a BRE? */ + bool (*parse_expr)(struct parse *, struct branchc *); + void (*pre_parse)(struct parse *, struct branchc *); + void (*post_parse)(struct parse *, struct branchc *); }; /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === regcomp.c === */ ! static bool p_ere_exp(struct parse *p, struct branchc *bc); static void p_str(struct parse *p); ! static int p_branch_eat_delim(struct parse *p, struct branchc *bc); ! static void p_branch_ins_offset(struct parse *p, struct branchc *bc); ! static void p_branch_fix_tail(struct parse *p, struct branchc *bc); ! static bool p_branch_empty(struct parse *p, struct branchc *bc); ! static bool p_branch_do(struct parse *p, struct branchc *bc); ! static void p_bre_pre_parse(struct parse *p, struct branchc *bc); ! static void p_bre_post_parse(struct parse *p, struct branchc *bc); ! static void p_re(struct parse *p, int end1, int end2); ! static bool p_simp_re(struct parse *p, struct branchc *bc); static int p_count(struct parse *p); static void p_bracket(struct parse *p); static void p_b_term(struct parse *p, cset *cs); static void p_b_cclass(struct parse *p, cset *cs); static void p_b_eclass(struct parse *p, cset *cs);
*** 131,140 **** --- 161,171 ---- #define PEEK2() (*(p->next+1)) #define MORE() (p->next < p->end) #define MORE2() (p->next+1 < p->end) #define SEE(c) (MORE() && PEEK() == (c)) #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) + #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) #define NEXT() (p->next++) #define NEXT2() (p->next += 2) #define NEXTn(n) (p->next += (n))
*** 227,236 **** --- 258,280 ---- p->ncsalloc = 0; for (i = 0; i < NPAREN; i++) { p->pbegin[i] = 0; p->pend[i] = 0; } + if (cflags & REG_EXTENDED) { + p->allowbranch = true; + p->bre = false; + p->parse_expr = p_ere_exp; + p->pre_parse = NULL; + p->post_parse = NULL; + } else { + p->allowbranch = false; + p->bre = true; + p->parse_expr = p_simp_re; + p->pre_parse = p_bre_pre_parse; + p->post_parse = p_bre_post_parse; + } g->sets = NULL; g->ncsets = 0; g->cflags = cflags; g->iflags = 0; g->nbol = 0;
*** 244,259 **** g->backrefs = 0; /* do it */ EMIT(OEND, 0); g->firststate = THERE(); ! if (cflags&REG_EXTENDED) ! p_ere(p, OUT); ! else if (cflags&REG_NOSPEC) p_str(p); else ! p_bre(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); /* tidy up loose ends and fill things in */ stripsnug(p, g); --- 288,301 ---- g->backrefs = 0; /* do it */ EMIT(OEND, 0); g->firststate = THERE(); ! if (cflags & REG_NOSPEC) p_str(p); else ! p_re(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); /* tidy up loose ends and fill things in */ stripsnug(p, g);
*** 286,353 **** regfree(preg); return (p->error); } /* ! * p_ere - ERE parser top level, concatenation and alternation */ ! static void ! p_ere(struct parse *p, ! int stop) /* character this ERE should end at */ { char c; - sopno prevback; - sopno prevfwd; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - /* require nonempty */ - (void) REQUIRE(HERE() != conc, REG_BADPAT); - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); - } - - /* - * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - */ - static void - p_ere_exp(struct parse *p) - { - char c; wint_t wc; sopno pos; int count; int count2; sopno subno; int wascaret = 0; assert(MORE()); /* caller should have ensured this */ c = GETNEXT(); pos = HERE(); switch (c) { --- 328,352 ---- regfree(preg); return (p->error); } /* ! * Parse one subERE, an atom possibly followed by a repetition op, ! * return whether we should terminate or not. */ ! static bool ! p_ere_exp(struct parse *p, struct branchc *bc) { char c; wint_t wc; sopno pos; int count; int count2; sopno subno; int wascaret = 0; + (void) bc; assert(MORE()); /* caller should have ensured this */ c = GETNEXT(); pos = HERE(); switch (c) {
*** 357,367 **** subno = p->g->nsub; if (subno < NPAREN) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) ! p_ere(p, ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); } EMIT(ORPAREN, subno); --- 356,366 ---- subno = p->g->nsub; if (subno < NPAREN) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) ! p_re(p, ')', IGN); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); } EMIT(ORPAREN, subno);
*** 423,445 **** break; } break; default: if (p->error != 0) ! return; p->next--; wc = WGETNEXT(); ordinary(p, wc); break; } if (!MORE()) ! return; c = PEEK(); /* we call { a repetition if followed by a digit */ if (!(c == '*' || c == '+' || c == '?' || c == '{')) ! return; /* no repetition, we're done */ else if (c == '{') (void) REQUIRE(MORE2() && \ (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); NEXT(); --- 422,444 ---- break; } break; default: if (p->error != 0) ! return (false); p->next--; wc = WGETNEXT(); ordinary(p, wc); break; } if (!MORE()) ! return (false); c = PEEK(); /* we call { a repetition if followed by a digit */ if (!(c == '*' || c == '+' || c == '?' || c == '{')) ! return (false); /* no repetition, we're done */ else if (c == '{') (void) REQUIRE(MORE2() && \ (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); NEXT();
*** 484,499 **** } break; } if (!MORE()) ! return; c = PEEK(); if (!(c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())))) ! return; SETERROR(REG_BADRPT); } /* * p_str - string (no metacharacters) "parser" */ --- 483,499 ---- } break; } if (!MORE()) ! return (false); c = PEEK(); if (!(c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())))) ! return (false); SETERROR(REG_BADRPT); + return (false); } /* * p_str - string (no metacharacters) "parser" */
*** 504,554 **** while (MORE()) ordinary(p, WGETNEXT()); } /* ! * p_bre - BRE parser top level, anchoring and concatenation ! * Giving end1 as OUT essentially eliminates the end1/end2 check. ! * ! * This implementation is a bit of a kludge, in that a trailing $ is first ! * taken as an ordinary character and then revised to be an anchor. ! * The amount of lookahead needed to avoid this kludge is excessive. */ static void ! p_bre(struct parse *p, ! int end1, /* first terminating character */ ! int end2) /* second terminating character */ { ! sopno start = HERE(); ! int first = 1; /* first subexpression? */ ! int wasdollar = 0; if (EAT('^')) { EMIT(OBOL, 0); p->g->iflags |= USEBOL; p->g->nbol++; } ! while (MORE() && !SEETWO(end1, end2)) { ! wasdollar = p_simp_re(p, first); ! first = 0; ! } ! if (wasdollar) { /* oops, that was a trailing anchor */ DROP(1); EMIT(OEOL, 0); p->g->iflags |= USEEOL; p->g->neol++; } ! (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */ } /* * p_simp_re - parse a simple RE, an atom possibly followed by a repetition */ ! static int /* was the simple RE an unbackslashed $? */ ! p_simp_re(struct parse *p, ! int starordinary) /* is a leading * an ordinary character? */ { int c; int count; int count2; sopno pos; --- 504,688 ---- while (MORE()) ordinary(p, WGETNEXT()); } /* ! * Eat consecutive branch delimiters for the kind of expression that we are ! * parsing, return the number of delimiters that we ate. */ + static int + p_branch_eat_delim(struct parse *p, struct branchc *bc) + { + int nskip; + + (void) bc; + nskip = 0; + while (EAT('|')) + ++nskip; + return (nskip); + } + + /* + * Insert necessary branch book-keeping operations. This emits a + * bogus 'next' offset, since we still have more to parse + */ static void ! p_branch_ins_offset(struct parse *p, struct branchc *bc) { ! if (bc->nbranch == 0) { ! INSERT(OCH_, bc->start); /* offset is wrong */ ! bc->fwd = bc->start; ! bc->back = bc->start; ! } + ASTERN(OOR1, bc->back); + bc->back = THERE(); + AHEAD(bc->fwd); /* fix previous offset */ + bc->fwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + ++bc->nbranch; + } + + /* + * Fix the offset of the tail branch, if we actually had any branches. + * This is to correct the bogus placeholder offset that we use. + */ + static void + p_branch_fix_tail(struct parse *p, struct branchc *bc) + { + /* Fix bogus offset at the tail if we actually have branches */ + if (bc->nbranch > 0) { + AHEAD(bc->fwd); + ASTERN(O_CH, bc->back); + } + } + + /* + * Signal to the parser that an empty branch has been encountered; this will, + * in the future, be used to allow for more permissive behavior with empty + * branches. The return value should indicate whether parsing may continue + * or not. + */ + static bool + p_branch_empty(struct parse *p, struct branchc *bc) + { + (void) bc; + SETERROR(REG_BADPAT); + return (false); + } + + /* + * Take care of any branching requirements. This includes inserting the + * appropriate branching instructions as well as eating all of the branch + * delimiters until we either run out of pattern or need to parse more pattern. + */ + static bool + p_branch_do(struct parse *p, struct branchc *bc) + { + int ate = 0; + + ate = p_branch_eat_delim(p, bc); + if (ate == 0) + return (false); + else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) + /* + * Halt parsing only if we have an empty branch and + * p_branch_empty indicates that we must not continue. + * In the future, this will not necessarily be an error. + */ + return (false); + p_branch_ins_offset(p, bc); + + return (true); + } + + static void + p_bre_pre_parse(struct parse *p, struct branchc *bc) + { + (void) bc; + /* + * Does not move cleanly into expression parser because of + * ordinary interpration of * at the beginning position of + * an expression. + */ if (EAT('^')) { EMIT(OBOL, 0); p->g->iflags |= USEBOL; p->g->nbol++; } ! } ! ! static void ! p_bre_post_parse(struct parse *p, struct branchc *bc) ! { ! /* Expression is terminating due to EOL token */ ! if (bc->terminate) { DROP(1); EMIT(OEOL, 0); p->g->iflags |= USEEOL; p->g->neol++; } + } ! /* ! * Top level parser, concatenation and BRE anchoring. ! * Giving end1 as OUT essentially eliminates the end1/end2 check. ! * ! * This implementation is a bit of a kludge, in that a trailing $ is first ! * taken as an ordinary character and then revised to be an anchor. ! * The amount of lookahead needed to avoid this kludge is excessive. ! */ ! static void ! p_re(struct parse *p, ! int end1, /* first terminating character */ ! int end2) /* second terminating character; ignored for EREs */ ! { ! struct branchc bc; ! ! bc.nbranch = 0; ! if (end1 == OUT && end2 == OUT) ! bc.outer = true; ! else ! bc.outer = false; ! #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) ! for (;;) { ! bc.start = HERE(); ! bc.nchain = 0; ! bc.terminate = false; ! if (p->pre_parse != NULL) ! p->pre_parse(p, &bc); ! while (MORE() && (!p->allowbranch || !SEESPEC('|')) && ! !SEEEND()) { ! bc.terminate = p->parse_expr(p, &bc); ! ++bc.nchain; ! } ! if (p->post_parse != NULL) ! p->post_parse(p, &bc); ! (void) REQUIRE(HERE() != bc.start, REG_BADPAT); ! if (!p->allowbranch) ! break; ! /* ! * p_branch_do's return value indicates whether we should ! * continue parsing or not. This is both for correctness and ! * a slight optimization, because it will check if we've ! * encountered an empty branch or the end of the string ! * immediately following a branch delimiter. ! */ ! if (!p_branch_do(p, &bc)) ! break; ! } ! #undef SEE_END ! if (p->allowbranch) ! p_branch_fix_tail(p, &bc); ! assert(!MORE() || SEE(end1)); } /* * p_simp_re - parse a simple RE, an atom possibly followed by a repetition */ ! static bool /* was the simple RE an unbackslashed $? */ ! p_simp_re(struct parse *p, struct branchc *bc) { int c; int count; int count2; sopno pos;
*** 590,600 **** if (subno < NPAREN) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); /* the MORE here is an error heuristic */ if (MORE() && !SEETWO('\\', ')')) ! p_bre(p, '\\', ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); } EMIT(ORPAREN, subno); --- 724,734 ---- if (subno < NPAREN) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); /* the MORE here is an error heuristic */ if (MORE() && !SEETWO('\\', ')')) ! p_re(p, '\\', ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); } EMIT(ORPAREN, subno);
*** 625,639 **** } else SETERROR(REG_ESUBREG); p->g->backrefs = 1; break; case '*': ! (void) REQUIRE(starordinary, REG_BADRPT); /* FALLTHROUGH */ default: if (p->error != 0) ! return (0); /* Definitely not $... */ p->next--; wc = WGETNEXT(); ordinary(p, wc); break; } --- 759,778 ---- } else SETERROR(REG_ESUBREG); p->g->backrefs = 1; break; case '*': ! /* ! * Ordinary if used as the first character beyond BOL anchor of ! * a (sub-)expression, counts as a bad repetition operator if it ! * appears otherwise. ! */ ! (void) REQUIRE(bc->nchain == 0, REG_BADRPT); /* FALLTHROUGH */ default: if (p->error != 0) ! return (false); /* Definitely not $... */ p->next--; wc = WGETNEXT(); ordinary(p, wc); break; }
*** 660,672 **** NEXT(); (void) REQUIRE(MORE(), REG_EBRACE); SETERROR(REG_BADBR); } } else if (c == '$') /* $ (but not \$) ends it */ ! return (1); ! return (0); } /* * p_count - parse a repetition count */ --- 799,811 ---- NEXT(); (void) REQUIRE(MORE(), REG_EBRACE); SETERROR(REG_BADBR); } } else if (c == '$') /* $ (but not \$) ends it */ ! return (true); ! return (false); } /* * p_count - parse a repetition count */
*** 891,901 **** SETERROR(REG_EBRACK); return (0); } len = p->next - sp; for (cp = cnames; cp->name != NULL; cp++) ! if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') return (cp->code); /* known name */ (void) memset(&mbs, 0, sizeof (mbs)); if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) return (wc); /* single character */ else if (clen == (size_t)-1 || clen == (size_t)-2) --- 1030,1040 ---- SETERROR(REG_EBRACK); return (0); } len = p->next - sp; for (cp = cnames; cp->name != NULL; cp++) ! if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) return (cp->code); /* known name */ (void) memset(&mbs, 0, sizeof (mbs)); if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) return (wc); /* single character */ else if (clen == (size_t)-1 || clen == (size_t)-2)
*** 1434,1458 **** scan--; do { scan += OPND(s); s = *scan; /* assert() interferes w debug printouts */ ! if (OP(s) != O_QUEST && OP(s) != O_CH && ! OP(s) != OOR2) { g->iflags |= BAD; return; } ! } while (OP(s) != O_QUEST && OP(s) != O_CH); /* FALLTHROUGH */ case OBOW: /* things that break a sequence */ case OEOW: case OBOL: case OEOL: case O_QUEST: case O_CH: case OEND: ! if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen; --- 1573,1597 ---- scan--; do { scan += OPND(s); s = *scan; /* assert() interferes w debug printouts */ ! if (OP(s) != (sop)O_QUEST && ! OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) { g->iflags |= BAD; return; } ! } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); /* FALLTHROUGH */ case OBOW: /* things that break a sequence */ case OEOW: case OBOL: case OEOL: case O_QUEST: case O_CH: case OEND: ! if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen;
*** 1463,1473 **** offset += newlen; } newlen = 0; break; case OANY: ! if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen; --- 1602,1612 ---- offset += newlen; } newlen = 0; break; case OANY: ! if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen;
*** 1481,1491 **** offset++; newlen = 0; break; case OANYOF: /* may or may not invalidate offset */ /* First, everything as OANY */ ! if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen; --- 1620,1630 ---- offset++; newlen = 0; break; case OANYOF: /* may or may not invalidate offset */ /* First, everything as OANY */ ! if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen;
*** 1505,1515 **** * Anything here makes it impossible or too hard * to calculate the offset -- so we give up; * save the last known good offset, in case the * must sequence doesn't occur later. */ ! if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) g->moffset += offset; else --- 1644,1654 ---- * Anything here makes it impossible or too hard * to calculate the offset -- so we give up; * save the last known good offset, in case the * must sequence doesn't occur later. */ ! if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) g->moffset += offset; else
*** 1565,1575 **** return (-1); largest = 0; try = 0; s = *scan++; ! while (OP(s) != O_QUEST && OP(s) != O_CH) { switch (OP(s)) { case OOR1: if (try > largest) largest = try; try = 0; --- 1704,1714 ---- return (-1); largest = 0; try = 0; s = *scan++; ! while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) { switch (OP(s)) { case OOR1: if (try > largest) largest = try; try = 0;
*** 1581,1594 **** return (-1); scan--; do { scan += OPND(s); s = *scan; ! if (OP(s) != O_QUEST && OP(s) != O_CH && ! OP(s) != OOR2) return (-1); ! } while (OP(s) != O_QUEST && OP(s) != O_CH); /* * We must skip to the next position, or we'll * leave altoffset() too early. */ scan++; --- 1720,1733 ---- return (-1); scan--; do { scan += OPND(s); s = *scan; ! if (OP(s) != (sop)O_QUEST && ! OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) return (-1); ! } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH); /* * We must skip to the next position, or we'll * leave altoffset() too early. */ scan++;