Print this page
8993 sync regcomp(3C) with upstream

@@ -39,12 +39,13 @@
 #include <sys/types.h>
 #include <stdio.h>
 #include <string.h>
 #include <ctype.h>
 #include <limits.h>
-#include <stdlib.h>
 #include <regex.h>
+#include <stdlib.h>
+#include <stdbool.h>
 #include <wchar.h>
 #include <wctype.h>
 
 #include "../locale/runetype.h"
 #include "../locale/collate.h"

@@ -54,10 +55,28 @@
 
 #include "cname.h"
 #include "../locale/mblocal.h"
 
 /*
+ * Branching context, used to keep track of branch state for all of the branch-
+ * aware functions. In addition to keeping track of branch positions for the
+ * p_branch_* functions, we use this to simplify some clumsiness in BREs for
+ * detection of whether ^ is acting as an anchor or being used erroneously and
+ * also for whether we're in a sub-expression or not.
+ */
+struct branchc {
+        sopno start;
+        sopno back;
+        sopno fwd;
+
+        int nbranch;
+        int nchain;
+        bool outer;
+        bool terminate;
+};
+
+/*
  * parse structure, passed up and down to avoid global variables and
  * other clumsinesses
  */
 struct parse {
         const char *next;       /* next character in RE */

@@ -69,23 +88,34 @@
         int ncsalloc;           /* number of csets allocated */
         struct re_guts *g;
 #define NPAREN  10              /* we need to remember () 1-9 for back refs */
         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
         sopno pend[NPAREN];     /* -> ) ([0] unused) */
+        bool allowbranch;       /* can this expression branch? */
+        bool bre;               /* convenience; is this a BRE? */
+        bool (*parse_expr)(struct parse *, struct branchc *);
+        void (*pre_parse)(struct parse *, struct branchc *);
+        void (*post_parse)(struct parse *, struct branchc *);
 };
 
 /* ========= begin header generated by ./mkh ========= */
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /* === regcomp.c === */
-static void p_ere(struct parse *p, int stop);
-static void p_ere_exp(struct parse *p);
+static bool p_ere_exp(struct parse *p, struct branchc *bc);
 static void p_str(struct parse *p);
-static void p_bre(struct parse *p, int end1, int end2);
-static int p_simp_re(struct parse *p, int starordinary);
+static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
+static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
+static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
+static bool p_branch_empty(struct parse *p, struct branchc *bc);
+static bool p_branch_do(struct parse *p, struct branchc *bc);
+static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
+static void p_bre_post_parse(struct parse *p, struct branchc *bc);
+static void p_re(struct parse *p, int end1, int end2);
+static bool p_simp_re(struct parse *p, struct branchc *bc);
 static int p_count(struct parse *p);
 static void p_bracket(struct parse *p);
 static void p_b_term(struct parse *p, cset *cs);
 static void p_b_cclass(struct parse *p, cset *cs);
 static void p_b_eclass(struct parse *p, cset *cs);

@@ -131,10 +161,11 @@
 #define PEEK2() (*(p->next+1))
 #define MORE()  (p->next < p->end)
 #define MORE2() (p->next+1 < p->end)
 #define SEE(c)  (MORE() && PEEK() == (c))
 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define SEESPEC(a)      (p->bre ? SEETWO('\\', a) : SEE(a))
 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 #define NEXT()  (p->next++)
 #define NEXT2() (p->next += 2)
 #define NEXTn(n)        (p->next += (n))

@@ -227,10 +258,23 @@
         p->ncsalloc = 0;
         for (i = 0; i < NPAREN; i++) {
                 p->pbegin[i] = 0;
                 p->pend[i] = 0;
         }
+        if (cflags & REG_EXTENDED) {
+                p->allowbranch = true;
+                p->bre = false;
+                p->parse_expr = p_ere_exp;
+                p->pre_parse = NULL;
+                p->post_parse = NULL;
+        } else {
+                p->allowbranch = false;
+                p->bre = true;
+                p->parse_expr = p_simp_re;
+                p->pre_parse = p_bre_pre_parse;
+                p->post_parse = p_bre_post_parse;
+        }
         g->sets = NULL;
         g->ncsets = 0;
         g->cflags = cflags;
         g->iflags = 0;
         g->nbol = 0;

@@ -244,16 +288,14 @@
         g->backrefs = 0;
 
         /* do it */
         EMIT(OEND, 0);
         g->firststate = THERE();
-        if (cflags&REG_EXTENDED)
-                p_ere(p, OUT);
-        else if (cflags&REG_NOSPEC)
+        if (cflags & REG_NOSPEC)
                 p_str(p);
         else
-                p_bre(p, OUT, OUT);
+                p_re(p, OUT, OUT);
         EMIT(OEND, 0);
         g->laststate = THERE();
 
         /* tidy up loose ends and fill things in */
         stripsnug(p, g);

@@ -286,68 +328,25 @@
                 regfree(preg);
         return (p->error);
 }
 
 /*
- * p_ere - ERE parser top level, concatenation and alternation
+ * Parse one subERE, an atom possibly followed by a repetition op,
+ * return whether we should terminate or not.
  */
-static void
-p_ere(struct parse *p,
-    int stop)           /* character this ERE should end at */
+static bool
+p_ere_exp(struct parse *p, struct branchc *bc)
 {
         char c;
-        sopno prevback;
-        sopno prevfwd;
-        sopno conc;
-        int first = 1;          /* is this the first alternative? */
-
-        for (;;) {
-                /* do a bunch of concatenated expressions */
-                conc = HERE();
-                while (MORE() && (c = PEEK()) != '|' && c != stop)
-                        p_ere_exp(p);
-                /* require nonempty */
-                (void) REQUIRE(HERE() != conc, REG_BADPAT);
-
-                if (!EAT('|'))
-                        break;          /* NOTE BREAK OUT */
-
-                if (first) {
-                        INSERT(OCH_, conc);     /* offset is wrong */
-                        prevfwd = conc;
-                        prevback = conc;
-                        first = 0;
-                }
-                ASTERN(OOR1, prevback);
-                prevback = THERE();
-                AHEAD(prevfwd);                 /* fix previous offset */
-                prevfwd = HERE();
-                EMIT(OOR2, 0);                  /* offset is very wrong */
-        }
-
-        if (!first) {           /* tail-end fixups */
-                AHEAD(prevfwd);
-                ASTERN(O_CH, prevback);
-        }
-
-        assert(!MORE() || SEE(stop));
-}
-
-/*
- * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
- */
-static void
-p_ere_exp(struct parse *p)
-{
-        char c;
         wint_t wc;
         sopno pos;
         int count;
         int count2;
         sopno subno;
         int wascaret = 0;
 
+        (void) bc;
         assert(MORE());         /* caller should have ensured this */
         c = GETNEXT();
 
         pos = HERE();
         switch (c) {

@@ -357,11 +356,11 @@
                 subno = p->g->nsub;
                 if (subno < NPAREN)
                         p->pbegin[subno] = HERE();
                 EMIT(OLPAREN, subno);
                 if (!SEE(')'))
-                        p_ere(p, ')');
+                        p_re(p, ')', IGN);
                 if (subno < NPAREN) {
                         p->pend[subno] = HERE();
                         assert(p->pend[subno] != 0);
                 }
                 EMIT(ORPAREN, subno);

@@ -423,23 +422,23 @@
                         break;
                 }
                 break;
         default:
                 if (p->error != 0)
-                        return;
+                        return (false);
                 p->next--;
                 wc = WGETNEXT();
                 ordinary(p, wc);
                 break;
         }
 
         if (!MORE())
-                return;
+                return (false);
         c = PEEK();
         /* we call { a repetition if followed by a digit */
         if (!(c == '*' || c == '+' || c == '?' || c == '{'))
-                return;         /* no repetition, we're done */
+                return (false);         /* no repetition, we're done */
         else if (c == '{')
                 (void) REQUIRE(MORE2() && \
                     (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
         NEXT();
 

@@ -484,16 +483,17 @@
                 }
                 break;
         }
 
         if (!MORE())
-                return;
+                return (false);
         c = PEEK();
         if (!(c == '*' || c == '+' || c == '?' ||
             (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
-                return;
+                return (false);
         SETERROR(REG_BADRPT);
+        return (false);
 }
 
 /*
  * p_str - string (no metacharacters) "parser"
  */

@@ -504,51 +504,185 @@
         while (MORE())
                 ordinary(p, WGETNEXT());
 }
 
 /*
- * p_bre - BRE parser top level, anchoring and concatenation
- * Giving end1 as OUT essentially eliminates the end1/end2 check.
- *
- * This implementation is a bit of a kludge, in that a trailing $ is first
- * taken as an ordinary character and then revised to be an anchor.
- * The amount of lookahead needed to avoid this kludge is excessive.
+ * Eat consecutive branch delimiters for the kind of expression that we are
+ * parsing, return the number of delimiters that we ate.
  */
+static int
+p_branch_eat_delim(struct parse *p, struct branchc *bc)
+{
+        int nskip;
+
+        (void) bc;
+        nskip = 0;
+        while (EAT('|'))
+                ++nskip;
+        return (nskip);
+}
+
+/*
+ * Insert necessary branch book-keeping operations. This emits a
+ * bogus 'next' offset, since we still have more to parse
+ */
 static void
-p_bre(struct parse *p,
-    int end1,           /* first terminating character */
-    int end2)           /* second terminating character */
+p_branch_ins_offset(struct parse *p, struct branchc *bc)
 {
-        sopno start = HERE();
-        int first = 1;                  /* first subexpression? */
-        int wasdollar = 0;
+        if (bc->nbranch == 0) {
+                INSERT(OCH_, bc->start);        /* offset is wrong */
+                bc->fwd = bc->start;
+                bc->back = bc->start;
+        }
 
+        ASTERN(OOR1, bc->back);
+        bc->back = THERE();
+        AHEAD(bc->fwd);                 /* fix previous offset */
+        bc->fwd = HERE();
+        EMIT(OOR2, 0);                  /* offset is very wrong */
+        ++bc->nbranch;
+}
+
+/*
+ * Fix the offset of the tail branch, if we actually had any branches.
+ * This is to correct the bogus placeholder offset that we use.
+ */
+static void
+p_branch_fix_tail(struct parse *p, struct branchc *bc)
+{
+        /* Fix bogus offset at the tail if we actually have branches */
+        if (bc->nbranch > 0) {
+                AHEAD(bc->fwd);
+                ASTERN(O_CH, bc->back);
+        }
+}
+
+/*
+ * Signal to the parser that an empty branch has been encountered; this will,
+ * in the future, be used to allow for more permissive behavior with empty
+ * branches. The return value should indicate whether parsing may continue
+ * or not.
+ */
+static bool
+p_branch_empty(struct parse *p, struct branchc *bc)
+{
+        (void) bc;
+        SETERROR(REG_BADPAT);
+        return (false);
+}
+
+/*
+ * Take care of any branching requirements. This includes inserting the
+ * appropriate branching instructions as well as eating all of the branch
+ * delimiters until we either run out of pattern or need to parse more pattern.
+ */
+static bool
+p_branch_do(struct parse *p, struct branchc *bc)
+{
+        int ate = 0;
+
+        ate = p_branch_eat_delim(p, bc);
+        if (ate == 0)
+                return (false);
+        else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
+                /*
+                 * Halt parsing only if we have an empty branch and
+                 * p_branch_empty indicates that we must not continue.
+                 * In the future, this will not  necessarily be an error.
+                 */
+                return (false);
+        p_branch_ins_offset(p, bc);
+
+        return (true);
+}
+
+static void
+p_bre_pre_parse(struct parse *p, struct branchc *bc)
+{
+        (void) bc;
+        /*
+         * Does not move cleanly into expression parser because of
+         * ordinary interpration of * at the beginning position of
+         * an expression.
+         */
         if (EAT('^')) {
                 EMIT(OBOL, 0);
                 p->g->iflags |= USEBOL;
                 p->g->nbol++;
         }
-        while (MORE() && !SEETWO(end1, end2)) {
-                wasdollar = p_simp_re(p, first);
-                first = 0;
-        }
-        if (wasdollar) {        /* oops, that was a trailing anchor */
+}
+
+static void
+p_bre_post_parse(struct parse *p, struct branchc *bc)
+{
+        /* Expression is terminating due to EOL token */
+        if (bc->terminate) {
                 DROP(1);
                 EMIT(OEOL, 0);
                 p->g->iflags |= USEEOL;
                 p->g->neol++;
         }
+}
 
-        (void) REQUIRE(HERE() != start, REG_BADPAT);    /* require nonempty */
+/*
+ * Top level parser, concatenation and BRE anchoring.
+ * Giving end1 as OUT essentially eliminates the end1/end2 check.
+ *
+ * This implementation is a bit of a kludge, in that a trailing $ is first
+ * taken as an ordinary character and then revised to be an anchor.
+ * The amount of lookahead needed to avoid this kludge is excessive.
+ */
+static void
+p_re(struct parse *p,
+    int end1,   /* first terminating character */
+    int end2)   /* second terminating character; ignored for EREs */
+{
+        struct branchc bc;
+
+        bc.nbranch = 0;
+        if (end1 == OUT && end2 == OUT)
+                bc.outer = true;
+        else
+                bc.outer = false;
+#define SEEEND()        (!p->bre ? SEE(end1) : SEETWO(end1, end2))
+        for (;;) {
+                bc.start = HERE();
+                bc.nchain = 0;
+                bc.terminate = false;
+                if (p->pre_parse != NULL)
+                        p->pre_parse(p, &bc);
+                while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
+                    !SEEEND()) {
+                        bc.terminate = p->parse_expr(p, &bc);
+                        ++bc.nchain;
+                }
+                if (p->post_parse != NULL)
+                        p->post_parse(p, &bc);
+                (void) REQUIRE(HERE() != bc.start, REG_BADPAT);
+                if (!p->allowbranch)
+                        break;
+                /*
+                 * p_branch_do's return value indicates whether we should
+                 * continue parsing or not. This is both for correctness and
+                 * a slight optimization, because it will check if we've
+                 * encountered an empty branch or the end of the string
+                 * immediately following a branch delimiter.
+                 */
+                if (!p_branch_do(p, &bc))
+                        break;
+        }
+#undef SEE_END
+        if (p->allowbranch)
+                p_branch_fix_tail(p, &bc);
+        assert(!MORE() || SEE(end1));
 }
 
 /*
  * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
  */
-static int                      /* was the simple RE an unbackslashed $? */
-p_simp_re(struct parse *p,
-    int starordinary)   /* is a leading * an ordinary character? */
+static bool                     /* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p, struct branchc *bc)
 {
         int c;
         int count;
         int count2;
         sopno pos;

@@ -590,11 +724,11 @@
                 if (subno < NPAREN)
                         p->pbegin[subno] = HERE();
                 EMIT(OLPAREN, subno);
                 /* the MORE here is an error heuristic */
                 if (MORE() && !SEETWO('\\', ')'))
-                        p_bre(p, '\\', ')');
+                        p_re(p, '\\', ')');
                 if (subno < NPAREN) {
                         p->pend[subno] = HERE();
                         assert(p->pend[subno] != 0);
                 }
                 EMIT(ORPAREN, subno);

@@ -625,15 +759,20 @@
                 } else
                         SETERROR(REG_ESUBREG);
                 p->g->backrefs = 1;
                 break;
         case '*':
-                (void) REQUIRE(starordinary, REG_BADRPT);
+                /*
+                 * Ordinary if used as the first character beyond BOL anchor of
+                 * a (sub-)expression, counts as a bad repetition operator if it
+                 * appears otherwise.
+                 */
+                (void) REQUIRE(bc->nchain == 0, REG_BADRPT);
                 /* FALLTHROUGH */
         default:
                 if (p->error != 0)
-                        return (0);     /* Definitely not $... */
+                        return (false); /* Definitely not $... */
                 p->next--;
                 wc = WGETNEXT();
                 ordinary(p, wc);
                 break;
         }

@@ -660,13 +799,13 @@
                                 NEXT();
                         (void) REQUIRE(MORE(), REG_EBRACE);
                         SETERROR(REG_BADBR);
                 }
         } else if (c == '$')    /* $ (but not \$) ends it */
-                return (1);
+                return (true);
 
-        return (0);
+        return (false);
 }
 
 /*
  * p_count - parse a repetition count
  */

@@ -891,11 +1030,11 @@
                 SETERROR(REG_EBRACK);
                 return (0);
         }
         len = p->next - sp;
         for (cp = cnames; cp->name != NULL; cp++)
-                if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+                if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
                         return (cp->code);      /* known name */
         (void) memset(&mbs, 0, sizeof (mbs));
         if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
                 return (wc);                    /* single character */
         else if (clen == (size_t)-1 || clen == (size_t)-2)

@@ -1434,25 +1573,25 @@
                         scan--;
                         do {
                                 scan += OPND(s);
                                 s = *scan;
                                 /* assert() interferes w debug printouts */
-                                if (OP(s) != O_QUEST && OP(s) != O_CH &&
-                                    OP(s) != OOR2) {
+                                if (OP(s) != (sop)O_QUEST &&
+                                    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
                                         g->iflags |= BAD;
                                         return;
                                 }
-                        } while (OP(s) != O_QUEST && OP(s) != O_CH);
+                        } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
                         /* FALLTHROUGH */
                 case OBOW:              /* things that break a sequence */
                 case OEOW:
                 case OBOL:
                 case OEOL:
                 case O_QUEST:
                 case O_CH:
                 case OEND:
-                        if (newlen > g->mlen) {         /* ends one */
+                        if (newlen > (sopno)g->mlen) {          /* ends one */
                                 start = newstart;
                                 g->mlen = newlen;
                                 if (offset > -1) {
                                         g->moffset += offset;
                                         offset = newlen;

@@ -1463,11 +1602,11 @@
                                         offset += newlen;
                         }
                         newlen = 0;
                         break;
                 case OANY:
-                        if (newlen > g->mlen) {         /* ends one */
+                        if (newlen > (sopno)g->mlen) {          /* ends one */
                                 start = newstart;
                                 g->mlen = newlen;
                                 if (offset > -1) {
                                         g->moffset += offset;
                                         offset = newlen;

@@ -1481,11 +1620,11 @@
                                 offset++;
                         newlen = 0;
                         break;
                 case OANYOF:            /* may or may not invalidate offset */
                         /* First, everything as OANY */
-                        if (newlen > g->mlen) {         /* ends one */
+                        if (newlen > (sopno)g->mlen) {          /* ends one */
                                 start = newstart;
                                 g->mlen = newlen;
                                 if (offset > -1) {
                                         g->moffset += offset;
                                         offset = newlen;

@@ -1505,11 +1644,11 @@
                          * Anything here makes it impossible or too hard
                          * to calculate the offset -- so we give up;
                          * save the last known good offset, in case the
                          * must sequence doesn't occur later.
                          */
-                        if (newlen > g->mlen) {         /* ends one */
+                        if (newlen > (sopno)g->mlen) {          /* ends one */
                                 start = newstart;
                                 g->mlen = newlen;
                                 if (offset > -1)
                                         g->moffset += offset;
                                 else

@@ -1565,11 +1704,11 @@
                 return (-1);
 
         largest = 0;
         try = 0;
         s = *scan++;
-        while (OP(s) != O_QUEST && OP(s) != O_CH) {
+        while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
                 switch (OP(s)) {
                 case OOR1:
                         if (try > largest)
                                 largest = try;
                         try = 0;

@@ -1581,14 +1720,14 @@
                                 return (-1);
                         scan--;
                         do {
                                 scan += OPND(s);
                                 s = *scan;
-                                if (OP(s) != O_QUEST && OP(s) != O_CH &&
-                                    OP(s) != OOR2)
+                                if (OP(s) != (sop)O_QUEST &&
+                                    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
                                         return (-1);
-                        } while (OP(s) != O_QUEST && OP(s) != O_CH);
+                        } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
                         /*
                          * We must skip to the next position, or we'll
                          * leave altoffset() too early.
                          */
                         scan++;