ilregex Wdiff usr/src/lib/libc/port/regex/regcomp.c

Print this page

8993 sync regcomp(3C) with upstream

Split	Close
Expand all
Collapse all

          --- old/usr/src/lib/libc/port/regex/regcomp.c
          +++ new/usr/src/lib/libc/port/regex/regcomp.c

   1    1  /*
   2    2   * Copyright 2013 Garrett D'Amore <garrett@damore.org>
   3    3   * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   4    4   * Copyright 2012 Milan Jurik. All rights reserved.
   5    5   * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   6    6   * Copyright (c) 1992, 1993, 1994
   7    7   *      The Regents of the University of California.  All rights reserved.
   8    8   *
   9    9   * This code is derived from software contributed to Berkeley by
  10   10   * Henry Spencer.
  11   11   *
  12   12   * Redistribution and use in source and binary forms, with or without
  13   13   * modification, are permitted provided that the following conditions
  14   14   * are met:
  15   15   * 1. Redistributions of source code must retain the above copyright
  16   16   *    notice, this list of conditions and the following disclaimer.
  17   17   * 2. Redistributions in binary form must reproduce the above copyright
  18   18   *    notice, this list of conditions and the following disclaimer in the
  19   19   *    documentation and/or other materials provided with the distribution.
  20   20   * 3. Neither the name of the University nor the names of its contributors
  21   21   *    may be used to endorse or promote products derived from this software
  22   22   *    without specific prior written permission.
  23   23   *
  24   24   * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25   25   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26   26   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27   27   * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28   28   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29   29   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30   30   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31   31   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32   32   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33   33   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

↓ open down ↓

33 lines elided

↑ open up ↑

  34   34   * SUCH DAMAGE.
  35   35   */
  36   36  
  37   37  #include "lint.h"
  38   38  #include "file64.h"
  39   39  #include <sys/types.h>
  40   40  #include <stdio.h>
  41   41  #include <string.h>
  42   42  #include <ctype.h>
  43   43  #include <limits.h>
  44      -#include <stdlib.h>
  45   44  #include <regex.h>
       45 +#include <stdlib.h>
       46 +#include <stdbool.h>
  46   47  #include <wchar.h>
  47   48  #include <wctype.h>
  48   49  
  49   50  #include "../locale/runetype.h"
  50   51  #include "../locale/collate.h"
  51   52  
  52   53  #include "utils.h"
  53   54  #include "regex2.h"
  54   55  
  55   56  #include "cname.h"
  56   57  #include "../locale/mblocal.h"
  57   58  
  58   59  /*
       60 + * Branching context, used to keep track of branch state for all of the branch-
       61 + * aware functions. In addition to keeping track of branch positions for the
       62 + * p_branch_* functions, we use this to simplify some clumsiness in BREs for
       63 + * detection of whether ^ is acting as an anchor or being used erroneously and
       64 + * also for whether we're in a sub-expression or not.
       65 + */
       66 +struct branchc {
       67 +        sopno start;
       68 +        sopno back;
       69 +        sopno fwd;
       70 +
       71 +        int nbranch;
       72 +        int nchain;
       73 +        bool outer;
       74 +        bool terminate;
       75 +};
       76 +
       77 +/*
  59   78   * parse structure, passed up and down to avoid global variables and
  60   79   * other clumsinesses
  61   80   */
  62   81  struct parse {
  63   82          const char *next;       /* next character in RE */
  64   83          const char *end;        /* end of string (-> NUL normally) */
  65   84          int error;              /* has an error been seen? */
  66   85          sop *strip;             /* malloced strip */
  67   86          sopno ssize;            /* malloced strip size (allocated) */
  68   87          sopno slen;             /* malloced strip length (used) */
  69   88          int ncsalloc;           /* number of csets allocated */
  70   89          struct re_guts *g;
  71   90  #define NPAREN  10              /* we need to remember () 1-9 for back refs */
  72   91          sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
  73   92          sopno pend[NPAREN];     /* -> ) ([0] unused) */
       93 +        bool allowbranch;       /* can this expression branch? */
       94 +        bool bre;               /* convenience; is this a BRE? */
       95 +        bool (*parse_expr)(struct parse *, struct branchc *);
       96 +        void (*pre_parse)(struct parse *, struct branchc *);
       97 +        void (*post_parse)(struct parse *, struct branchc *);
  74   98  };
  75   99  
  76  100  /* ========= begin header generated by ./mkh ========= */
  77  101  #ifdef __cplusplus
  78  102  extern "C" {
  79  103  #endif
  80  104  
  81  105  /* === regcomp.c === */
  82      -static void p_ere(struct parse *p, int stop);
  83      -static void p_ere_exp(struct parse *p);
      106 +static bool p_ere_exp(struct parse *p, struct branchc *bc);
  84  107  static void p_str(struct parse *p);
  85      -static void p_bre(struct parse *p, int end1, int end2);
  86      -static int p_simp_re(struct parse *p, int starordinary);
      108 +static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
      109 +static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
      110 +static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
      111 +static bool p_branch_empty(struct parse *p, struct branchc *bc);
      112 +static bool p_branch_do(struct parse *p, struct branchc *bc);
      113 +static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
      114 +static void p_bre_post_parse(struct parse *p, struct branchc *bc);
      115 +static void p_re(struct parse *p, int end1, int end2);
      116 +static bool p_simp_re(struct parse *p, struct branchc *bc);
  87  117  static int p_count(struct parse *p);
  88  118  static void p_bracket(struct parse *p);
  89  119  static void p_b_term(struct parse *p, cset *cs);
  90  120  static void p_b_cclass(struct parse *p, cset *cs);
  91  121  static void p_b_eclass(struct parse *p, cset *cs);
  92  122  static wint_t p_b_symbol(struct parse *p);
  93  123  static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
  94  124  static wint_t othercase(wint_t ch);
  95  125  static void bothcases(struct parse *p, wint_t ch);
  96  126  static void ordinary(struct parse *p, wint_t ch);

  97  127  static void nonnewline(struct parse *p);
  98  128  static void repeat(struct parse *p, sopno start, int from, int to);
  99  129  static int seterr(struct parse *p, int e);
 100  130  static cset *allocset(struct parse *p);
 101  131  static void freeset(struct parse *p, cset *cs);
 102  132  static void CHadd(struct parse *p, cset *cs, wint_t ch);
 103  133  static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
 104  134  static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
 105  135  static wint_t singleton(cset *cs);
 106  136  static sopno dupl(struct parse *p, sopno start, sopno finish);
 107  137  static void doemit(struct parse *p, sop op, size_t opnd);
 108  138  static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
 109  139  static void dofwd(struct parse *p, sopno pos, sop value);
 110  140  static int enlarge(struct parse *p, sopno size);
 111  141  static void stripsnug(struct parse *p, struct re_guts *g);
 112  142  static void findmust(struct parse *p, struct re_guts *g);
 113  143  static int altoffset(sop *scan, int offset);
 114  144  static void computejumps(struct parse *p, struct re_guts *g);
 115  145  static void computematchjumps(struct parse *p, struct re_guts *g);
 116  146  static sopno pluscount(struct parse *p, struct re_guts *g);
 117  147  static wint_t wgetnext(struct parse *p);
 118  148  
 119  149  #ifdef __cplusplus
 120  150  }
 121  151  #endif
 122  152  /* ========= end header generated by ./mkh ========= */
 123  153  
 124  154  static char nuls[10];           /* place to point scanner in event of error */
 125  155

↓ open down ↓

29 lines elided

↑ open up ↑

 126  156  /*
 127  157   * macros for use with parse structure
 128  158   * BEWARE:  these know that the parse structure is named `p' !!!
 129  159   */
 130  160  #define PEEK()  (*p->next)
 131  161  #define PEEK2() (*(p->next+1))
 132  162  #define MORE()  (p->next < p->end)
 133  163  #define MORE2() (p->next+1 < p->end)
 134  164  #define SEE(c)  (MORE() && PEEK() == (c))
 135  165  #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
      166 +#define SEESPEC(a)      (p->bre ? SEETWO('\\', a) : SEE(a))
 136  167  #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 137  168  #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 138  169  #define NEXT()  (p->next++)
 139  170  #define NEXT2() (p->next += 2)
 140  171  #define NEXTn(n)        (p->next += (n))
 141  172  #define GETNEXT()       (*p->next++)
 142  173  #define WGETNEXT()      wgetnext(p)
 143  174  #define SETERROR(e)     ((void)seterr(p, (e)))
 144  175  #define REQUIRE(co, e)  ((co) || seterr(p, e))
 145  176  #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))

 146  177  #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 147  178  #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 148  179  #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 149  180  #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 150  181  #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 151  182  #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 152  183  #define HERE()          (p->slen)
 153  184  #define THERE()         (p->slen - 1)
 154  185  #define THERETHERE()    (p->slen - 2)
 155  186  #define DROP(n) (p->slen -= (n))
 156  187  
 157  188  #ifndef NDEBUG
 158  189  static int never = 0;           /* for use in asserts; shuts lint up */
 159  190  #else
 160  191  #define never   0               /* some <assert.h>s have bugs too */
 161  192  #endif
 162  193  
 163  194  /*
 164  195   * regcomp - interface for parser and compilation
 165  196   */
 166  197  int                             /* 0 success, otherwise REG_something */
 167  198  regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
 168  199      int cflags)
 169  200  {
 170  201          struct parse pa;
 171  202          struct re_guts *g;
 172  203          struct parse *p = &pa;
 173  204          int i;
 174  205          size_t len;
 175  206          size_t maxlen;
 176  207  #ifdef REDEBUG
 177  208  #define GOODFLAGS(f)    (f)
 178  209  #else
 179  210  #define GOODFLAGS(f)    ((f)&~REG_DUMP)
 180  211  #endif
 181  212  
 182  213          /* We had REG_INVARG, but we don't have that on Solaris. */
 183  214          cflags = GOODFLAGS(cflags);
 184  215          if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 185  216                  return (REG_EFATAL);
 186  217  
 187  218          if (cflags&REG_PEND) {
 188  219                  if (preg->re_endp < pattern)
 189  220                          return (REG_EFATAL);
 190  221                  len = preg->re_endp - pattern;
 191  222          } else
 192  223                  len = strlen(pattern);
 193  224  
 194  225          /* do the mallocs early so failure handling is easy */
 195  226          g = (struct re_guts *)malloc(sizeof (struct re_guts));
 196  227          if (g == NULL)
 197  228                  return (REG_ESPACE);
 198  229          /*
 199  230           * Limit the pattern space to avoid a 32-bit overflow on buffer
 200  231           * extension.  Also avoid any signed overflow in case of conversion
 201  232           * so make the real limit based on a 31-bit overflow.
 202  233           *
 203  234           * Likely not applicable on 64-bit systems but handle the case
 204  235           * generically (who are we to stop people from using ~715MB+
 205  236           * patterns?).
 206  237           */
 207  238          maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
 208  239          if (len >= maxlen) {
 209  240                  free((char *)g);
 210  241                  return (REG_ESPACE);
 211  242          }
 212  243          p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 213  244          assert(p->ssize >= len);
 214  245  
 215  246          p->strip = (sop *)malloc(p->ssize * sizeof (sop));
 216  247          p->slen = 0;
 217  248          if (p->strip == NULL) {
 218  249                  free((char *)g);
 219  250                  return (REG_ESPACE);
 220  251          }
 221  252

↓ open down ↓

76 lines elided

↑ open up ↑

 222  253          /* set things up */
 223  254          p->g = g;
 224  255          p->next = pattern;      /* convenience; we do not modify it */
 225  256          p->end = p->next + len;
 226  257          p->error = 0;
 227  258          p->ncsalloc = 0;
 228  259          for (i = 0; i < NPAREN; i++) {
 229  260                  p->pbegin[i] = 0;
 230  261                  p->pend[i] = 0;
 231  262          }
      263 +        if (cflags & REG_EXTENDED) {
      264 +                p->allowbranch = true;
      265 +                p->bre = false;
      266 +                p->parse_expr = p_ere_exp;
      267 +                p->pre_parse = NULL;
      268 +                p->post_parse = NULL;
      269 +        } else {
      270 +                p->allowbranch = false;
      271 +                p->bre = true;
      272 +                p->parse_expr = p_simp_re;
      273 +                p->pre_parse = p_bre_pre_parse;
      274 +                p->post_parse = p_bre_post_parse;
      275 +        }
 232  276          g->sets = NULL;
 233  277          g->ncsets = 0;
 234  278          g->cflags = cflags;
 235  279          g->iflags = 0;
 236  280          g->nbol = 0;
 237  281          g->neol = 0;
 238  282          g->must = NULL;
 239  283          g->moffset = -1;
 240  284          g->charjump = NULL;
 241  285          g->matchjump = NULL;
 242  286          g->mlen = 0;
 243  287          g->nsub = 0;
 244  288          g->backrefs = 0;
 245  289  
 246  290          /* do it */
 247  291          EMIT(OEND, 0);
 248  292          g->firststate = THERE();
 249      -        if (cflags&REG_EXTENDED)
 250      -                p_ere(p, OUT);
 251      -        else if (cflags&REG_NOSPEC)
      293 +        if (cflags & REG_NOSPEC)
 252  294                  p_str(p);
 253  295          else
 254      -                p_bre(p, OUT, OUT);
      296 +                p_re(p, OUT, OUT);
 255  297          EMIT(OEND, 0);
 256  298          g->laststate = THERE();
 257  299  
 258  300          /* tidy up loose ends and fill things in */
 259  301          stripsnug(p, g);
 260  302          findmust(p, g);
 261  303          /*
 262  304           * only use Boyer-Moore algorithm if the pattern is bigger
 263  305           * than three characters
 264  306           */

 265  307          if (g->mlen > 3) {
 266  308                  computejumps(p, g);
 267  309                  computematchjumps(p, g);
 268  310                  if (g->matchjump == NULL && g->charjump != NULL) {
 269  311                          free(g->charjump);
 270  312                          g->charjump = NULL;
 271  313                  }
 272  314          }
 273  315          g->nplus = pluscount(p, g);
 274  316          g->magic = MAGIC2;
 275  317          preg->re_nsub = g->nsub;
 276  318          preg->re_g = g;
 277  319          preg->re_magic = MAGIC1;
 278  320  #ifndef REDEBUG
 279  321          /* not debugging, so can't rely on the assert() in regexec() */
 280  322          if (g->iflags&BAD)

↓ open down ↓

16 lines elided

↑ open up ↑

 281  323                  SETERROR(REG_EFATAL);
 282  324  #endif
 283  325  
 284  326          /* win or lose, we're done */
 285  327          if (p->error != 0)      /* lose */
 286  328                  regfree(preg);
 287  329          return (p->error);
 288  330  }
 289  331  
 290  332  /*
 291      - * p_ere - ERE parser top level, concatenation and alternation
      333 + * Parse one subERE, an atom possibly followed by a repetition op,
      334 + * return whether we should terminate or not.
 292  335   */
 293      -static void
 294      -p_ere(struct parse *p,
 295      -    int stop)           /* character this ERE should end at */
      336 +static bool
      337 +p_ere_exp(struct parse *p, struct branchc *bc)
 296  338  {
 297  339          char c;
 298      -        sopno prevback;
 299      -        sopno prevfwd;
 300      -        sopno conc;
 301      -        int first = 1;          /* is this the first alternative? */
 302      -
 303      -        for (;;) {
 304      -                /* do a bunch of concatenated expressions */
 305      -                conc = HERE();
 306      -                while (MORE() && (c = PEEK()) != '|' && c != stop)
 307      -                        p_ere_exp(p);
 308      -                /* require nonempty */
 309      -                (void) REQUIRE(HERE() != conc, REG_BADPAT);
 310      -
 311      -                if (!EAT('|'))
 312      -                        break;          /* NOTE BREAK OUT */
 313      -
 314      -                if (first) {
 315      -                        INSERT(OCH_, conc);     /* offset is wrong */
 316      -                        prevfwd = conc;
 317      -                        prevback = conc;
 318      -                        first = 0;
 319      -                }
 320      -                ASTERN(OOR1, prevback);
 321      -                prevback = THERE();
 322      -                AHEAD(prevfwd);                 /* fix previous offset */
 323      -                prevfwd = HERE();
 324      -                EMIT(OOR2, 0);                  /* offset is very wrong */
 325      -        }
 326      -
 327      -        if (!first) {           /* tail-end fixups */
 328      -                AHEAD(prevfwd);
 329      -                ASTERN(O_CH, prevback);
 330      -        }
 331      -
 332      -        assert(!MORE() || SEE(stop));
 333      -}
 334      -
 335      -/*
 336      - * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 337      - */
 338      -static void
 339      -p_ere_exp(struct parse *p)
 340      -{
 341      -        char c;
 342  340          wint_t wc;
 343  341          sopno pos;
 344  342          int count;
 345  343          int count2;
 346  344          sopno subno;
 347  345          int wascaret = 0;
 348  346  
      347 +        (void) bc;
 349  348          assert(MORE());         /* caller should have ensured this */
 350  349          c = GETNEXT();
 351  350  
 352  351          pos = HERE();
 353  352          switch (c) {
 354  353          case '(':
 355  354                  (void) REQUIRE(MORE(), REG_EPAREN);
 356  355                  p->g->nsub++;
 357  356                  subno = p->g->nsub;
 358  357                  if (subno < NPAREN)
 359  358                          p->pbegin[subno] = HERE();
 360  359                  EMIT(OLPAREN, subno);
 361  360                  if (!SEE(')'))
 362      -                        p_ere(p, ')');
      361 +                        p_re(p, ')', IGN);
 363  362                  if (subno < NPAREN) {
 364  363                          p->pend[subno] = HERE();
 365  364                          assert(p->pend[subno] != 0);
 366  365                  }
 367  366                  EMIT(ORPAREN, subno);
 368  367                  (void) MUSTEAT(')', REG_EPAREN);
 369  368                  break;
 370  369  #ifndef POSIX_MISTAKE
 371  370          case ')':               /* happens only if no current unmatched ( */
 372  371                  /*

 373  372                   * You may ask, why the ifndef?  Because I didn't notice
 374  373                   * this until slightly too late for 1003.2, and none of the
 375  374                   * other 1003.2 regular-expression reviewers noticed it at
 376  375                   * all.  So an unmatched ) is legal POSIX, at least until
 377  376                   * we can get it fixed.
 378  377                   */
 379  378                  SETERROR(REG_EPAREN);
 380  379                  break;
 381  380  #endif
 382  381          case '^':
 383  382                  EMIT(OBOL, 0);
 384  383                  p->g->iflags |= USEBOL;
 385  384                  p->g->nbol++;
 386  385                  wascaret = 1;
 387  386                  break;
 388  387          case '$':
 389  388                  EMIT(OEOL, 0);
 390  389                  p->g->iflags |= USEEOL;
 391  390                  p->g->neol++;
 392  391                  break;
 393  392          case '|':
 394  393                  SETERROR(REG_BADPAT);
 395  394                  break;
 396  395          case '*':
 397  396          case '+':
 398  397          case '?':
 399  398          case '{':
 400  399                  SETERROR(REG_BADRPT);
 401  400                  break;
 402  401          case '.':
 403  402                  if (p->g->cflags&REG_NEWLINE)
 404  403                          nonnewline(p);
 405  404                  else
 406  405                          EMIT(OANY, 0);
 407  406                  break;
 408  407          case '[':
 409  408                  p_bracket(p);
 410  409                  break;
 411  410          case '\\':
 412  411                  (void) REQUIRE(MORE(), REG_EESCAPE);
 413  412                  wc = WGETNEXT();
 414  413                  switch (wc) {
 415  414                  case '<':
 416  415                          EMIT(OBOW, 0);
 417  416                          break;

↓ open down ↓

45 lines elided

↑ open up ↑

 418  417                  case '>':
 419  418                          EMIT(OEOW, 0);
 420  419                          break;
 421  420                  default:
 422  421                          ordinary(p, wc);
 423  422                          break;
 424  423                  }
 425  424                  break;
 426  425          default:
 427  426                  if (p->error != 0)
 428      -                        return;
      427 +                        return (false);
 429  428                  p->next--;
 430  429                  wc = WGETNEXT();
 431  430                  ordinary(p, wc);
 432  431                  break;
 433  432          }
 434  433  
 435  434          if (!MORE())
 436      -                return;
      435 +                return (false);
 437  436          c = PEEK();
 438  437          /* we call { a repetition if followed by a digit */
 439  438          if (!(c == '*' || c == '+' || c == '?' || c == '{'))
 440      -                return;         /* no repetition, we're done */
      439 +                return (false);         /* no repetition, we're done */
 441  440          else if (c == '{')
 442  441                  (void) REQUIRE(MORE2() && \
 443  442                      (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
 444  443          NEXT();
 445  444  
 446  445          (void) REQUIRE(!wascaret, REG_BADRPT);
 447  446          switch (c) {
 448  447          case '*':       /* implemented as +? */
 449  448                  /* this case does not require the (y|) trick, noKLUDGE */
 450  449                  INSERT(OPLUS_, pos);

 451  450                  ASTERN(O_PLUS, pos);
 452  451                  INSERT(OQUEST_, pos);
 453  452                  ASTERN(O_QUEST, pos);
 454  453                  break;
 455  454          case '+':
 456  455                  INSERT(OPLUS_, pos);
 457  456                  ASTERN(O_PLUS, pos);
 458  457                  break;
 459  458          case '?':
 460  459                  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 461  460                  INSERT(OCH_, pos);              /* offset slightly wrong */
 462  461                  ASTERN(OOR1, pos);              /* this one's right */
 463  462                  AHEAD(pos);                     /* fix the OCH_ */
 464  463                  EMIT(OOR2, 0);                  /* offset very wrong... */
 465  464                  AHEAD(THERE());                 /* ...so fix it */
 466  465                  ASTERN(O_CH, THERETHERE());
 467  466                  break;
 468  467          case '{':
 469  468                  count = p_count(p);
 470  469                  if (EAT(',')) {
 471  470                          if (isdigit((uch)PEEK())) {
 472  471                                  count2 = p_count(p);
 473  472                                  (void) REQUIRE(count <= count2, REG_BADBR);
 474  473                          } else          /* single number with comma */
 475  474                                  count2 = INFINITY;
 476  475                  } else          /* just a single number */
 477  476                          count2 = count;
 478  477                  repeat(p, pos, count, count2);

↓ open down ↓

28 lines elided

↑ open up ↑

 479  478                  if (!EAT('}')) {        /* error heuristics */
 480  479                          while (MORE() && PEEK() != '}')
 481  480                                  NEXT();
 482  481                          (void) REQUIRE(MORE(), REG_EBRACE);
 483  482                          SETERROR(REG_BADBR);
 484  483                  }
 485  484                  break;
 486  485          }
 487  486  
 488  487          if (!MORE())
 489      -                return;
      488 +                return (false);
 490  489          c = PEEK();
 491  490          if (!(c == '*' || c == '+' || c == '?' ||
 492  491              (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
 493      -                return;
      492 +                return (false);
 494  493          SETERROR(REG_BADRPT);
      494 +        return (false);
 495  495  }
 496  496  
 497  497  /*
 498  498   * p_str - string (no metacharacters) "parser"
 499  499   */
 500  500  static void
 501  501  p_str(struct parse *p)
 502  502  {
 503  503          (void) REQUIRE(MORE(), REG_BADPAT);
 504  504          while (MORE())
 505  505                  ordinary(p, WGETNEXT());
 506  506  }
 507  507  
 508  508  /*
 509      - * p_bre - BRE parser top level, anchoring and concatenation
 510      - * Giving end1 as OUT essentially eliminates the end1/end2 check.
 511      - *
 512      - * This implementation is a bit of a kludge, in that a trailing $ is first
 513      - * taken as an ordinary character and then revised to be an anchor.
 514      - * The amount of lookahead needed to avoid this kludge is excessive.
      509 + * Eat consecutive branch delimiters for the kind of expression that we are
      510 + * parsing, return the number of delimiters that we ate.
 515  511   */
      512 +static int
      513 +p_branch_eat_delim(struct parse *p, struct branchc *bc)
      514 +{
      515 +        int nskip;
      516 +
      517 +        (void) bc;
      518 +        nskip = 0;
      519 +        while (EAT('|'))
      520 +                ++nskip;
      521 +        return (nskip);
      522 +}
      523 +
      524 +/*
      525 + * Insert necessary branch book-keeping operations. This emits a
      526 + * bogus 'next' offset, since we still have more to parse
      527 + */
 516  528  static void
 517      -p_bre(struct parse *p,
 518      -    int end1,           /* first terminating character */
 519      -    int end2)           /* second terminating character */
      529 +p_branch_ins_offset(struct parse *p, struct branchc *bc)
 520  530  {
 521      -        sopno start = HERE();
 522      -        int first = 1;                  /* first subexpression? */
 523      -        int wasdollar = 0;
      531 +        if (bc->nbranch == 0) {
      532 +                INSERT(OCH_, bc->start);        /* offset is wrong */
      533 +                bc->fwd = bc->start;
      534 +                bc->back = bc->start;
      535 +        }
 524  536  
      537 +        ASTERN(OOR1, bc->back);
      538 +        bc->back = THERE();
      539 +        AHEAD(bc->fwd);                 /* fix previous offset */
      540 +        bc->fwd = HERE();
      541 +        EMIT(OOR2, 0);                  /* offset is very wrong */
      542 +        ++bc->nbranch;
      543 +}
      544 +
      545 +/*
      546 + * Fix the offset of the tail branch, if we actually had any branches.
      547 + * This is to correct the bogus placeholder offset that we use.
      548 + */
      549 +static void
      550 +p_branch_fix_tail(struct parse *p, struct branchc *bc)
      551 +{
      552 +        /* Fix bogus offset at the tail if we actually have branches */
      553 +        if (bc->nbranch > 0) {
      554 +                AHEAD(bc->fwd);
      555 +                ASTERN(O_CH, bc->back);
      556 +        }
      557 +}
      558 +
      559 +/*
      560 + * Signal to the parser that an empty branch has been encountered; this will,
      561 + * in the future, be used to allow for more permissive behavior with empty
      562 + * branches. The return value should indicate whether parsing may continue
      563 + * or not.
      564 + */
      565 +static bool
      566 +p_branch_empty(struct parse *p, struct branchc *bc)
      567 +{
      568 +        (void) bc;
      569 +        SETERROR(REG_BADPAT);
      570 +        return (false);
      571 +}
      572 +
      573 +/*
      574 + * Take care of any branching requirements. This includes inserting the
      575 + * appropriate branching instructions as well as eating all of the branch
      576 + * delimiters until we either run out of pattern or need to parse more pattern.
      577 + */
      578 +static bool
      579 +p_branch_do(struct parse *p, struct branchc *bc)
      580 +{
      581 +        int ate = 0;
      582 +
      583 +        ate = p_branch_eat_delim(p, bc);
      584 +        if (ate == 0)
      585 +                return (false);
      586 +        else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
      587 +                /*
      588 +                 * Halt parsing only if we have an empty branch and
      589 +                 * p_branch_empty indicates that we must not continue.
      590 +                 * In the future, this will not  necessarily be an error.
      591 +                 */
      592 +                return (false);
      593 +        p_branch_ins_offset(p, bc);
      594 +
      595 +        return (true);
      596 +}
      597 +
      598 +static void
      599 +p_bre_pre_parse(struct parse *p, struct branchc *bc)
      600 +{
      601 +        (void) bc;
      602 +        /*
      603 +         * Does not move cleanly into expression parser because of
      604 +         * ordinary interpration of * at the beginning position of
      605 +         * an expression.
      606 +         */
 525  607          if (EAT('^')) {
 526  608                  EMIT(OBOL, 0);
 527  609                  p->g->iflags |= USEBOL;
 528  610                  p->g->nbol++;
 529  611          }
 530      -        while (MORE() && !SEETWO(end1, end2)) {
 531      -                wasdollar = p_simp_re(p, first);
 532      -                first = 0;
 533      -        }
 534      -        if (wasdollar) {        /* oops, that was a trailing anchor */
      612 +}
      613 +
      614 +static void
      615 +p_bre_post_parse(struct parse *p, struct branchc *bc)
      616 +{
      617 +        /* Expression is terminating due to EOL token */
      618 +        if (bc->terminate) {
 535  619                  DROP(1);
 536  620                  EMIT(OEOL, 0);
 537  621                  p->g->iflags |= USEEOL;
 538  622                  p->g->neol++;
 539  623          }
      624 +}
 540  625  
 541      -        (void) REQUIRE(HERE() != start, REG_BADPAT);    /* require nonempty */
      626 +/*
      627 + * Top level parser, concatenation and BRE anchoring.
      628 + * Giving end1 as OUT essentially eliminates the end1/end2 check.
      629 + *
      630 + * This implementation is a bit of a kludge, in that a trailing $ is first
      631 + * taken as an ordinary character and then revised to be an anchor.
      632 + * The amount of lookahead needed to avoid this kludge is excessive.
      633 + */
      634 +static void
      635 +p_re(struct parse *p,
      636 +    int end1,   /* first terminating character */
      637 +    int end2)   /* second terminating character; ignored for EREs */
      638 +{
      639 +        struct branchc bc;
      640 +
      641 +        bc.nbranch = 0;
      642 +        if (end1 == OUT && end2 == OUT)
      643 +                bc.outer = true;
      644 +        else
      645 +                bc.outer = false;
      646 +#define SEEEND()        (!p->bre ? SEE(end1) : SEETWO(end1, end2))
      647 +        for (;;) {
      648 +                bc.start = HERE();
      649 +                bc.nchain = 0;
      650 +                bc.terminate = false;
      651 +                if (p->pre_parse != NULL)
      652 +                        p->pre_parse(p, &bc);
      653 +                while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
      654 +                    !SEEEND()) {
      655 +                        bc.terminate = p->parse_expr(p, &bc);
      656 +                        ++bc.nchain;
      657 +                }
      658 +                if (p->post_parse != NULL)
      659 +                        p->post_parse(p, &bc);
      660 +                (void) REQUIRE(HERE() != bc.start, REG_BADPAT);
      661 +                if (!p->allowbranch)
      662 +                        break;
      663 +                /*
      664 +                 * p_branch_do's return value indicates whether we should
      665 +                 * continue parsing or not. This is both for correctness and
      666 +                 * a slight optimization, because it will check if we've
      667 +                 * encountered an empty branch or the end of the string
      668 +                 * immediately following a branch delimiter.
      669 +                 */
      670 +                if (!p_branch_do(p, &bc))
      671 +                        break;
      672 +        }
      673 +#undef SEE_END
      674 +        if (p->allowbranch)
      675 +                p_branch_fix_tail(p, &bc);
      676 +        assert(!MORE() || SEE(end1));
 542  677  }
 543  678  
 544  679  /*
 545  680   * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 546  681   */
 547      -static int                      /* was the simple RE an unbackslashed $? */
 548      -p_simp_re(struct parse *p,
 549      -    int starordinary)   /* is a leading * an ordinary character? */
      682 +static bool                     /* was the simple RE an unbackslashed $? */
      683 +p_simp_re(struct parse *p, struct branchc *bc)
 550  684  {
 551  685          int c;
 552  686          int count;
 553  687          int count2;
 554  688          sopno pos;
 555  689          int i;
 556  690          wint_t wc;
 557  691          sopno subno;
 558  692  #define BACKSL  (1<<CHAR_BIT)
 559  693

 560  694          pos = HERE();           /* repetition op, if any, covers from here */
 561  695  
 562  696          assert(MORE());         /* caller should have ensured this */
 563  697          c = GETNEXT();
 564  698          if (c == '\\') {
 565  699                  (void) REQUIRE(MORE(), REG_EESCAPE);
 566  700                  c = BACKSL | GETNEXT();
 567  701          }
 568  702          switch (c) {
 569  703          case '.':
 570  704                  if (p->g->cflags&REG_NEWLINE)
 571  705                          nonnewline(p);
 572  706                  else
 573  707                          EMIT(OANY, 0);
 574  708                  break;
 575  709          case '[':
 576  710                  p_bracket(p);
 577  711                  break;
 578  712          case BACKSL|'<':
 579  713                  EMIT(OBOW, 0);
 580  714                  break;
 581  715          case BACKSL|'>':
 582  716                  EMIT(OEOW, 0);
 583  717                  break;
 584  718          case BACKSL|'{':

↓ open down ↓

25 lines elided

↑ open up ↑

 585  719                  SETERROR(REG_BADRPT);
 586  720                  break;
 587  721          case BACKSL|'(':
 588  722                  p->g->nsub++;
 589  723                  subno = p->g->nsub;
 590  724                  if (subno < NPAREN)
 591  725                          p->pbegin[subno] = HERE();
 592  726                  EMIT(OLPAREN, subno);
 593  727                  /* the MORE here is an error heuristic */
 594  728                  if (MORE() && !SEETWO('\\', ')'))
 595      -                        p_bre(p, '\\', ')');
      729 +                        p_re(p, '\\', ')');
 596  730                  if (subno < NPAREN) {
 597  731                          p->pend[subno] = HERE();
 598  732                          assert(p->pend[subno] != 0);
 599  733                  }
 600  734                  EMIT(ORPAREN, subno);
 601  735                  (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 602  736                  break;
 603  737          case BACKSL|')':        /* should not get here -- must be user */
 604  738                  SETERROR(REG_EPAREN);
 605  739                  break;

 606  740          case BACKSL|'1':
 607  741          case BACKSL|'2':
 608  742          case BACKSL|'3':
 609  743          case BACKSL|'4':
 610  744          case BACKSL|'5':
 611  745          case BACKSL|'6':
 612  746          case BACKSL|'7':
 613  747          case BACKSL|'8':
 614  748          case BACKSL|'9':
 615  749                  i = (c&~BACKSL) - '0';
 616  750                  assert(i < NPAREN);
 617  751                  if (p->pend[i] != 0) {
 618  752                          assert(i <= p->g->nsub);
 619  753                          EMIT(OBACK_, i);

↓ open down ↓

14 lines elided

↑ open up ↑

 620  754                          assert(p->pbegin[i] != 0);
 621  755                          assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 622  756                          assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 623  757                          (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 624  758                          EMIT(O_BACK, i);
 625  759                  } else
 626  760                          SETERROR(REG_ESUBREG);
 627  761                  p->g->backrefs = 1;
 628  762                  break;
 629  763          case '*':
 630      -                (void) REQUIRE(starordinary, REG_BADRPT);
      764 +                /*
      765 +                 * Ordinary if used as the first character beyond BOL anchor of
      766 +                 * a (sub-)expression, counts as a bad repetition operator if it
      767 +                 * appears otherwise.
      768 +                 */
      769 +                (void) REQUIRE(bc->nchain == 0, REG_BADRPT);
 631  770                  /* FALLTHROUGH */
 632  771          default:
 633  772                  if (p->error != 0)
 634      -                        return (0);     /* Definitely not $... */
      773 +                        return (false); /* Definitely not $... */
 635  774                  p->next--;
 636  775                  wc = WGETNEXT();
 637  776                  ordinary(p, wc);
 638  777                  break;
 639  778          }
 640  779  
 641  780          if (EAT('*')) {         /* implemented as +? */
 642  781                  /* this case does not require the (y|) trick, noKLUDGE */
 643  782                  INSERT(OPLUS_, pos);
 644  783                  ASTERN(O_PLUS, pos);

 645  784                  INSERT(OQUEST_, pos);
 646  785                  ASTERN(O_QUEST, pos);
 647  786          } else if (EATTWO('\\', '{')) {
 648  787                  count = p_count(p);
 649  788                  if (EAT(',')) {
 650  789                          if (MORE() && isdigit((uch)PEEK())) {
 651  790                                  count2 = p_count(p);
 652  791                                  (void) REQUIRE(count <= count2, REG_BADBR);
 653  792                          } else          /* single number with comma */
 654  793                                  count2 = INFINITY;

↓ open down ↓

10 lines elided

↑ open up ↑

 655  794                  } else          /* just a single number */
 656  795                          count2 = count;
 657  796                  repeat(p, pos, count, count2);
 658  797                  if (!EATTWO('\\', '}')) {       /* error heuristics */
 659  798                          while (MORE() && !SEETWO('\\', '}'))
 660  799                                  NEXT();
 661  800                          (void) REQUIRE(MORE(), REG_EBRACE);
 662  801                          SETERROR(REG_BADBR);
 663  802                  }
 664  803          } else if (c == '$')    /* $ (but not \$) ends it */
 665      -                return (1);
      804 +                return (true);
 666  805  
 667      -        return (0);
      806 +        return (false);
 668  807  }
 669  808  
 670  809  /*
 671  810   * p_count - parse a repetition count
 672  811   */
 673  812  static int                      /* the value */
 674  813  p_count(struct parse *p)
 675  814  {
 676  815          int count = 0;
 677  816          int ndigits = 0;

 678  817  
 679  818          while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 680  819                  count = count*10 + (GETNEXT() - '0');
 681  820                  ndigits++;
 682  821          }
 683  822  
 684  823          (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 685  824          return (count);
 686  825  }
 687  826  
 688  827  /*
 689  828   * p_bracket - parse a bracketed character list
 690  829   */
 691  830  static void
 692  831  p_bracket(struct parse *p)
 693  832  {
 694  833          cset *cs;
 695  834          wint_t ch;
 696  835  
 697  836          /* Dept of Truly Sickening Special-Case Kludges */
 698  837          if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
 699  838                  EMIT(OBOW, 0);
 700  839                  NEXTn(6);
 701  840                  return;
 702  841          }
 703  842          if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
 704  843                  EMIT(OEOW, 0);
 705  844                  NEXTn(6);
 706  845                  return;
 707  846          }
 708  847  
 709  848          if ((cs = allocset(p)) == NULL)
 710  849                  return;
 711  850  
 712  851          if (p->g->cflags&REG_ICASE)
 713  852                  cs->icase = 1;
 714  853          if (EAT('^'))
 715  854                  cs->invert = 1;
 716  855          if (EAT(']'))
 717  856                  CHadd(p, cs, ']');
 718  857          else if (EAT('-'))
 719  858                  CHadd(p, cs, '-');
 720  859          while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 721  860                  p_b_term(p, cs);
 722  861          if (EAT('-'))
 723  862                  CHadd(p, cs, '-');
 724  863          (void) MUSTEAT(']', REG_EBRACK);
 725  864  
 726  865          if (p->error != 0)      /* don't mess things up further */
 727  866                  return;
 728  867  
 729  868          if (cs->invert && p->g->cflags&REG_NEWLINE)
 730  869                  cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
 731  870  
 732  871          if ((ch = singleton(cs)) != OUT) {      /* optimize singleton sets */
 733  872                  ordinary(p, ch);
 734  873                  freeset(p, cs);
 735  874          } else
 736  875                  EMIT(OANYOF, (int)(cs - p->g->sets));
 737  876  }
 738  877  
 739  878  /*
 740  879   * p_b_term - parse one term of a bracketed character list
 741  880   */
 742  881  static void
 743  882  p_b_term(struct parse *p, cset *cs)
 744  883  {
 745  884          char c;
 746  885          wint_t start, finish;
 747  886          wint_t i;
 748  887          locale_t loc = uselocale(NULL);
 749  888  
 750  889          /* classify what we've got */
 751  890          switch ((MORE()) ? PEEK() : '\0') {
 752  891          case '[':
 753  892                  c = (MORE2()) ? PEEK2() : '\0';
 754  893                  break;
 755  894          case '-':
 756  895                  SETERROR(REG_ERANGE);
 757  896                  return;                 /* NOTE RETURN */
 758  897          default:
 759  898                  c = '\0';
 760  899                  break;
 761  900          }
 762  901  
 763  902          switch (c) {
 764  903          case ':':               /* character class */
 765  904                  NEXT2();
 766  905                  (void) REQUIRE(MORE(), REG_EBRACK);
 767  906                  c = PEEK();
 768  907                  (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 769  908                  p_b_cclass(p, cs);
 770  909                  (void) REQUIRE(MORE(), REG_EBRACK);
 771  910                  (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 772  911                  break;
 773  912          case '=':               /* equivalence class */
 774  913                  NEXT2();
 775  914                  (void) REQUIRE(MORE(), REG_EBRACK);
 776  915                  c = PEEK();
 777  916                  (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 778  917                  p_b_eclass(p, cs);
 779  918                  (void) REQUIRE(MORE(), REG_EBRACK);
 780  919                  (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 781  920                  break;
 782  921          default:                /* symbol, ordinary character, or range */
 783  922                  start = p_b_symbol(p);
 784  923                  if (SEE('-') && MORE2() && PEEK2() != ']') {
 785  924                          /* range */
 786  925                          NEXT();
 787  926                          if (EAT('-'))
 788  927                                  finish = '-';
 789  928                          else
 790  929                                  finish = p_b_symbol(p);
 791  930                  } else
 792  931                          finish = start;
 793  932                  if (start == finish)
 794  933                          CHadd(p, cs, start);
 795  934                  else {
 796  935                          if (loc->collate->lc_is_posix) {
 797  936                                  (void) REQUIRE((uch)start <= (uch)finish,
 798  937                                      REG_ERANGE);
 799  938                                  CHaddrange(p, cs, start, finish);
 800  939                          } else {
 801  940                                  (void) REQUIRE(_collate_range_cmp(start,
 802  941                                      finish, loc) <= 0, REG_ERANGE);
 803  942                                  for (i = 0; i <= UCHAR_MAX; i++) {
 804  943                                          if (_collate_range_cmp(start, i, loc)
 805  944                                              <= 0 &&
 806  945                                              _collate_range_cmp(i, finish, loc)
 807  946                                              <= 0)
 808  947                                                  CHadd(p, cs, i);
 809  948                                  }
 810  949                          }
 811  950                  }
 812  951                  break;
 813  952          }
 814  953  }
 815  954  
 816  955  /*
 817  956   * p_b_cclass - parse a character-class name and deal with it
 818  957   */
 819  958  static void
 820  959  p_b_cclass(struct parse *p, cset *cs)
 821  960  {
 822  961          const char *sp = p->next;
 823  962          size_t len;
 824  963          wctype_t wct;
 825  964          char clname[16];
 826  965  
 827  966          while (MORE() && isalpha((uch)PEEK()))
 828  967                  NEXT();
 829  968          len = p->next - sp;
 830  969          if (len >= sizeof (clname) - 1) {
 831  970                  SETERROR(REG_ECTYPE);
 832  971                  return;
 833  972          }
 834  973          (void) memcpy(clname, sp, len);
 835  974          clname[len] = '\0';
 836  975          if ((wct = wctype(clname)) == 0) {
 837  976                  SETERROR(REG_ECTYPE);
 838  977                  return;
 839  978          }
 840  979          CHaddtype(p, cs, wct);
 841  980  }
 842  981  
 843  982  /*
 844  983   * p_b_eclass - parse an equivalence-class name and deal with it
 845  984   *
 846  985   * This implementation is incomplete. xxx
 847  986   */
 848  987  static void
 849  988  p_b_eclass(struct parse *p, cset *cs)
 850  989  {
 851  990          wint_t c;
 852  991  
 853  992          c = p_b_coll_elem(p, '=');
 854  993          CHadd(p, cs, c);
 855  994  }
 856  995  
 857  996  /*
 858  997   * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 859  998   */
 860  999  static wint_t                   /* value of symbol */
 861 1000  p_b_symbol(struct parse *p)
 862 1001  {
 863 1002          wint_t value;
 864 1003  
 865 1004          (void) REQUIRE(MORE(), REG_EBRACK);
 866 1005          if (!EATTWO('[', '.'))
 867 1006                  return (WGETNEXT());
 868 1007  
 869 1008          /* collating symbol */
 870 1009          value = p_b_coll_elem(p, '.');
 871 1010          (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 872 1011          return (value);
 873 1012  }
 874 1013  
 875 1014  /*
 876 1015   * p_b_coll_elem - parse a collating-element name and look it up
 877 1016   */
 878 1017  static wint_t                   /* value of collating element */
 879 1018  p_b_coll_elem(struct parse *p,
 880 1019      wint_t endc)                /* name ended by endc,']' */
 881 1020  {
 882 1021          const char *sp = p->next;
 883 1022          struct cname *cp;
 884 1023          mbstate_t mbs;
 885 1024          wchar_t wc;

↓ open down ↓

208 lines elided

↑ open up ↑

 886 1025          size_t clen, len;
 887 1026  
 888 1027          while (MORE() && !SEETWO(endc, ']'))
 889 1028                  NEXT();
 890 1029          if (!MORE()) {
 891 1030                  SETERROR(REG_EBRACK);
 892 1031                  return (0);
 893 1032          }
 894 1033          len = p->next - sp;
 895 1034          for (cp = cnames; cp->name != NULL; cp++)
 896      -                if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
     1035 +                if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
 897 1036                          return (cp->code);      /* known name */
 898 1037          (void) memset(&mbs, 0, sizeof (mbs));
 899 1038          if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
 900 1039                  return (wc);                    /* single character */
 901 1040          else if (clen == (size_t)-1 || clen == (size_t)-2)
 902 1041                  SETERROR(REG_ECHAR);
 903 1042          else
 904 1043                  SETERROR(REG_ECOLLATE);         /* neither */
 905 1044          return (0);
 906 1045  }

 907 1046  
 908 1047  /*
 909 1048   * othercase - return the case counterpart of an alphabetic
 910 1049   */
 911 1050  static wint_t                   /* if no counterpart, return ch */
 912 1051  othercase(wint_t ch)
 913 1052  {
 914 1053          assert(iswalpha(ch));
 915 1054          if (iswupper(ch))
 916 1055                  return (towlower(ch));
 917 1056          else if (iswlower(ch))
 918 1057                  return (towupper(ch));
 919 1058          else                    /* peculiar, but could happen */
 920 1059                  return (ch);
 921 1060  }
 922 1061  
 923 1062  /*
 924 1063   * bothcases - emit a dualcase version of a two-case character
 925 1064   *
 926 1065   * Boy, is this implementation ever a kludge...
 927 1066   */
 928 1067  static void
 929 1068  bothcases(struct parse *p, wint_t ch)
 930 1069  {
 931 1070          const char *oldnext = p->next;
 932 1071          const char *oldend = p->end;
 933 1072          char bracket[3 + MB_LEN_MAX];
 934 1073          size_t n;
 935 1074          mbstate_t mbs;
 936 1075  
 937 1076          assert(othercase(ch) != ch);    /* p_bracket() would recurse */
 938 1077          p->next = bracket;
 939 1078          (void) memset(&mbs, 0, sizeof (mbs));
 940 1079          n = wcrtomb(bracket, ch, &mbs);
 941 1080          assert(n != (size_t)-1);
 942 1081          bracket[n] = ']';
 943 1082          bracket[n + 1] = '\0';
 944 1083          p->end = bracket+n+1;
 945 1084          p_bracket(p);
 946 1085          assert(p->next == p->end);
 947 1086          p->next = oldnext;
 948 1087          p->end = oldend;
 949 1088  }
 950 1089  
 951 1090  /*
 952 1091   * ordinary - emit an ordinary character
 953 1092   */
 954 1093  static void
 955 1094  ordinary(struct parse *p, wint_t ch)
 956 1095  {
 957 1096          cset *cs;
 958 1097  
 959 1098          if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
 960 1099                  bothcases(p, ch);
 961 1100          else if ((ch & OPDMASK) == ch)
 962 1101                  EMIT(OCHAR, ch);
 963 1102          else {
 964 1103                  /*
 965 1104                   * Kludge: character is too big to fit into an OCHAR operand.
 966 1105                   * Emit a singleton set.
 967 1106                   */
 968 1107                  if ((cs = allocset(p)) == NULL)
 969 1108                          return;
 970 1109                  CHadd(p, cs, ch);
 971 1110                  EMIT(OANYOF, (int)(cs - p->g->sets));
 972 1111          }
 973 1112  }
 974 1113  
 975 1114  /*
 976 1115   * nonnewline - emit REG_NEWLINE version of OANY
 977 1116   *
 978 1117   * Boy, is this implementation ever a kludge...
 979 1118   */
 980 1119  static void
 981 1120  nonnewline(struct parse *p)
 982 1121  {
 983 1122          const char *oldnext = p->next;
 984 1123          const char *oldend = p->end;
 985 1124          char bracket[4];
 986 1125  
 987 1126          p->next = bracket;
 988 1127          p->end = bracket+3;
 989 1128          bracket[0] = '^';
 990 1129          bracket[1] = '\n';
 991 1130          bracket[2] = ']';
 992 1131          bracket[3] = '\0';
 993 1132          p_bracket(p);
 994 1133          assert(p->next == bracket+3);
 995 1134          p->next = oldnext;
 996 1135          p->end = oldend;
 997 1136  }
 998 1137  
 999 1138  /*
1000 1139   * repeat - generate code for a bounded repetition, recursively if needed
1001 1140   */
1002 1141  static void
1003 1142  repeat(struct parse *p,
1004 1143      sopno start,                /* operand from here to end of strip */
1005 1144      int from,                   /* repeated from this number */
1006 1145      int to)                     /* to this number of times (maybe INFINITY) */
1007 1146  {
1008 1147          sopno finish = HERE();
1009 1148  #define N       2
1010 1149  #define INF     3
1011 1150  #define REP(f, t)       ((f)*8 + (t))
1012 1151  #define MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1013 1152          sopno copy;
1014 1153  
1015 1154          if (p->error != 0)      /* head off possible runaway recursion */
1016 1155                  return;
1017 1156  
1018 1157          assert(from <= to);
1019 1158  
1020 1159          switch (REP(MAP(from), MAP(to))) {
1021 1160          case REP(0, 0):                 /* must be user doing this */
1022 1161                  DROP(finish-start);     /* drop the operand */
1023 1162                  break;
1024 1163          case REP(0, 1):                 /* as x{1,1}? */
1025 1164          case REP(0, N):                 /* as x{1,n}? */
1026 1165          case REP(0, INF):               /* as x{1,}? */
1027 1166                  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1028 1167                  INSERT(OCH_, start);            /* offset is wrong... */
1029 1168                  repeat(p, start+1, 1, to);
1030 1169                  ASTERN(OOR1, start);
1031 1170                  AHEAD(start);                   /* ... fix it */
1032 1171                  EMIT(OOR2, 0);
1033 1172                  AHEAD(THERE());
1034 1173                  ASTERN(O_CH, THERETHERE());
1035 1174                  break;
1036 1175          case REP(1, 1):                 /* trivial case */
1037 1176                  /* done */
1038 1177                  break;
1039 1178          case REP(1, N):                 /* as x?x{1,n-1} */
1040 1179                  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1041 1180                  INSERT(OCH_, start);
1042 1181                  ASTERN(OOR1, start);
1043 1182                  AHEAD(start);
1044 1183                  EMIT(OOR2, 0);                  /* offset very wrong... */
1045 1184                  AHEAD(THERE());                 /* ...so fix it */
1046 1185                  ASTERN(O_CH, THERETHERE());
1047 1186                  copy = dupl(p, start+1, finish+1);
1048 1187                  assert(copy == finish+4);
1049 1188                  repeat(p, copy, 1, to-1);
1050 1189                  break;
1051 1190          case REP(1, INF):               /* as x+ */
1052 1191                  INSERT(OPLUS_, start);
1053 1192                  ASTERN(O_PLUS, start);
1054 1193                  break;
1055 1194          case REP(N, N):                 /* as xx{m-1,n-1} */
1056 1195                  copy = dupl(p, start, finish);
1057 1196                  repeat(p, copy, from-1, to-1);
1058 1197                  break;
1059 1198          case REP(N, INF):               /* as xx{n-1,INF} */
1060 1199                  copy = dupl(p, start, finish);
1061 1200                  repeat(p, copy, from-1, to);
1062 1201                  break;
1063 1202          default:                        /* "can't happen" */
1064 1203                  SETERROR(REG_EFATAL);   /* just in case */
1065 1204                  break;
1066 1205          }
1067 1206  }
1068 1207  
1069 1208  /*
1070 1209   * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1071 1210   * character from the parse struct, signals a REG_ILLSEQ error if the
1072 1211   * character can't be converted. Returns the number of bytes consumed.
1073 1212   */
1074 1213  static wint_t
1075 1214  wgetnext(struct parse *p)
1076 1215  {
1077 1216          mbstate_t mbs;
1078 1217          wchar_t wc;
1079 1218          size_t n;
1080 1219  
1081 1220          (void) memset(&mbs, 0, sizeof (mbs));
1082 1221          n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1083 1222          if (n == (size_t)-1 || n == (size_t)-2) {
1084 1223                  SETERROR(REG_ECHAR);
1085 1224                  return (0);
1086 1225          }
1087 1226          if (n == 0)
1088 1227                  n = 1;
1089 1228          p->next += n;
1090 1229          return (wc);
1091 1230  }
1092 1231  
1093 1232  /*
1094 1233   * seterr - set an error condition
1095 1234   */
1096 1235  static int                      /* useless but makes type checking happy */
1097 1236  seterr(struct parse *p, int e)
1098 1237  {
1099 1238          if (p->error == 0)      /* keep earliest error condition */
1100 1239                  p->error = e;
1101 1240          p->next = nuls;         /* try to bring things to a halt */
1102 1241          p->end = nuls;
1103 1242          return (0);             /* make the return value well-defined */
1104 1243  }
1105 1244  
1106 1245  /*
1107 1246   * allocset - allocate a set of characters for []
1108 1247   */
1109 1248  static cset *
1110 1249  allocset(struct parse *p)
1111 1250  {
1112 1251          cset *cs, *ncs;
1113 1252  
1114 1253          ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1115 1254          if (ncs == NULL) {
1116 1255                  SETERROR(REG_ESPACE);
1117 1256                  return (NULL);
1118 1257          }
1119 1258          p->g->sets = ncs;
1120 1259          cs = &p->g->sets[p->g->ncsets++];
1121 1260          (void) memset(cs, 0, sizeof (*cs));
1122 1261  
1123 1262          return (cs);
1124 1263  }
1125 1264  
1126 1265  /*
1127 1266   * freeset - free a now-unused set
1128 1267   */
1129 1268  static void
1130 1269  freeset(struct parse *p, cset *cs)
1131 1270  {
1132 1271          cset *top = &p->g->sets[p->g->ncsets];
1133 1272  
1134 1273          free(cs->wides);
1135 1274          free(cs->ranges);
1136 1275          free(cs->types);
1137 1276          (void) memset(cs, 0, sizeof (*cs));
1138 1277          if (cs == top-1)        /* recover only the easy case */
1139 1278                  p->g->ncsets--;
1140 1279  }
1141 1280  
1142 1281  /*
1143 1282   * singleton - Determine whether a set contains only one character,
1144 1283   * returning it if so, otherwise returning OUT.
1145 1284   */
1146 1285  static wint_t
1147 1286  singleton(cset *cs)
1148 1287  {
1149 1288          wint_t i, s, n;
1150 1289  
1151 1290          for (i = n = 0; i < NC; i++)
1152 1291                  if (CHIN(cs, i)) {
1153 1292                          n++;
1154 1293                          s = i;
1155 1294                  }
1156 1295          if (n == 1)
1157 1296                  return (s);
1158 1297          if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1159 1298              cs->icase == 0)
1160 1299                  return (cs->wides[0]);
1161 1300          /* Don't bother handling the other cases. */
1162 1301          return (OUT);
1163 1302  }
1164 1303  
1165 1304  /*
1166 1305   * CHadd - add character to character set.
1167 1306   */
1168 1307  static void
1169 1308  CHadd(struct parse *p, cset *cs, wint_t ch)
1170 1309  {
1171 1310          wint_t nch, *newwides;
1172 1311          assert(ch >= 0);
1173 1312          if (ch < NC)
1174 1313                  cs->bmp[ch >> 3] |= 1 << (ch & 7);
1175 1314          else {
1176 1315                  newwides = realloc(cs->wides, (cs->nwides + 1) *
1177 1316                      sizeof (*cs->wides));
1178 1317                  if (newwides == NULL) {
1179 1318                          SETERROR(REG_ESPACE);
1180 1319                          return;
1181 1320                  }
1182 1321                  cs->wides = newwides;
1183 1322                  cs->wides[cs->nwides++] = ch;
1184 1323          }
1185 1324          if (cs->icase) {
1186 1325                  if ((nch = towlower(ch)) < NC)
1187 1326                          cs->bmp[nch >> 3] |= 1 << (nch & 7);
1188 1327                  if ((nch = towupper(ch)) < NC)
1189 1328                          cs->bmp[nch >> 3] |= 1 << (nch & 7);
1190 1329          }
1191 1330  }
1192 1331  
1193 1332  /*
1194 1333   * CHaddrange - add all characters in the range [min,max] to a character set.
1195 1334   */
1196 1335  static void
1197 1336  CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1198 1337  {
1199 1338          crange *newranges;
1200 1339  
1201 1340          for (; min < NC && min <= max; min++)
1202 1341                  CHadd(p, cs, min);
1203 1342          if (min >= max)
1204 1343                  return;
1205 1344          newranges = realloc(cs->ranges, (cs->nranges + 1) *
1206 1345              sizeof (*cs->ranges));
1207 1346          if (newranges == NULL) {
1208 1347                  SETERROR(REG_ESPACE);
1209 1348                  return;
1210 1349          }
1211 1350          cs->ranges = newranges;
1212 1351          cs->ranges[cs->nranges].min = min;
1213 1352          cs->ranges[cs->nranges].max = max;
1214 1353          cs->nranges++;
1215 1354  }
1216 1355  
1217 1356  /*
1218 1357   * CHaddtype - add all characters of a certain type to a character set.
1219 1358   */
1220 1359  static void
1221 1360  CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1222 1361  {
1223 1362          wint_t i;
1224 1363          wctype_t *newtypes;
1225 1364  
1226 1365          for (i = 0; i < NC; i++)
1227 1366                  if (iswctype(i, wct))
1228 1367                          CHadd(p, cs, i);
1229 1368          newtypes = realloc(cs->types, (cs->ntypes + 1) *
1230 1369              sizeof (*cs->types));
1231 1370          if (newtypes == NULL) {
1232 1371                  SETERROR(REG_ESPACE);
1233 1372                  return;
1234 1373          }
1235 1374          cs->types = newtypes;
1236 1375          cs->types[cs->ntypes++] = wct;
1237 1376  }
1238 1377  
1239 1378  /*
1240 1379   * dupl - emit a duplicate of a bunch of sops
1241 1380   */
1242 1381  static sopno                    /* start of duplicate */
1243 1382  dupl(struct parse *p,
1244 1383      sopno start,                /* from here */
1245 1384      sopno finish)               /* to this less one */
1246 1385  {
1247 1386          sopno ret = HERE();
1248 1387          sopno len = finish - start;
1249 1388  
1250 1389          assert(finish >= start);
1251 1390          if (len == 0)
1252 1391                  return (ret);
1253 1392          if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1254 1393                  return (ret);
1255 1394          assert(p->ssize >= p->slen + len);
1256 1395          (void) memcpy((char *)(p->strip + p->slen),
1257 1396              (char *)(p->strip + start), (size_t)len*sizeof (sop));
1258 1397          p->slen += len;
1259 1398          return (ret);
1260 1399  }
1261 1400  
1262 1401  /*
1263 1402   * doemit - emit a strip operator
1264 1403   *
1265 1404   * It might seem better to implement this as a macro with a function as
1266 1405   * hard-case backup, but it's just too big and messy unless there are
1267 1406   * some changes to the data structures.  Maybe later.
1268 1407   */
1269 1408  static void
1270 1409  doemit(struct parse *p, sop op, size_t opnd)
1271 1410  {
1272 1411          /* avoid making error situations worse */
1273 1412          if (p->error != 0)
1274 1413                  return;
1275 1414  
1276 1415          /* deal with oversize operands ("can't happen", more or less) */
1277 1416          assert(opnd < 1<<OPSHIFT);
1278 1417  
1279 1418          /* deal with undersized strip */
1280 1419          if (p->slen >= p->ssize)
1281 1420                  if (!enlarge(p, (p->ssize+1) / 2 * 3))  /* +50% */
1282 1421                          return;
1283 1422  
1284 1423          /* finally, it's all reduced to the easy case */
1285 1424          p->strip[p->slen++] = SOP(op, opnd);
1286 1425  }
1287 1426  
1288 1427  /*
1289 1428   * doinsert - insert a sop into the strip
1290 1429   */
1291 1430  static void
1292 1431  doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1293 1432  {
1294 1433          sopno sn;
1295 1434          sop s;
1296 1435          int i;
1297 1436  
1298 1437          /* avoid making error situations worse */
1299 1438          if (p->error != 0)
1300 1439                  return;
1301 1440  
1302 1441          sn = HERE();
1303 1442          EMIT(op, opnd);         /* do checks, ensure space */
1304 1443          assert(HERE() == sn+1);
1305 1444          s = p->strip[sn];
1306 1445  
1307 1446          /* adjust paren pointers */
1308 1447          assert(pos > 0);
1309 1448          for (i = 1; i < NPAREN; i++) {
1310 1449                  if (p->pbegin[i] >= pos) {
1311 1450                          p->pbegin[i]++;
1312 1451                  }
1313 1452                  if (p->pend[i] >= pos) {
1314 1453                          p->pend[i]++;
1315 1454                  }
1316 1455          }
1317 1456  
1318 1457          (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1319 1458              (HERE()-pos-1)*sizeof (sop));
1320 1459          p->strip[pos] = s;
1321 1460  }
1322 1461  
1323 1462  /*
1324 1463   * dofwd - complete a forward reference
1325 1464   */
1326 1465  static void
1327 1466  dofwd(struct parse *p, sopno pos, sop value)
1328 1467  {
1329 1468          /* avoid making error situations worse */
1330 1469          if (p->error != 0)
1331 1470                  return;
1332 1471  
1333 1472          assert(value < 1<<OPSHIFT);
1334 1473          p->strip[pos] = OP(p->strip[pos]) | value;
1335 1474  }
1336 1475  
1337 1476  /*
1338 1477   * enlarge - enlarge the strip
1339 1478   */
1340 1479  static int
1341 1480  enlarge(struct parse *p, sopno size)
1342 1481  {
1343 1482          sop *sp;
1344 1483  
1345 1484          if (p->ssize >= size)
1346 1485                  return (1);
1347 1486  
1348 1487          sp = (sop *)realloc(p->strip, size*sizeof (sop));
1349 1488          if (sp == NULL) {
1350 1489                  SETERROR(REG_ESPACE);
1351 1490                  return (0);
1352 1491          }
1353 1492          p->strip = sp;
1354 1493          p->ssize = size;
1355 1494          return (1);
1356 1495  }
1357 1496  
1358 1497  /*
1359 1498   * stripsnug - compact the strip
1360 1499   */
1361 1500  static void
1362 1501  stripsnug(struct parse *p, struct re_guts *g)
1363 1502  {
1364 1503          g->nstates = p->slen;
1365 1504          g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1366 1505          if (g->strip == NULL) {
1367 1506                  SETERROR(REG_ESPACE);
1368 1507                  g->strip = p->strip;
1369 1508          }
1370 1509  }
1371 1510  
1372 1511  /*
1373 1512   * findmust - fill in must and mlen with longest mandatory literal string
1374 1513   *
1375 1514   * This algorithm could do fancy things like analyzing the operands of |
1376 1515   * for common subsequences.  Someday.  This code is simple and finds most
1377 1516   * of the interesting cases.
1378 1517   *
1379 1518   * Note that must and mlen got initialized during setup.
1380 1519   */
1381 1520  static void
1382 1521  findmust(struct parse *p, struct re_guts *g)
1383 1522  {
1384 1523          sop *scan;
1385 1524          sop *start = NULL;
1386 1525          sop *newstart = NULL;
1387 1526          sopno newlen;
1388 1527          sop s;
1389 1528          char *cp;
1390 1529          int offset;
1391 1530          char buf[MB_LEN_MAX];
1392 1531          size_t clen;
1393 1532          mbstate_t mbs;
1394 1533          locale_t loc = uselocale(NULL);
1395 1534  
1396 1535          /* avoid making error situations worse */
1397 1536          if (p->error != 0)
1398 1537                  return;
1399 1538  
1400 1539          /*
1401 1540           * It's not generally safe to do a ``char'' substring search on
1402 1541           * multibyte character strings, but it's safe for at least
1403 1542           * UTF-8 (see RFC 3629).
1404 1543           */
1405 1544          if (MB_CUR_MAX > 1 &&
1406 1545              strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1407 1546                  return;
1408 1547  
1409 1548          /* find the longest OCHAR sequence in strip */
1410 1549          newlen = 0;
1411 1550          offset = 0;
1412 1551          g->moffset = 0;
1413 1552          scan = g->strip + 1;
1414 1553          do {
1415 1554                  s = *scan++;
1416 1555                  switch (OP(s)) {
1417 1556                  case OCHAR:             /* sequence member */
1418 1557                          if (newlen == 0) {              /* new sequence */
1419 1558                                  (void) memset(&mbs, 0, sizeof (mbs));
1420 1559                                  newstart = scan - 1;
1421 1560                          }
1422 1561                          clen = wcrtomb(buf, OPND(s), &mbs);
1423 1562                          if (clen == (size_t)-1)
1424 1563                                  goto toohard;
1425 1564                          newlen += clen;
1426 1565                          break;
1427 1566                  case OPLUS_:            /* things that don't break one */
1428 1567                  case OLPAREN:

↓ open down ↓

522 lines elided

↑ open up ↑

1429 1568                  case ORPAREN:
1430 1569                          break;
1431 1570                  case OQUEST_:           /* things that must be skipped */
1432 1571                  case OCH_:
1433 1572                          offset = altoffset(scan, offset);
1434 1573                          scan--;
1435 1574                          do {
1436 1575                                  scan += OPND(s);
1437 1576                                  s = *scan;
1438 1577                                  /* assert() interferes w debug printouts */
1439      -                                if (OP(s) != O_QUEST && OP(s) != O_CH &&
1440      -                                    OP(s) != OOR2) {
     1578 +                                if (OP(s) != (sop)O_QUEST &&
     1579 +                                    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1441 1580                                          g->iflags |= BAD;
1442 1581                                          return;
1443 1582                                  }
1444      -                        } while (OP(s) != O_QUEST && OP(s) != O_CH);
     1583 +                        } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1445 1584                          /* FALLTHROUGH */
1446 1585                  case OBOW:              /* things that break a sequence */
1447 1586                  case OEOW:
1448 1587                  case OBOL:
1449 1588                  case OEOL:
1450 1589                  case O_QUEST:
1451 1590                  case O_CH:
1452 1591                  case OEND:
1453      -                        if (newlen > g->mlen) {         /* ends one */
     1592 +                        if (newlen > (sopno)g->mlen) {          /* ends one */
1454 1593                                  start = newstart;
1455 1594                                  g->mlen = newlen;
1456 1595                                  if (offset > -1) {
1457 1596                                          g->moffset += offset;
1458 1597                                          offset = newlen;
1459 1598                                  } else
1460 1599                                          g->moffset = offset;
1461 1600                          } else {
1462 1601                                  if (offset > -1)
1463 1602                                          offset += newlen;
1464 1603                          }
1465 1604                          newlen = 0;
1466 1605                          break;
1467 1606                  case OANY:
1468      -                        if (newlen > g->mlen) {         /* ends one */
     1607 +                        if (newlen > (sopno)g->mlen) {          /* ends one */
1469 1608                                  start = newstart;
1470 1609                                  g->mlen = newlen;
1471 1610                                  if (offset > -1) {
1472 1611                                          g->moffset += offset;
1473 1612                                          offset = newlen;
1474 1613                                  } else
1475 1614                                          g->moffset = offset;
1476 1615                          } else {
1477 1616                                  if (offset > -1)
1478 1617                                          offset += newlen;
1479 1618                          }
1480 1619                          if (offset > -1)
1481 1620                                  offset++;
1482 1621                          newlen = 0;
1483 1622                          break;
1484 1623                  case OANYOF:            /* may or may not invalidate offset */
1485 1624                          /* First, everything as OANY */
1486      -                        if (newlen > g->mlen) {         /* ends one */
     1625 +                        if (newlen > (sopno)g->mlen) {          /* ends one */
1487 1626                                  start = newstart;
1488 1627                                  g->mlen = newlen;
1489 1628                                  if (offset > -1) {
1490 1629                                          g->moffset += offset;
1491 1630                                          offset = newlen;
1492 1631                                  } else
1493 1632                                          g->moffset = offset;
1494 1633                          } else {
1495 1634                                  if (offset > -1)
1496 1635                                          offset += newlen;

1497 1636                          }
1498 1637                          if (offset > -1)
1499 1638                                  offset++;

↓ open down ↓

3 lines elided

↑ open up ↑

1500 1639                          newlen = 0;
1501 1640                          break;
1502 1641                  toohard:
1503 1642                  default:
1504 1643                          /*
1505 1644                           * Anything here makes it impossible or too hard
1506 1645                           * to calculate the offset -- so we give up;
1507 1646                           * save the last known good offset, in case the
1508 1647                           * must sequence doesn't occur later.
1509 1648                           */
1510      -                        if (newlen > g->mlen) {         /* ends one */
     1649 +                        if (newlen > (sopno)g->mlen) {          /* ends one */
1511 1650                                  start = newstart;
1512 1651                                  g->mlen = newlen;
1513 1652                                  if (offset > -1)
1514 1653                                          g->moffset += offset;
1515 1654                                  else
1516 1655                                          g->moffset = offset;
1517 1656                          }
1518 1657                          offset = -1;
1519 1658                          newlen = 0;
1520 1659                          break;

1521 1660                  }
1522 1661          } while (OP(s) != OEND);
1523 1662  
1524 1663          if (g->mlen == 0) {             /* there isn't one */
1525 1664                  g->moffset = -1;
1526 1665                  return;
1527 1666          }
1528 1667  
1529 1668          /* turn it into a character string */
1530 1669          g->must = malloc((size_t)g->mlen + 1);
1531 1670          if (g->must == NULL) {          /* argh; just forget it */
1532 1671                  g->mlen = 0;
1533 1672                  g->moffset = -1;
1534 1673                  return;
1535 1674          }
1536 1675          cp = g->must;
1537 1676          scan = start;
1538 1677          (void) memset(&mbs, 0, sizeof (mbs));
1539 1678          while (cp < g->must + g->mlen) {
1540 1679                  while (OP(s = *scan++) != OCHAR)
1541 1680                          continue;
1542 1681                  clen = wcrtomb(cp, OPND(s), &mbs);
1543 1682                  assert(clen != (size_t)-1);
1544 1683                  cp += clen;
1545 1684          }
1546 1685          assert(cp == g->must + g->mlen);
1547 1686          *cp++ = '\0';           /* just on general principles */
1548 1687  }
1549 1688  
1550 1689  /*
1551 1690   * altoffset - choose biggest offset among multiple choices
1552 1691   *
1553 1692   * Compute, recursively if necessary, the largest offset among multiple
1554 1693   * re paths.
1555 1694   */
1556 1695  static int
1557 1696  altoffset(sop *scan, int offset)
1558 1697  {
1559 1698          int largest;

↓ open down ↓

39 lines elided

↑ open up ↑

1560 1699          int try;
1561 1700          sop s;
1562 1701  
1563 1702          /* If we gave up already on offsets, return */
1564 1703          if (offset == -1)
1565 1704                  return (-1);
1566 1705  
1567 1706          largest = 0;
1568 1707          try = 0;
1569 1708          s = *scan++;
1570      -        while (OP(s) != O_QUEST && OP(s) != O_CH) {
     1709 +        while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1571 1710                  switch (OP(s)) {
1572 1711                  case OOR1:
1573 1712                          if (try > largest)
1574 1713                                  largest = try;
1575 1714                          try = 0;
1576 1715                          break;
1577 1716                  case OQUEST_:
1578 1717                  case OCH_:
1579 1718                          try = altoffset(scan, try);
1580 1719                          if (try == -1)
1581 1720                                  return (-1);
1582 1721                          scan--;
1583 1722                          do {
1584 1723                                  scan += OPND(s);
1585 1724                                  s = *scan;
1586      -                                if (OP(s) != O_QUEST && OP(s) != O_CH &&
1587      -                                    OP(s) != OOR2)
     1725 +                                if (OP(s) != (sop)O_QUEST &&
     1726 +                                    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1588 1727                                          return (-1);
1589      -                        } while (OP(s) != O_QUEST && OP(s) != O_CH);
     1728 +                        } while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1590 1729                          /*
1591 1730                           * We must skip to the next position, or we'll
1592 1731                           * leave altoffset() too early.
1593 1732                           */
1594 1733                          scan++;
1595 1734                          break;
1596 1735                  case OANYOF:
1597 1736                  case OCHAR:
1598 1737                  case OANY:
1599 1738                          try++;

1600 1739                          /*FALLTHRU*/
1601 1740                  case OBOW:
1602 1741                  case OEOW:
1603 1742                  case OLPAREN:
1604 1743                  case ORPAREN:
1605 1744                  case OOR2:
1606 1745                          break;
1607 1746                  default:
1608 1747                          try = -1;
1609 1748                          break;
1610 1749                  }
1611 1750                  if (try == -1)
1612 1751                          return (-1);
1613 1752                  s = *scan++;
1614 1753          }
1615 1754  
1616 1755          if (try > largest)
1617 1756                  largest = try;
1618 1757  
1619 1758          return (largest+offset);
1620 1759  }
1621 1760  
1622 1761  /*
1623 1762   * computejumps - compute char jumps for BM scan
1624 1763   *
1625 1764   * This algorithm assumes g->must exists and is has size greater than
1626 1765   * zero. It's based on the algorithm found on Computer Algorithms by
1627 1766   * Sara Baase.
1628 1767   *
1629 1768   * A char jump is the number of characters one needs to jump based on
1630 1769   * the value of the character from the text that was mismatched.
1631 1770   */
1632 1771  static void
1633 1772  computejumps(struct parse *p, struct re_guts *g)
1634 1773  {
1635 1774          int ch;
1636 1775          int mindex;
1637 1776  
1638 1777          /* Avoid making errors worse */
1639 1778          if (p->error != 0)
1640 1779                  return;
1641 1780  
1642 1781          g->charjump = (int *)malloc((NC + 1) * sizeof (int));
1643 1782          if (g->charjump == NULL)        /* Not a fatal error */
1644 1783                  return;
1645 1784          /* Adjust for signed chars, if necessary */
1646 1785          g->charjump = &g->charjump[-(CHAR_MIN)];
1647 1786  
1648 1787          /*
1649 1788           * If the character does not exist in the pattern, the jump
1650 1789           * is equal to the number of characters in the pattern.
1651 1790           */
1652 1791          for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1653 1792                  g->charjump[ch] = g->mlen;
1654 1793  
1655 1794          /*
1656 1795           * If the character does exist, compute the jump that would
1657 1796           * take us to the last character in the pattern equal to it
1658 1797           * (notice that we match right to left, so that last character
1659 1798           * is the first one that would be matched).
1660 1799           */
1661 1800          for (mindex = 0; mindex < g->mlen; mindex++)
1662 1801                  g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1663 1802  }
1664 1803  
1665 1804  /*
1666 1805   * computematchjumps - compute match jumps for BM scan
1667 1806   *
1668 1807   * This algorithm assumes g->must exists and is has size greater than
1669 1808   * zero. It's based on the algorithm found on Computer Algorithms by
1670 1809   * Sara Baase.
1671 1810   *
1672 1811   * A match jump is the number of characters one needs to advance based
1673 1812   * on the already-matched suffix.
1674 1813   * Notice that all values here are minus (g->mlen-1), because of the way
1675 1814   * the search algorithm works.
1676 1815   */
1677 1816  static void
1678 1817  computematchjumps(struct parse *p, struct re_guts *g)
1679 1818  {
1680 1819          int mindex;             /* General "must" iterator */
1681 1820          int suffix;             /* Keeps track of matching suffix */
1682 1821          int ssuffix;            /* Keeps track of suffixes' suffix */
1683 1822          int *pmatches;
1684 1823                                  /*
1685 1824                                   * pmatches[k] points to the next i
1686 1825                                   * such that i+1...mlen is a substring
1687 1826                                   * of k+1...k+mlen-i-1
1688 1827                                   */
1689 1828  
1690 1829          /* Avoid making errors worse */
1691 1830          if (p->error != 0)
1692 1831                  return;
1693 1832  
1694 1833          pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1695 1834          if (pmatches == NULL) {
1696 1835                  g->matchjump = NULL;
1697 1836                  return;
1698 1837          }
1699 1838  
1700 1839          g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1701 1840          if (g->matchjump == NULL) {     /* Not a fatal error */
1702 1841                  free(pmatches);
1703 1842                  return;
1704 1843          }
1705 1844  
1706 1845          /* Set maximum possible jump for each character in the pattern */
1707 1846          for (mindex = 0; mindex < g->mlen; mindex++)
1708 1847                  g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1709 1848  
1710 1849          /* Compute pmatches[] */
1711 1850          for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1712 1851              mindex--, suffix--) {
1713 1852                  pmatches[mindex] = suffix;
1714 1853  
1715 1854                  /*
1716 1855                   * If a mismatch is found, interrupting the substring,
1717 1856                   * compute the matchjump for that position. If no
1718 1857                   * mismatch is found, then a text substring mismatched
1719 1858                   * against the suffix will also mismatch against the
1720 1859                   * substring.
1721 1860                   */
1722 1861                  while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1723 1862                          g->matchjump[suffix] = MIN(g->matchjump[suffix],
1724 1863                              g->mlen - mindex - 1);
1725 1864                          suffix = pmatches[suffix];
1726 1865                  }
1727 1866          }
1728 1867  
1729 1868          /*
1730 1869           * Compute the matchjump up to the last substring found to jump
1731 1870           * to the beginning of the largest must pattern prefix matching
1732 1871           * it's own suffix.
1733 1872           */
1734 1873          for (mindex = 0; mindex <= suffix; mindex++)
1735 1874                  g->matchjump[mindex] = MIN(g->matchjump[mindex],
1736 1875                      g->mlen + suffix - mindex);
1737 1876  
1738 1877          ssuffix = pmatches[suffix];
1739 1878          while (suffix < g->mlen) {
1740 1879                  while (suffix <= ssuffix && suffix < g->mlen) {
1741 1880                          g->matchjump[suffix] = MIN(g->matchjump[suffix],
1742 1881                              g->mlen + ssuffix - suffix);
1743 1882                          suffix++;
1744 1883                  }
1745 1884                  if (suffix < g->mlen)
1746 1885                          ssuffix = pmatches[ssuffix];
1747 1886          }
1748 1887  
1749 1888          free(pmatches);
1750 1889  }
1751 1890  
1752 1891  /*
1753 1892   * pluscount - count + nesting
1754 1893   */
1755 1894  static sopno                    /* nesting depth */
1756 1895  pluscount(struct parse *p, struct re_guts *g)
1757 1896  {
1758 1897          sop *scan;
1759 1898          sop s;
1760 1899          sopno plusnest = 0;
1761 1900          sopno maxnest = 0;
1762 1901  
1763 1902          if (p->error != 0)
1764 1903                  return (0);     /* there may not be an OEND */
1765 1904  
1766 1905          scan = g->strip + 1;
1767 1906          do {
1768 1907                  s = *scan++;
1769 1908                  switch (OP(s)) {
1770 1909                  case OPLUS_:
1771 1910                          plusnest++;
1772 1911                          break;
1773 1912                  case O_PLUS:
1774 1913                          if (plusnest > maxnest)
1775 1914                                  maxnest = plusnest;
1776 1915                          plusnest--;
1777 1916                          break;
1778 1917                  }
1779 1918          } while (OP(s) != OEND);
1780 1919          if (plusnest != 0)
1781 1920                  g->iflags |= BAD;
1782 1921          return (maxnest);
1783 1922  }

↓ open down ↓

184 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX