1 /*
   2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
   3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   4  * Copyright 2012 Milan Jurik. All rights reserved.
   5  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   6  * Copyright (c) 1992, 1993, 1994
   7  *      The Regents of the University of California.  All rights reserved.
   8  *
   9  * This code is derived from software contributed to Berkeley by
  10  * Henry Spencer.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 4. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36 
  37 #include "lint.h"
  38 #include "file64.h"
  39 #include <sys/types.h>
  40 #include <stdio.h>
  41 #include <string.h>
  42 #include <ctype.h>
  43 #include <limits.h>
  44 #include <stdlib.h>
  45 #include <regex.h>
  46 #include <wchar.h>
  47 #include <wctype.h>
  48 
  49 #include "runetype.h"
  50 #include "collate.h"
  51 
  52 #include "utils.h"
  53 #include "regex2.h"
  54 
  55 #include "cname.h"
  56 #include "mblocal.h"
  57 
  58 /*
  59  * parse structure, passed up and down to avoid global variables and
  60  * other clumsinesses
  61  */
  62 struct parse {
  63         char *next;             /* next character in RE */
  64         char *end;              /* end of string (-> NUL normally) */
  65         int error;              /* has an error been seen? */
  66         sop *strip;             /* malloced strip */
  67         sopno ssize;            /* malloced strip size (allocated) */
  68         sopno slen;             /* malloced strip length (used) */
  69         int ncsalloc;           /* number of csets allocated */
  70         struct re_guts *g;
  71 #define NPAREN  10              /* we need to remember () 1-9 for back refs */
  72         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
  73         sopno pend[NPAREN];     /* -> ) ([0] unused) */
  74 };
  75 
  76 /* ========= begin header generated by ./mkh ========= */
  77 #ifdef __cplusplus
  78 extern "C" {
  79 #endif
  80 
  81 /* === regcomp.c === */
  82 static void p_ere(struct parse *p, wint_t stop);
  83 static void p_ere_exp(struct parse *p);
  84 static void p_str(struct parse *p);
  85 static void p_bre(struct parse *p, wint_t end1, wint_t end2);
  86 static int p_simp_re(struct parse *p, int starordinary);
  87 static int p_count(struct parse *p);
  88 static void p_bracket(struct parse *p);
  89 static void p_b_term(struct parse *p, cset *cs);
  90 static void p_b_cclass(struct parse *p, cset *cs);
  91 static void p_b_eclass(struct parse *p, cset *cs);
  92 static wint_t p_b_symbol(struct parse *p);
  93 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
  94 static wint_t othercase(wint_t ch);
  95 static void bothcases(struct parse *p, wint_t ch);
  96 static void ordinary(struct parse *p, wint_t ch);
  97 static void nonnewline(struct parse *p);
  98 static void repeat(struct parse *p, sopno start, int from, int to);
  99 static int seterr(struct parse *p, int e);
 100 static cset *allocset(struct parse *p);
 101 static void freeset(struct parse *p, cset *cs);
 102 static void CHadd(struct parse *p, cset *cs, wint_t ch);
 103 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
 104 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
 105 static wint_t singleton(cset *cs);
 106 static sopno dupl(struct parse *p, sopno start, sopno finish);
 107 static void doemit(struct parse *p, sop op, size_t opnd);
 108 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
 109 static void dofwd(struct parse *p, sopno pos, sop value);
 110 static void enlarge(struct parse *p, sopno size);
 111 static void stripsnug(struct parse *p, struct re_guts *g);
 112 static void findmust(struct parse *p, struct re_guts *g);
 113 static int altoffset(sop *scan, int offset);
 114 static void computejumps(struct parse *p, struct re_guts *g);
 115 static void computematchjumps(struct parse *p, struct re_guts *g);
 116 static sopno pluscount(struct parse *p, struct re_guts *g);
 117 static wint_t wgetnext(struct parse *p);
 118 
 119 #ifdef __cplusplus
 120 }
 121 #endif
 122 /* ========= end header generated by ./mkh ========= */
 123 
 124 static char nuls[10];           /* place to point scanner in event of error */
 125 
 126 /*
 127  * macros for use with parse structure
 128  * BEWARE:  these know that the parse structure is named `p' !!!
 129  */
 130 #define PEEK()  (*p->next)
 131 #define PEEK2() (*(p->next+1))
 132 #define MORE()  (p->next < p->end)
 133 #define MORE2() (p->next+1 < p->end)
 134 #define SEE(c)  (MORE() && PEEK() == (c))
 135 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
 136 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
 137 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 138 #define NEXT()  (p->next++)
 139 #define NEXT2() (p->next += 2)
 140 #define NEXTn(n)        (p->next += (n))
 141 #define GETNEXT()       (*p->next++)
 142 #define WGETNEXT()      wgetnext(p)
 143 #define SETERROR(e)     ((void)seterr(p, (e)))
 144 #define REQUIRE(co, e)  ((co) || seterr(p, e))
 145 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
 146 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
 147 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
 148 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
 149 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
 150 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
 151 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
 152 #define HERE()          (p->slen)
 153 #define THERE()         (p->slen - 1)
 154 #define THERETHERE()    (p->slen - 2)
 155 #define DROP(n) (p->slen -= (n))
 156 
 157 #ifndef NDEBUG
 158 static int never = 0;           /* for use in asserts; shuts lint up */
 159 #else
 160 #define never   0               /* some <assert.h>s have bugs too */
 161 #endif
 162 
 163 /*
 164  * regcomp - interface for parser and compilation
 165  */
 166 int                             /* 0 success, otherwise REG_something */
 167 regcomp(regex_t *_RESTRICT_KYWD preg,
 168         const char *_RESTRICT_KYWD pattern,
 169         int cflags)
 170 {
 171         struct parse pa;
 172         struct re_guts *g;
 173         struct parse *p = &pa;
 174         int i;
 175         size_t len;
 176 #ifdef REDEBUG
 177 #define GOODFLAGS(f)    (f)
 178 #else
 179 #define GOODFLAGS(f)    ((f)&~REG_DUMP)
 180 #endif
 181 
 182         /* We had REG_INVARG, but we don't have that on Solaris. */
 183         cflags = GOODFLAGS(cflags);
 184         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 185                 return (REG_EFATAL);
 186 
 187         if (cflags&REG_PEND) {
 188                 if (preg->re_endp < pattern)
 189                         return (REG_EFATAL);
 190                 len = preg->re_endp - pattern;
 191         } else
 192                 len = strlen((char *)pattern);
 193 
 194         /* do the mallocs early so failure handling is easy */
 195         g = (struct re_guts *)malloc(sizeof (struct re_guts));
 196         if (g == NULL)
 197                 return (REG_ESPACE);
 198         p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;      /* ugh */
 199         p->strip = (sop *)malloc(p->ssize * sizeof (sop));
 200         p->slen = 0;
 201         if (p->strip == NULL) {
 202                 free((char *)g);
 203                 return (REG_ESPACE);
 204         }
 205 
 206         /* set things up */
 207         p->g = g;
 208         p->next = (char *)pattern;   /* convenience; we do not modify it */
 209         p->end = p->next + len;
 210         p->error = 0;
 211         p->ncsalloc = 0;
 212         for (i = 0; i < NPAREN; i++) {
 213                 p->pbegin[i] = 0;
 214                 p->pend[i] = 0;
 215         }
 216         g->sets = NULL;
 217         g->ncsets = 0;
 218         g->cflags = cflags;
 219         g->iflags = 0;
 220         g->nbol = 0;
 221         g->neol = 0;
 222         g->must = NULL;
 223         g->moffset = -1;
 224         g->charjump = NULL;
 225         g->matchjump = NULL;
 226         g->mlen = 0;
 227         g->nsub = 0;
 228         g->backrefs = 0;
 229 
 230         /* do it */
 231         EMIT(OEND, 0);
 232         g->firststate = THERE();
 233         if (cflags&REG_EXTENDED)
 234                 p_ere(p, OUT);
 235         else if (cflags&REG_NOSPEC)
 236                 p_str(p);
 237         else
 238                 p_bre(p, OUT, OUT);
 239         EMIT(OEND, 0);
 240         g->laststate = THERE();
 241 
 242         /* tidy up loose ends and fill things in */
 243         stripsnug(p, g);
 244         findmust(p, g);
 245         /*
 246          * only use Boyer-Moore algorithm if the pattern is bigger
 247          * than three characters
 248          */
 249         if (g->mlen > 3) {
 250                 computejumps(p, g);
 251                 computematchjumps(p, g);
 252                 if (g->matchjump == NULL && g->charjump != NULL) {
 253                         free(g->charjump);
 254                         g->charjump = NULL;
 255                 }
 256         }
 257         g->nplus = pluscount(p, g);
 258         g->magic = MAGIC2;
 259         preg->re_nsub = g->nsub;
 260         preg->re_g = g;
 261         preg->re_magic = MAGIC1;
 262 #ifndef REDEBUG
 263         /* not debugging, so can't rely on the assert() in regexec() */
 264         if (g->iflags&BAD)
 265                 SETERROR(REG_EFATAL);
 266 #endif
 267 
 268         /* win or lose, we're done */
 269         if (p->error != 0)   /* lose */
 270                 regfree(preg);
 271         return (p->error);
 272 }
 273 
 274 /*
 275  * p_ere - ERE parser top level, concatenation and alternation
 276  */
 277 static void
 278 p_ere(struct parse *p,
 279     wint_t stop)                /* character this ERE should end at */
 280 {
 281         char c;
 282         sopno prevback;
 283         sopno prevfwd;
 284         sopno conc;
 285         int first = 1;          /* is this the first alternative? */
 286 
 287         for (;;) {
 288                 /* do a bunch of concatenated expressions */
 289                 conc = HERE();
 290                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 291                         p_ere_exp(p);
 292                 /* require nonempty */
 293                 (void) REQUIRE(HERE() != conc, REG_BADPAT);
 294 
 295                 if (!EAT('|'))
 296                         break;          /* NOTE BREAK OUT */
 297 
 298                 if (first) {
 299                         INSERT(OCH_, conc);     /* offset is wrong */
 300                         prevfwd = conc;
 301                         prevback = conc;
 302                         first = 0;
 303                 }
 304                 ASTERN(OOR1, prevback);
 305                 prevback = THERE();
 306                 AHEAD(prevfwd);                 /* fix previous offset */
 307                 prevfwd = HERE();
 308                 EMIT(OOR2, 0);                  /* offset is very wrong */
 309         }
 310 
 311         if (!first) {           /* tail-end fixups */
 312                 AHEAD(prevfwd);
 313                 ASTERN(O_CH, prevback);
 314         }
 315 
 316         assert(!MORE() || SEE(stop));
 317 }
 318 
 319 /*
 320  * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 321  */
 322 static void
 323 p_ere_exp(struct parse *p)
 324 {
 325         char c;
 326         wint_t wc;
 327         sopno pos;
 328         int count;
 329         int count2;
 330         sopno subno;
 331         int wascaret = 0;
 332 
 333         assert(MORE());         /* caller should have ensured this */
 334         c = GETNEXT();
 335 
 336         pos = HERE();
 337         switch (c) {
 338         case '(':
 339                 (void) REQUIRE(MORE(), REG_EPAREN);
 340                 p->g->nsub++;
 341                 subno = p->g->nsub;
 342                 if (subno < NPAREN)
 343                         p->pbegin[subno] = HERE();
 344                 EMIT(OLPAREN, subno);
 345                 if (!SEE(')'))
 346                         p_ere(p, ')');
 347                 if (subno < NPAREN) {
 348                         p->pend[subno] = HERE();
 349                         assert(p->pend[subno] != 0);
 350                 }
 351                 EMIT(ORPAREN, subno);
 352                 (void) MUSTEAT(')', REG_EPAREN);
 353                 break;
 354 #ifndef POSIX_MISTAKE
 355         case ')':               /* happens only if no current unmatched ( */
 356                 /*
 357                  * You may ask, why the ifndef?  Because I didn't notice
 358                  * this until slightly too late for 1003.2, and none of the
 359                  * other 1003.2 regular-expression reviewers noticed it at
 360                  * all.  So an unmatched ) is legal POSIX, at least until
 361                  * we can get it fixed.
 362                  */
 363                 SETERROR(REG_EPAREN);
 364                 break;
 365 #endif
 366         case '^':
 367                 EMIT(OBOL, 0);
 368                 p->g->iflags |= USEBOL;
 369                 p->g->nbol++;
 370                 wascaret = 1;
 371                 break;
 372         case '$':
 373                 EMIT(OEOL, 0);
 374                 p->g->iflags |= USEEOL;
 375                 p->g->neol++;
 376                 break;
 377         case '|':
 378                 SETERROR(REG_BADPAT);
 379                 break;
 380         case '*':
 381         case '+':
 382         case '?':
 383                 SETERROR(REG_BADRPT);
 384                 break;
 385         case '.':
 386                 if (p->g->cflags&REG_NEWLINE)
 387                         nonnewline(p);
 388                 else
 389                         EMIT(OANY, 0);
 390                 break;
 391         case '[':
 392                 p_bracket(p);
 393                 break;
 394         case '\\':
 395                 (void) REQUIRE(MORE(), REG_EESCAPE);
 396                 wc = WGETNEXT();
 397                 switch (wc) {
 398                 case '<':
 399                         EMIT(OBOW, 0);
 400                         break;
 401                 case '>':
 402                         EMIT(OEOW, 0);
 403                         break;
 404                 default:
 405                         ordinary(p, wc);
 406                         break;
 407                 }
 408                 break;
 409         case '{':               /* okay as ordinary except if digit follows */
 410                 (void) REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
 411                 /* FALLTHROUGH */
 412         default:
 413                 p->next--;
 414                 wc = WGETNEXT();
 415                 ordinary(p, wc);
 416                 break;
 417         }
 418 
 419         if (!MORE())
 420                 return;
 421         c = PEEK();
 422         /* we call { a repetition if followed by a digit */
 423         if (!(c == '*' || c == '+' || c == '?' ||
 424             (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
 425                 return;         /* no repetition, we're done */
 426         NEXT();
 427 
 428         (void) REQUIRE(!wascaret, REG_BADRPT);
 429         switch (c) {
 430         case '*':       /* implemented as +? */
 431                 /* this case does not require the (y|) trick, noKLUDGE */
 432                 INSERT(OPLUS_, pos);
 433                 ASTERN(O_PLUS, pos);
 434                 INSERT(OQUEST_, pos);
 435                 ASTERN(O_QUEST, pos);
 436                 break;
 437         case '+':
 438                 INSERT(OPLUS_, pos);
 439                 ASTERN(O_PLUS, pos);
 440                 break;
 441         case '?':
 442                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 443                 INSERT(OCH_, pos);              /* offset slightly wrong */
 444                 ASTERN(OOR1, pos);              /* this one's right */
 445                 AHEAD(pos);                     /* fix the OCH_ */
 446                 EMIT(OOR2, 0);                  /* offset very wrong... */
 447                 AHEAD(THERE());                 /* ...so fix it */
 448                 ASTERN(O_CH, THERETHERE());
 449                 break;
 450         case '{':
 451                 count = p_count(p);
 452                 if (EAT(',')) {
 453                         if (isdigit((uch)PEEK())) {
 454                                 count2 = p_count(p);
 455                                 (void) REQUIRE(count <= count2, REG_BADBR);
 456                         } else          /* single number with comma */
 457                                 count2 = INFINITY;
 458                 } else          /* just a single number */
 459                         count2 = count;
 460                 repeat(p, pos, count, count2);
 461                 if (!EAT('}')) {        /* error heuristics */
 462                         while (MORE() && PEEK() != '}')
 463                                 NEXT();
 464                         (void) REQUIRE(MORE(), REG_EBRACE);
 465                         SETERROR(REG_BADBR);
 466                 }
 467                 break;
 468         }
 469 
 470         if (!MORE())
 471                 return;
 472         c = PEEK();
 473         if (!(c == '*' || c == '+' || c == '?' ||
 474             (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
 475                 return;
 476         SETERROR(REG_BADRPT);
 477 }
 478 
 479 /*
 480  * p_str - string (no metacharacters) "parser"
 481  */
 482 static void
 483 p_str(struct parse *p)
 484 {
 485         (void) REQUIRE(MORE(), REG_BADPAT);
 486         while (MORE())
 487                 ordinary(p, WGETNEXT());
 488 }
 489 
 490 /*
 491  * p_bre - BRE parser top level, anchoring and concatenation
 492  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 493  *
 494  * This implementation is a bit of a kludge, in that a trailing $ is first
 495  * taken as an ordinary character and then revised to be an anchor.
 496  * The amount of lookahead needed to avoid this kludge is excessive.
 497  */
 498 static void
 499 p_bre(struct parse *p,
 500     wint_t end1,                /* first terminating character */
 501     wint_t end2)                /* second terminating character */
 502 {
 503         sopno start = HERE();
 504         int first = 1;                  /* first subexpression? */
 505         int wasdollar = 0;
 506 
 507         if (EAT('^')) {
 508                 EMIT(OBOL, 0);
 509                 p->g->iflags |= USEBOL;
 510                 p->g->nbol++;
 511         }
 512         while (MORE() && !SEETWO(end1, end2)) {
 513                 wasdollar = p_simp_re(p, first);
 514                 first = 0;
 515         }
 516         if (wasdollar) {        /* oops, that was a trailing anchor */
 517                 DROP(1);
 518                 EMIT(OEOL, 0);
 519                 p->g->iflags |= USEEOL;
 520                 p->g->neol++;
 521         }
 522 
 523         (void) REQUIRE(HERE() != start, REG_BADPAT);    /* require nonempty */
 524 }
 525 
 526 /*
 527  * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 528  */
 529 static int                      /* was the simple RE an unbackslashed $? */
 530 p_simp_re(struct parse *p,
 531         int starordinary)       /* is a leading * an ordinary character? */
 532 {
 533         int c;
 534         int count;
 535         int count2;
 536         sopno pos;
 537         int i;
 538         wint_t wc;
 539         sopno subno;
 540 #define BACKSL  (1<<CHAR_BIT)
 541 
 542         pos = HERE();           /* repetion op, if any, covers from here */
 543 
 544         assert(MORE());         /* caller should have ensured this */
 545         c = GETNEXT();
 546         if (c == '\\') {
 547                 (void) REQUIRE(MORE(), REG_EESCAPE);
 548                 c = BACKSL | GETNEXT();
 549         }
 550         switch (c) {
 551         case '.':
 552                 if (p->g->cflags&REG_NEWLINE)
 553                         nonnewline(p);
 554                 else
 555                         EMIT(OANY, 0);
 556                 break;
 557         case '[':
 558                 p_bracket(p);
 559                 break;
 560         case BACKSL|'<':
 561                 EMIT(OBOW, 0);
 562                 break;
 563         case BACKSL|'>':
 564                 EMIT(OEOW, 0);
 565                 break;
 566         case BACKSL|'{':
 567                 SETERROR(REG_BADRPT);
 568                 break;
 569         case BACKSL|'(':
 570                 p->g->nsub++;
 571                 subno = p->g->nsub;
 572                 if (subno < NPAREN)
 573                         p->pbegin[subno] = HERE();
 574                 EMIT(OLPAREN, subno);
 575                 /* the MORE here is an error heuristic */
 576                 if (MORE() && !SEETWO('\\', ')'))
 577                         p_bre(p, '\\', ')');
 578                 if (subno < NPAREN) {
 579                         p->pend[subno] = HERE();
 580                         assert(p->pend[subno] != 0);
 581                 }
 582                 EMIT(ORPAREN, subno);
 583                 (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 584                 break;
 585         case BACKSL|')':        /* should not get here -- must be user */
 586         case BACKSL|'}':
 587                 SETERROR(REG_EPAREN);
 588                 break;
 589         case BACKSL|'1':
 590         case BACKSL|'2':
 591         case BACKSL|'3':
 592         case BACKSL|'4':
 593         case BACKSL|'5':
 594         case BACKSL|'6':
 595         case BACKSL|'7':
 596         case BACKSL|'8':
 597         case BACKSL|'9':
 598                 i = (c&~BACKSL) - '0';
 599                 assert(i < NPAREN);
 600                 if (p->pend[i] != 0) {
 601                         assert(i <= p->g->nsub);
 602                         EMIT(OBACK_, i);
 603                         assert(p->pbegin[i] != 0);
 604                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 605                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 606                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 607                         EMIT(O_BACK, i);
 608                 } else
 609                         SETERROR(REG_ESUBREG);
 610                 p->g->backrefs = 1;
 611                 break;
 612         case '*':
 613                 (void) REQUIRE(starordinary, REG_BADRPT);
 614                 /* FALLTHROUGH */
 615         default:
 616                 p->next--;
 617                 wc = WGETNEXT();
 618                 ordinary(p, wc);
 619                 break;
 620         }
 621 
 622         if (EAT('*')) {         /* implemented as +? */
 623                 /* this case does not require the (y|) trick, noKLUDGE */
 624                 INSERT(OPLUS_, pos);
 625                 ASTERN(O_PLUS, pos);
 626                 INSERT(OQUEST_, pos);
 627                 ASTERN(O_QUEST, pos);
 628         } else if (EATTWO('\\', '{')) {
 629                 count = p_count(p);
 630                 if (EAT(',')) {
 631                         if (MORE() && isdigit((uch)PEEK())) {
 632                                 count2 = p_count(p);
 633                                 (void) REQUIRE(count <= count2, REG_BADBR);
 634                         } else          /* single number with comma */
 635                                 count2 = INFINITY;
 636                 } else          /* just a single number */
 637                         count2 = count;
 638                 repeat(p, pos, count, count2);
 639                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 640                         while (MORE() && !SEETWO('\\', '}'))
 641                                 NEXT();
 642                         (void) REQUIRE(MORE(), REG_EBRACE);
 643                         SETERROR(REG_BADBR);
 644                 }
 645         } else if (c == '$')    /* $ (but not \$) ends it */
 646                 return (1);
 647 
 648         return (0);
 649 }
 650 
 651 /*
 652  * p_count - parse a repetition count
 653  */
 654 static int                      /* the value */
 655 p_count(struct parse *p)
 656 {
 657         int count = 0;
 658         int ndigits = 0;
 659 
 660         while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
 661                 count = count*10 + (GETNEXT() - '0');
 662                 ndigits++;
 663         }
 664 
 665         (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 666         return (count);
 667 }
 668 
 669 /*
 670  * p_bracket - parse a bracketed character list
 671  */
 672 static void
 673 p_bracket(struct parse *p)
 674 {
 675         cset *cs;
 676         wint_t ch;
 677 
 678         /* Dept of Truly Sickening Special-Case Kludges */
 679         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
 680                 EMIT(OBOW, 0);
 681                 NEXTn(6);
 682                 return;
 683         }
 684         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
 685                 EMIT(OEOW, 0);
 686                 NEXTn(6);
 687                 return;
 688         }
 689 
 690         if ((cs = allocset(p)) == NULL)
 691                 return;
 692 
 693         if (p->g->cflags&REG_ICASE)
 694                 cs->icase = 1;
 695         if (EAT('^'))
 696                 cs->invert = 1;
 697         if (EAT(']'))
 698                 CHadd(p, cs, ']');
 699         else if (EAT('-'))
 700                 CHadd(p, cs, '-');
 701         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 702                 p_b_term(p, cs);
 703         if (EAT('-'))
 704                 CHadd(p, cs, '-');
 705         (void) MUSTEAT(']', REG_EBRACK);
 706 
 707         if (p->error != 0)   /* don't mess things up further */
 708                 return;
 709 
 710         if (cs->invert && p->g->cflags&REG_NEWLINE)
 711                 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
 712 
 713         if ((ch = singleton(cs)) != OUT) {      /* optimize singleton sets */
 714                 ordinary(p, ch);
 715                 freeset(p, cs);
 716         } else
 717                 EMIT(OANYOF, (int)(cs - p->g->sets));
 718 }
 719 
 720 /*
 721  * p_b_term - parse one term of a bracketed character list
 722  */
 723 static void
 724 p_b_term(struct parse *p, cset *cs)
 725 {
 726         char c;
 727         wint_t start, finish;
 728         wint_t i;
 729         locale_t loc = uselocale(NULL);
 730 
 731         /* classify what we've got */
 732         switch ((MORE()) ? PEEK() : '\0') {
 733         case '[':
 734                 c = (MORE2()) ? PEEK2() : '\0';
 735                 break;
 736         case '-':
 737                 SETERROR(REG_ERANGE);
 738                 return;                 /* NOTE RETURN */
 739         default:
 740                 c = '\0';
 741                 break;
 742         }
 743 
 744         switch (c) {
 745         case ':':               /* character class */
 746                 NEXT2();
 747                 (void) REQUIRE(MORE(), REG_EBRACK);
 748                 c = PEEK();
 749                 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 750                 p_b_cclass(p, cs);
 751                 (void) REQUIRE(MORE(), REG_EBRACK);
 752                 (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 753                 break;
 754         case '=':               /* equivalence class */
 755                 NEXT2();
 756                 (void) REQUIRE(MORE(), REG_EBRACK);
 757                 c = PEEK();
 758                 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 759                 p_b_eclass(p, cs);
 760                 (void) REQUIRE(MORE(), REG_EBRACK);
 761                 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 762                 break;
 763         default:                /* symbol, ordinary character, or range */
 764                 start = p_b_symbol(p);
 765                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 766                         /* range */
 767                         NEXT();
 768                         if (EAT('-'))
 769                                 finish = '-';
 770                         else
 771                                 finish = p_b_symbol(p);
 772                 } else
 773                         finish = start;
 774                 if (start == finish)
 775                         CHadd(p, cs, start);
 776                 else {
 777                         if (loc->collate->lc_is_posix) {
 778                                 (void) REQUIRE((uch)start <= (uch)finish,
 779                                     REG_ERANGE);
 780                                 CHaddrange(p, cs, start, finish);
 781                         } else {
 782                                 (void) REQUIRE(_collate_range_cmp(start,
 783                                     finish, loc) <= 0, REG_ERANGE);
 784                                 for (i = 0; i <= UCHAR_MAX; i++) {
 785                                         if (_collate_range_cmp(start, i, loc)
 786                                             <= 0 &&
 787                                             _collate_range_cmp(i, finish, loc)
 788                                             <= 0)
 789                                                 CHadd(p, cs, i);
 790                                 }
 791                         }
 792                 }
 793                 break;
 794         }
 795 }
 796 
 797 /*
 798  * p_b_cclass - parse a character-class name and deal with it
 799  */
 800 static void
 801 p_b_cclass(struct parse *p, cset *cs)
 802 {
 803         char *sp = p->next;
 804         size_t len;
 805         wctype_t wct;
 806         char clname[16];
 807 
 808         while (MORE() && isalpha((uch)PEEK()))
 809                 NEXT();
 810         len = p->next - sp;
 811         if (len >= sizeof (clname) - 1) {
 812                 SETERROR(REG_ECTYPE);
 813                 return;
 814         }
 815         (void) memcpy(clname, sp, len);
 816         clname[len] = '\0';
 817         if ((wct = wctype(clname)) == 0) {
 818                 SETERROR(REG_ECTYPE);
 819                 return;
 820         }
 821         CHaddtype(p, cs, wct);
 822 }
 823 
 824 /*
 825  * p_b_eclass - parse an equivalence-class name and deal with it
 826  *
 827  * This implementation is incomplete. xxx
 828  */
 829 static void
 830 p_b_eclass(struct parse *p, cset *cs)
 831 {
 832         wint_t c;
 833 
 834         c = p_b_coll_elem(p, '=');
 835         CHadd(p, cs, c);
 836 }
 837 
 838 /*
 839  * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 840  */
 841 static wint_t                   /* value of symbol */
 842 p_b_symbol(struct parse *p)
 843 {
 844         wint_t value;
 845 
 846         (void) REQUIRE(MORE(), REG_EBRACK);
 847         if (!EATTWO('[', '.'))
 848                 return (WGETNEXT());
 849 
 850         /* collating symbol */
 851         value = p_b_coll_elem(p, '.');
 852         (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 853         return (value);
 854 }
 855 
 856 /*
 857  * p_b_coll_elem - parse a collating-element name and look it up
 858  */
 859 static wint_t                   /* value of collating element */
 860 p_b_coll_elem(struct parse *p,
 861         wint_t endc)            /* name ended by endc,']' */
 862 {
 863         char *sp = p->next;
 864         struct cname *cp;
 865         int len;
 866         mbstate_t mbs;
 867         wchar_t wc;
 868         size_t clen;
 869 
 870         while (MORE() && !SEETWO(endc, ']'))
 871                 NEXT();
 872         if (!MORE()) {
 873                 SETERROR(REG_EBRACK);
 874                 return (0);
 875         }
 876         len = p->next - sp;
 877         for (cp = cnames; cp->name != NULL; cp++)
 878                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 879                         return (cp->code);   /* known name */
 880         (void) memset(&mbs, 0, sizeof (mbs));
 881         if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
 882                 return (wc);                    /* single character */
 883         else if (clen == (size_t)-1 || clen == (size_t)-2)
 884                 SETERROR(REG_ECHAR);
 885         else
 886                 SETERROR(REG_ECOLLATE);         /* neither */
 887         return (0);
 888 }
 889 
 890 /*
 891  * othercase - return the case counterpart of an alphabetic
 892  */
 893 static wint_t                   /* if no counterpart, return ch */
 894 othercase(wint_t ch)
 895 {
 896         assert(iswalpha(ch));
 897         if (iswupper(ch))
 898                 return (towlower(ch));
 899         else if (iswlower(ch))
 900                 return (towupper(ch));
 901         else                    /* peculiar, but could happen */
 902                 return (ch);
 903 }
 904 
 905 /*
 906  * bothcases - emit a dualcase version of a two-case character
 907  *
 908  * Boy, is this implementation ever a kludge...
 909  */
 910 static void
 911 bothcases(struct parse *p, wint_t ch)
 912 {
 913         char *oldnext = p->next;
 914         char *oldend = p->end;
 915         char bracket[3 + MB_LEN_MAX];
 916         size_t n;
 917         mbstate_t mbs;
 918 
 919         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
 920         p->next = bracket;
 921         (void) memset(&mbs, 0, sizeof (mbs));
 922         n = wcrtomb(bracket, ch, &mbs);
 923         assert(n != (size_t)-1);
 924         bracket[n] = ']';
 925         bracket[n + 1] = '\0';
 926         p->end = bracket+n+1;
 927         p_bracket(p);
 928         assert(p->next == p->end);
 929         p->next = oldnext;
 930         p->end = oldend;
 931 }
 932 
 933 /*
 934  * ordinary - emit an ordinary character
 935  */
 936 static void
 937 ordinary(struct parse *p, wint_t ch)
 938 {
 939         cset *cs;
 940 
 941         if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
 942                 bothcases(p, ch);
 943         else if ((ch & OPDMASK) == ch)
 944                 EMIT(OCHAR, ch);
 945         else {
 946                 /*
 947                  * Kludge: character is too big to fit into an OCHAR operand.
 948                  * Emit a singleton set.
 949                  */
 950                 if ((cs = allocset(p)) == NULL)
 951                         return;
 952                 CHadd(p, cs, ch);
 953                 EMIT(OANYOF, (int)(cs - p->g->sets));
 954         }
 955 }
 956 
 957 /*
 958  * nonnewline - emit REG_NEWLINE version of OANY
 959  *
 960  * Boy, is this implementation ever a kludge...
 961  */
 962 static void
 963 nonnewline(struct parse *p)
 964 {
 965         char *oldnext = p->next;
 966         char *oldend = p->end;
 967         char bracket[4];
 968 
 969         p->next = bracket;
 970         p->end = bracket+3;
 971         bracket[0] = '^';
 972         bracket[1] = '\n';
 973         bracket[2] = ']';
 974         bracket[3] = '\0';
 975         p_bracket(p);
 976         assert(p->next == bracket+3);
 977         p->next = oldnext;
 978         p->end = oldend;
 979 }
 980 
 981 /*
 982  * repeat - generate code for a bounded repetition, recursively if needed
 983  */
 984 static void
 985 repeat(struct parse *p,
 986     sopno start,                /* operand from here to end of strip */
 987     int from,                   /* repeated from this number */
 988     int to)                     /* to this number of times (maybe INFINITY) */
 989 {
 990         sopno finish = HERE();
 991 #define N       2
 992 #define INF     3
 993 #define REP(f, t)       ((f)*8 + (t))
 994 #define MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
 995         sopno copy;
 996 
 997         if (p->error != 0)   /* head off possible runaway recursion */
 998                 return;
 999 
1000         assert(from <= to);
1001 
1002         switch (REP(MAP(from), MAP(to))) {
1003         case REP(0, 0):                 /* must be user doing this */
1004                 DROP(finish-start);     /* drop the operand */
1005                 break;
1006         case REP(0, 1):                 /* as x{1,1}? */
1007         case REP(0, N):                 /* as x{1,n}? */
1008         case REP(0, INF):               /* as x{1,}? */
1009                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1010                 INSERT(OCH_, start);            /* offset is wrong... */
1011                 repeat(p, start+1, 1, to);
1012                 ASTERN(OOR1, start);
1013                 AHEAD(start);                   /* ... fix it */
1014                 EMIT(OOR2, 0);
1015                 AHEAD(THERE());
1016                 ASTERN(O_CH, THERETHERE());
1017                 break;
1018         case REP(1, 1):                 /* trivial case */
1019                 /* done */
1020                 break;
1021         case REP(1, N):                 /* as x?x{1,n-1} */
1022                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1023                 INSERT(OCH_, start);
1024                 ASTERN(OOR1, start);
1025                 AHEAD(start);
1026                 EMIT(OOR2, 0);                  /* offset very wrong... */
1027                 AHEAD(THERE());                 /* ...so fix it */
1028                 ASTERN(O_CH, THERETHERE());
1029                 copy = dupl(p, start+1, finish+1);
1030                 assert(copy == finish+4);
1031                 repeat(p, copy, 1, to-1);
1032                 break;
1033         case REP(1, INF):               /* as x+ */
1034                 INSERT(OPLUS_, start);
1035                 ASTERN(O_PLUS, start);
1036                 break;
1037         case REP(N, N):                 /* as xx{m-1,n-1} */
1038                 copy = dupl(p, start, finish);
1039                 repeat(p, copy, from-1, to-1);
1040                 break;
1041         case REP(N, INF):               /* as xx{n-1,INF} */
1042                 copy = dupl(p, start, finish);
1043                 repeat(p, copy, from-1, to);
1044                 break;
1045         default:                        /* "can't happen" */
1046                 SETERROR(REG_EFATAL);   /* just in case */
1047                 break;
1048         }
1049 }
1050 
1051 /*
1052  * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1053  * character from the parse struct, signals a REG_ILLSEQ error if the
1054  * character can't be converted. Returns the number of bytes consumed.
1055  */
1056 static wint_t
1057 wgetnext(struct parse *p)
1058 {
1059         mbstate_t mbs;
1060         wchar_t wc;
1061         size_t n;
1062 
1063         (void) memset(&mbs, 0, sizeof (mbs));
1064         n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1065         if (n == (size_t)-1 || n == (size_t)-2) {
1066                 SETERROR(REG_ECHAR);
1067                 return (0);
1068         }
1069         if (n == 0)
1070                 n = 1;
1071         p->next += n;
1072         return (wc);
1073 }
1074 
1075 /*
1076  * seterr - set an error condition
1077  */
1078 static int                      /* useless but makes type checking happy */
1079 seterr(struct parse *p, int e)
1080 {
1081         if (p->error == 0)   /* keep earliest error condition */
1082                 p->error = e;
1083         p->next = nuls;              /* try to bring things to a halt */
1084         p->end = nuls;
1085         return (0);             /* make the return value well-defined */
1086 }
1087 
1088 /*
1089  * allocset - allocate a set of characters for []
1090  */
1091 static cset *
1092 allocset(struct parse *p)
1093 {
1094         cset *cs, *ncs;
1095 
1096         ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1097         if (ncs == NULL) {
1098                 SETERROR(REG_ESPACE);
1099                 return (NULL);
1100         }
1101         p->g->sets = ncs;
1102         cs = &p->g->sets[p->g->ncsets++];
1103         (void) memset(cs, 0, sizeof (*cs));
1104 
1105         return (cs);
1106 }
1107 
1108 /*
1109  * freeset - free a now-unused set
1110  */
1111 static void
1112 freeset(struct parse *p, cset *cs)
1113 {
1114         cset *top = &p->g->sets[p->g->ncsets];
1115 
1116         free(cs->wides);
1117         free(cs->ranges);
1118         free(cs->types);
1119         (void) memset(cs, 0, sizeof (*cs));
1120         if (cs == top-1)        /* recover only the easy case */
1121                 p->g->ncsets--;
1122 }
1123 
1124 /*
1125  * singleton - Determine whether a set contains only one character,
1126  * returning it if so, otherwise returning OUT.
1127  */
1128 static wint_t
1129 singleton(cset *cs)
1130 {
1131         wint_t i, s, n;
1132 
1133         for (i = n = 0; i < NC; i++)
1134                 if (CHIN(cs, i)) {
1135                         n++;
1136                         s = i;
1137                 }
1138         if (n == 1)
1139                 return (s);
1140         if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1141             cs->icase == 0)
1142                 return (cs->wides[0]);
1143         /* Don't bother handling the other cases. */
1144         return (OUT);
1145 }
1146 
1147 /*
1148  * CHadd - add character to character set.
1149  */
1150 static void
1151 CHadd(struct parse *p, cset *cs, wint_t ch)
1152 {
1153         wint_t nch, *newwides;
1154         assert(ch >= 0);
1155         if (ch < NC)
1156                 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1157         else {
1158                 newwides = realloc(cs->wides, (cs->nwides + 1) *
1159                     sizeof (*cs->wides));
1160                 if (newwides == NULL) {
1161                         SETERROR(REG_ESPACE);
1162                         return;
1163                 }
1164                 cs->wides = newwides;
1165                 cs->wides[cs->nwides++] = ch;
1166         }
1167         if (cs->icase) {
1168                 if ((nch = towlower(ch)) < NC)
1169                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1170                 if ((nch = towupper(ch)) < NC)
1171                         cs->bmp[nch >> 3] |= 1 << (nch & 7);
1172         }
1173 }
1174 
1175 /*
1176  * CHaddrange - add all characters in the range [min,max] to a character set.
1177  */
1178 static void
1179 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1180 {
1181         crange *newranges;
1182 
1183         for (; min < NC && min <= max; min++)
1184                 CHadd(p, cs, min);
1185         if (min >= max)
1186                 return;
1187         newranges = realloc(cs->ranges, (cs->nranges + 1) *
1188             sizeof (*cs->ranges));
1189         if (newranges == NULL) {
1190                 SETERROR(REG_ESPACE);
1191                 return;
1192         }
1193         cs->ranges = newranges;
1194         cs->ranges[cs->nranges].min = min;
1195         cs->ranges[cs->nranges].min = max;
1196         cs->nranges++;
1197 }
1198 
1199 /*
1200  * CHaddtype - add all characters of a certain type to a character set.
1201  */
1202 static void
1203 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1204 {
1205         wint_t i;
1206         wctype_t *newtypes;
1207 
1208         for (i = 0; i < NC; i++)
1209                 if (iswctype(i, wct))
1210                         CHadd(p, cs, i);
1211         newtypes = realloc(cs->types, (cs->ntypes + 1) *
1212             sizeof (*cs->types));
1213         if (newtypes == NULL) {
1214                 SETERROR(REG_ESPACE);
1215                 return;
1216         }
1217         cs->types = newtypes;
1218         cs->types[cs->ntypes++] = wct;
1219 }
1220 
1221 /*
1222  * dupl - emit a duplicate of a bunch of sops
1223  */
1224 static sopno                    /* start of duplicate */
1225 dupl(struct parse *p,
1226         sopno start,            /* from here */
1227         sopno finish)           /* to this less one */
1228 {
1229         sopno ret = HERE();
1230         sopno len = finish - start;
1231 
1232         assert(finish >= start);
1233         if (len == 0)
1234                 return (ret);
1235         enlarge(p, p->ssize + len);  /* this many unexpected additions */
1236         assert(p->ssize >= p->slen + len);
1237         (void) memcpy((char *)(p->strip + p->slen),
1238             (char *)(p->strip + start), (size_t)len*sizeof (sop));
1239         p->slen += len;
1240         return (ret);
1241 }
1242 
1243 /*
1244  * doemit - emit a strip operator
1245  *
1246  * It might seem better to implement this as a macro with a function as
1247  * hard-case backup, but it's just too big and messy unless there are
1248  * some changes to the data structures.  Maybe later.
1249  */
1250 static void
1251 doemit(struct parse *p, sop op, size_t opnd)
1252 {
1253         /* avoid making error situations worse */
1254         if (p->error != 0)
1255                 return;
1256 
1257         /* deal with oversize operands ("can't happen", more or less) */
1258         assert(opnd < 1<<OPSHIFT);
1259 
1260         /* deal with undersized strip */
1261         if (p->slen >= p->ssize)
1262                 enlarge(p, (p->ssize+1) / 2 * 3);    /* +50% */
1263         assert(p->slen < p->ssize);
1264 
1265         /* finally, it's all reduced to the easy case */
1266         p->strip[p->slen++] = SOP(op, opnd);
1267 }
1268 
1269 /*
1270  * doinsert - insert a sop into the strip
1271  */
1272 static void
1273 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1274 {
1275         sopno sn;
1276         sop s;
1277         int i;
1278 
1279         /* avoid making error situations worse */
1280         if (p->error != 0)
1281                 return;
1282 
1283         sn = HERE();
1284         EMIT(op, opnd);         /* do checks, ensure space */
1285         assert(HERE() == sn+1);
1286         s = p->strip[sn];
1287 
1288         /* adjust paren pointers */
1289         assert(pos > 0);
1290         for (i = 1; i < NPAREN; i++) {
1291                 if (p->pbegin[i] >= pos) {
1292                         p->pbegin[i]++;
1293                 }
1294                 if (p->pend[i] >= pos) {
1295                         p->pend[i]++;
1296                 }
1297         }
1298 
1299         (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1300             (HERE()-pos-1)*sizeof (sop));
1301         p->strip[pos] = s;
1302 }
1303 
1304 /*
1305  * dofwd - complete a forward reference
1306  */
1307 static void
1308 dofwd(struct parse *p, sopno pos, sop value)
1309 {
1310         /* avoid making error situations worse */
1311         if (p->error != 0)
1312                 return;
1313 
1314         assert(value < 1<<OPSHIFT);
1315         p->strip[pos] = OP(p->strip[pos]) | value;
1316 }
1317 
1318 /*
1319  * enlarge - enlarge the strip
1320  */
1321 static void
1322 enlarge(struct parse *p, sopno size)
1323 {
1324         sop *sp;
1325 
1326         if (p->ssize >= size)
1327                 return;
1328 
1329         sp = (sop *)realloc(p->strip, size*sizeof (sop));
1330         if (sp == NULL) {
1331                 SETERROR(REG_ESPACE);
1332                 return;
1333         }
1334         p->strip = sp;
1335         p->ssize = size;
1336 }
1337 
1338 /*
1339  * stripsnug - compact the strip
1340  */
1341 static void
1342 stripsnug(struct parse *p, struct re_guts *g)
1343 {
1344         g->nstates = p->slen;
1345         g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1346         if (g->strip == NULL) {
1347                 SETERROR(REG_ESPACE);
1348                 g->strip = p->strip;
1349         }
1350 }
1351 
1352 /*
1353  * findmust - fill in must and mlen with longest mandatory literal string
1354  *
1355  * This algorithm could do fancy things like analyzing the operands of |
1356  * for common subsequences.  Someday.  This code is simple and finds most
1357  * of the interesting cases.
1358  *
1359  * Note that must and mlen got initialized during setup.
1360  */
1361 static void
1362 findmust(struct parse *p, struct re_guts *g)
1363 {
1364         sop *scan;
1365         sop *start;
1366         sop *newstart;
1367         sopno newlen;
1368         sop s;
1369         char *cp;
1370         int offset;
1371         char buf[MB_LEN_MAX];
1372         size_t clen;
1373         mbstate_t mbs;
1374         locale_t loc = uselocale(NULL);
1375 
1376         /* avoid making error situations worse */
1377         if (p->error != 0)
1378                 return;
1379 
1380         /*
1381          * It's not generally safe to do a ``char'' substring search on
1382          * multibyte character strings, but it's safe for at least
1383          * UTF-8 (see RFC 3629).
1384          */
1385         if (MB_CUR_MAX > 1 &&
1386             strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1387                 return;
1388 
1389         /* find the longest OCHAR sequence in strip */
1390         newlen = 0;
1391         offset = 0;
1392         g->moffset = 0;
1393         scan = g->strip + 1;
1394         do {
1395                 s = *scan++;
1396                 switch (OP(s)) {
1397                 case OCHAR:             /* sequence member */
1398                         if (newlen == 0) {              /* new sequence */
1399                                 (void) memset(&mbs, 0, sizeof (mbs));
1400                                 newstart = scan - 1;
1401                         }
1402                         clen = wcrtomb(buf, OPND(s), &mbs);
1403                         if (clen == (size_t)-1)
1404                                 goto toohard;
1405                         newlen += clen;
1406                         break;
1407                 case OPLUS_:            /* things that don't break one */
1408                 case OLPAREN:
1409                 case ORPAREN:
1410                         break;
1411                 case OQUEST_:           /* things that must be skipped */
1412                 case OCH_:
1413                         offset = altoffset(scan, offset);
1414                         scan--;
1415                         do {
1416                                 scan += OPND(s);
1417                                 s = *scan;
1418                                 /* assert() interferes w debug printouts */
1419                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1420                                     OP(s) != OOR2) {
1421                                         g->iflags |= BAD;
1422                                         return;
1423                                 }
1424                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1425                         /* FALLTHROUGH */
1426                 case OBOW:              /* things that break a sequence */
1427                 case OEOW:
1428                 case OBOL:
1429                 case OEOL:
1430                 case O_QUEST:
1431                 case O_CH:
1432                 case OEND:
1433                         if (newlen > g->mlen) {           /* ends one */
1434                                 start = newstart;
1435                                 g->mlen = newlen;
1436                                 if (offset > -1) {
1437                                         g->moffset += offset;
1438                                         offset = newlen;
1439                                 } else
1440                                         g->moffset = offset;
1441                         } else {
1442                                 if (offset > -1)
1443                                         offset += newlen;
1444                         }
1445                         newlen = 0;
1446                         break;
1447                 case OANY:
1448                         if (newlen > g->mlen) {           /* ends one */
1449                                 start = newstart;
1450                                 g->mlen = newlen;
1451                                 if (offset > -1) {
1452                                         g->moffset += offset;
1453                                         offset = newlen;
1454                                 } else
1455                                         g->moffset = offset;
1456                         } else {
1457                                 if (offset > -1)
1458                                         offset += newlen;
1459                         }
1460                         if (offset > -1)
1461                                 offset++;
1462                         newlen = 0;
1463                         break;
1464                 case OANYOF:            /* may or may not invalidate offset */
1465                         /* First, everything as OANY */
1466                         if (newlen > g->mlen) {           /* ends one */
1467                                 start = newstart;
1468                                 g->mlen = newlen;
1469                                 if (offset > -1) {
1470                                         g->moffset += offset;
1471                                         offset = newlen;
1472                                 } else
1473                                         g->moffset = offset;
1474                         } else {
1475                                 if (offset > -1)
1476                                         offset += newlen;
1477                         }
1478                         if (offset > -1)
1479                                 offset++;
1480                         newlen = 0;
1481                         break;
1482                 toohard:
1483                 default:
1484                         /*
1485                          * Anything here makes it impossible or too hard
1486                          * to calculate the offset -- so we give up;
1487                          * save the last known good offset, in case the
1488                          * must sequence doesn't occur later.
1489                          */
1490                         if (newlen > g->mlen) {           /* ends one */
1491                                 start = newstart;
1492                                 g->mlen = newlen;
1493                                 if (offset > -1)
1494                                         g->moffset += offset;
1495                                 else
1496                                         g->moffset = offset;
1497                         }
1498                         offset = -1;
1499                         newlen = 0;
1500                         break;
1501                 }
1502         } while (OP(s) != OEND);
1503 
1504         if (g->mlen == 0) {          /* there isn't one */
1505                 g->moffset = -1;
1506                 return;
1507         }
1508 
1509         /* turn it into a character string */
1510         g->must = malloc((size_t)g->mlen + 1);
1511         if (g->must == NULL) {               /* argh; just forget it */
1512                 g->mlen = 0;
1513                 g->moffset = -1;
1514                 return;
1515         }
1516         cp = g->must;
1517         scan = start;
1518         (void) memset(&mbs, 0, sizeof (mbs));
1519         while (cp < g->must + g->mlen) {
1520                 while (OP(s = *scan++) != OCHAR)
1521                         continue;
1522                 clen = wcrtomb(cp, OPND(s), &mbs);
1523                 assert(clen != (size_t)-1);
1524                 cp += clen;
1525         }
1526         assert(cp == g->must + g->mlen);
1527         *cp++ = '\0';           /* just on general principles */
1528 }
1529 
1530 /*
1531  * altoffset - choose biggest offset among multiple choices
1532  *
1533  * Compute, recursively if necessary, the largest offset among multiple
1534  * re paths.
1535  */
1536 static int
1537 altoffset(sop *scan, int offset)
1538 {
1539         int largest;
1540         int try;
1541         sop s;
1542 
1543         /* If we gave up already on offsets, return */
1544         if (offset == -1)
1545                 return (-1);
1546 
1547         largest = 0;
1548         try = 0;
1549         s = *scan++;
1550         while (OP(s) != O_QUEST && OP(s) != O_CH) {
1551                 switch (OP(s)) {
1552                 case OOR1:
1553                         if (try > largest)
1554                                 largest = try;
1555                         try = 0;
1556                         break;
1557                 case OQUEST_:
1558                 case OCH_:
1559                         try = altoffset(scan, try);
1560                         if (try == -1)
1561                                 return (-1);
1562                         scan--;
1563                         do {
1564                                 scan += OPND(s);
1565                                 s = *scan;
1566                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1567                                     OP(s) != OOR2)
1568                                         return (-1);
1569                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1570                         /*
1571                          * We must skip to the next position, or we'll
1572                          * leave altoffset() too early.
1573                          */
1574                         scan++;
1575                         break;
1576                 case OANYOF:
1577                 case OCHAR:
1578                 case OANY:
1579                         try++;
1580                         /*FALLTHRU*/
1581                 case OBOW:
1582                 case OEOW:
1583                 case OLPAREN:
1584                 case ORPAREN:
1585                 case OOR2:
1586                         break;
1587                 default:
1588                         try = -1;
1589                         break;
1590                 }
1591                 if (try == -1)
1592                         return (-1);
1593                 s = *scan++;
1594         }
1595 
1596         if (try > largest)
1597                 largest = try;
1598 
1599         return (largest+offset);
1600 }
1601 
1602 /*
1603  * computejumps - compute char jumps for BM scan
1604  *
1605  * This algorithm assumes g->must exists and is has size greater than
1606  * zero. It's based on the algorithm found on Computer Algorithms by
1607  * Sara Baase.
1608  *
1609  * A char jump is the number of characters one needs to jump based on
1610  * the value of the character from the text that was mismatched.
1611  */
1612 static void
1613 computejumps(struct parse *p, struct re_guts *g)
1614 {
1615         int ch;
1616         int mindex;
1617 
1618         /* Avoid making errors worse */
1619         if (p->error != 0)
1620                 return;
1621 
1622         g->charjump = (int *)malloc((NC + 1) * sizeof (int));
1623         if (g->charjump == NULL)     /* Not a fatal error */
1624                 return;
1625         /* Adjust for signed chars, if necessary */
1626         g->charjump = &g->charjump[-(CHAR_MIN)];
1627 
1628         /*
1629          * If the character does not exist in the pattern, the jump
1630          * is equal to the number of characters in the pattern.
1631          */
1632         for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1633                 g->charjump[ch] = g->mlen;
1634 
1635         /*
1636          * If the character does exist, compute the jump that would
1637          * take us to the last character in the pattern equal to it
1638          * (notice that we match right to left, so that last character
1639          * is the first one that would be matched).
1640          */
1641         for (mindex = 0; mindex < g->mlen; mindex++)
1642                 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1643 }
1644 
1645 /*
1646  * computematchjumps - compute match jumps for BM scan
1647  *
1648  * This algorithm assumes g->must exists and is has size greater than
1649  * zero. It's based on the algorithm found on Computer Algorithms by
1650  * Sara Baase.
1651  *
1652  * A match jump is the number of characters one needs to advance based
1653  * on the already-matched suffix.
1654  * Notice that all values here are minus (g->mlen-1), because of the way
1655  * the search algorithm works.
1656  */
1657 static void
1658 computematchjumps(struct parse *p, struct re_guts *g)
1659 {
1660         int mindex;             /* General "must" iterator */
1661         int suffix;             /* Keeps track of matching suffix */
1662         int ssuffix;            /* Keeps track of suffixes' suffix */
1663         int *pmatches;
1664                                 /*
1665                                  * pmatches[k] points to the next i
1666                                  * such that i+1...mlen is a substring
1667                                  * of k+1...k+mlen-i-1
1668                                  */
1669 
1670         /* Avoid making errors worse */
1671         if (p->error != 0)
1672                 return;
1673 
1674         pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1675         if (pmatches == NULL) {
1676                 g->matchjump = NULL;
1677                 return;
1678         }
1679 
1680         g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1681         if (g->matchjump == NULL)    /* Not a fatal error */
1682                 return;
1683 
1684         /* Set maximum possible jump for each character in the pattern */
1685         for (mindex = 0; mindex < g->mlen; mindex++)
1686                 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1687 
1688         /* Compute pmatches[] */
1689         for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1690             mindex--, suffix--) {
1691                 pmatches[mindex] = suffix;
1692 
1693                 /*
1694                  * If a mismatch is found, interrupting the substring,
1695                  * compute the matchjump for that position. If no
1696                  * mismatch is found, then a text substring mismatched
1697                  * against the suffix will also mismatch against the
1698                  * substring.
1699                  */
1700                 while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1701                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
1702                             g->mlen - mindex - 1);
1703                         suffix = pmatches[suffix];
1704                 }
1705         }
1706 
1707         /*
1708          * Compute the matchjump up to the last substring found to jump
1709          * to the beginning of the largest must pattern prefix matching
1710          * it's own suffix.
1711          */
1712         for (mindex = 0; mindex <= suffix; mindex++)
1713                 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1714                     g->mlen + suffix - mindex);
1715 
1716         ssuffix = pmatches[suffix];
1717         while (suffix < g->mlen) {
1718                 while (suffix <= ssuffix && suffix < g->mlen) {
1719                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
1720                             g->mlen + ssuffix - suffix);
1721                         suffix++;
1722                 }
1723                 if (suffix < g->mlen)
1724                         ssuffix = pmatches[ssuffix];
1725         }
1726 
1727         free(pmatches);
1728 }
1729 
1730 /*
1731  * pluscount - count + nesting
1732  */
1733 static sopno                    /* nesting depth */
1734 pluscount(struct parse *p, struct re_guts *g)
1735 {
1736         sop *scan;
1737         sop s;
1738         sopno plusnest = 0;
1739         sopno maxnest = 0;
1740 
1741         if (p->error != 0)
1742                 return (0);     /* there may not be an OEND */
1743 
1744         scan = g->strip + 1;
1745         do {
1746                 s = *scan++;
1747                 switch (OP(s)) {
1748                 case OPLUS_:
1749                         plusnest++;
1750                         break;
1751                 case O_PLUS:
1752                         if (plusnest > maxnest)
1753                                 maxnest = plusnest;
1754                         plusnest--;
1755                         break;
1756                 }
1757         } while (OP(s) != OEND);
1758         if (plusnest != 0)
1759                 g->iflags |= BAD;
1760         return (maxnest);
1761 }