1 /*
   2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   3  * Copyright 2012 Milan Jurik. All rights reserved.
   4  * Copyright (c) 2016 by Delphix. All rights reserved.
   5  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   6  * Copyright (c) 1992, 1993, 1994
   7  *      The Regents of the University of California.  All rights reserved.
   8  *
   9  * This code is derived from software contributed to Berkeley by
  10  * Henry Spencer.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36 
  37 /*
  38  * The matching engine and friends.  This file is #included by regexec.c
  39  * after suitable #defines of a variety of macros used herein, so that
  40  * different state representations can be used without duplicating masses
  41  * of code.
  42  */
  43 
  44 #ifdef SNAMES
  45 #define matcher smatcher
  46 #define fast    sfast
  47 #define slow    sslow
  48 #define dissect sdissect
  49 #define backref sbackref
  50 #define step    sstep
  51 #define print   sprint
  52 #define at      sat
  53 #define match   smat
  54 #endif
  55 #ifdef LNAMES
  56 #define matcher lmatcher
  57 #define fast    lfast
  58 #define slow    lslow
  59 #define dissect ldissect
  60 #define backref lbackref
  61 #define step    lstep
  62 #define print   lprint
  63 #define at      lat
  64 #define match   lmat
  65 #endif
  66 #ifdef MNAMES
  67 #define matcher mmatcher
  68 #define fast    mfast
  69 #define slow    mslow
  70 #define dissect mdissect
  71 #define backref mbackref
  72 #define step    mstep
  73 #define print   mprint
  74 #define at      mat
  75 #define match   mmat
  76 #endif
  77 
  78 /* another structure passed up and down to avoid zillions of parameters */
  79 struct match {
  80         struct re_guts *g;
  81         int eflags;
  82         regmatch_t *pmatch;     /* [nsub+1] (0 element unused) */
  83         const char *offp;       /* offsets work from here */
  84         const char *beginp;     /* start of string -- virtual NUL precedes */
  85         const char *endp;       /* end of string -- virtual NUL here */
  86         const char *coldp;      /* can be no match starting before here */
  87         const char **lastpos;   /* [nplus+1] */
  88         STATEVARS;
  89         states st;              /* current states */
  90         states fresh;           /* states for a fresh start */
  91         states tmp;             /* temporary */
  92         states empty;           /* empty set of states */
  93         mbstate_t mbs;          /* multibyte conversion state */
  94 };
  95 
  96 /* ========= begin header generated by ./mkh ========= */
  97 #ifdef __cplusplus
  98 extern "C" {
  99 #endif
 100 
 101 /* === engine.c === */
 102 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int);
 103 static const char *dissect(struct match *, const char *, const char *,
 104     sopno, sopno);
 105 static const char *backref(struct match *, const char *, const char *, sopno,
 106     sopno, sopno, int);
 107 static const char *fast(struct match *, const char *, const char *, sopno,
 108     sopno);
 109 static const char *slow(struct match *, const char *, const char *, sopno,
 110     sopno);
 111 static states step(struct re_guts *, sopno, sopno, states, wint_t, states);
 112 #define MAX_RECURSION   100
 113 #define BOL     (OUT-1)
 114 #define EOL     (BOL-1)
 115 #define BOLEOL  (BOL-2)
 116 #define NOTHING (BOL-3)
 117 #define BOW     (BOL-4)
 118 #define EOW     (BOL-5)
 119 #define BADCHAR (BOL-6)
 120 #define NONCHAR(c)      ((c) <= OUT)
 121 #ifdef REDEBUG
 122 static void print(struct match *, const char *, states, int, FILE *);
 123 #endif
 124 #ifdef REDEBUG
 125 static void at(struct match *, const char *, const char *, const char *,
 126     sopno, sopno);
 127 #endif
 128 #ifdef REDEBUG
 129 static const char *pchar(int ch);
 130 #endif
 131 
 132 #ifdef __cplusplus
 133 }
 134 #endif
 135 /* ========= end header generated by ./mkh ========= */
 136 
 137 #ifdef REDEBUG
 138 #define SP(t, s, c)     print(m, t, s, c, stdout)
 139 #define AT(t, p1, p2, s1, s2)   at(m, t, p1, p2, s1, s2)
 140 #define NOTE(str)       { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
 141 #else
 142 #define SP(t, s, c)     /* nothing */
 143 #define AT(t, p1, p2, s1, s2)   /* nothing */
 144 #define NOTE(s) /* nothing */
 145 #endif
 146 
 147 /*
 148  * matcher - the actual matching engine
 149  */
 150 static int                      /* 0 success, REG_NOMATCH failure */
 151 matcher(struct re_guts *g, const char *string, size_t nmatch,
 152     regmatch_t pmatch[], int eflags)
 153 {
 154         const char *endp;
 155         size_t i;
 156         struct match mv;
 157         struct match *m = &mv;
 158         const char *dp = NULL;
 159         const sopno gf = g->firststate+1;    /* +1 for OEND */
 160         const sopno gl = g->laststate;
 161         const char *start;
 162         const char *stop;
 163         /* Boyer-Moore algorithms variables */
 164         const char *pp;
 165         int cj, mj;
 166         const char *mustfirst;
 167         const char *mustlast;
 168         int *matchjump;
 169         int *charjump;
 170 
 171         /* simplify the situation where possible */
 172         if (g->cflags&REG_NOSUB)
 173                 nmatch = 0;
 174 
 175         if (eflags&REG_STARTEND) {
 176                 start = string + pmatch[0].rm_so;
 177                 stop = string + pmatch[0].rm_eo;
 178         } else {
 179                 start = string;
 180                 stop = start + strlen(start);
 181         }
 182 
 183         if (stop < start)
 184                 return (REG_EFATAL);
 185 
 186         /* prescreening; this does wonders for this rather slow code */
 187         if (g->must != NULL) {
 188                 if (g->charjump != NULL && g->matchjump != NULL) {
 189                         mustfirst = g->must;
 190                         mustlast = g->must + g->mlen - 1;
 191                         charjump = g->charjump;
 192                         matchjump = g->matchjump;
 193                         pp = mustlast;
 194                         for (dp = start+g->mlen-1; dp < stop; ) {
 195                                 /* Fast skip non-matches */
 196                                 while (dp < stop && charjump[(int)*dp])
 197                                         dp += charjump[(int)*dp];
 198 
 199                                 if (dp >= stop)
 200                                         break;
 201 
 202                                 /* Greedy matcher */
 203                                 /*
 204                                  * We depend on not being used for
 205                                  * for strings of length 1
 206                                  */
 207                                 while (*--dp == *--pp && pp != mustfirst)
 208                                         ;
 209 
 210                                 if (*dp == *pp)
 211                                         break;
 212 
 213                                 /* Jump to next possible match */
 214                                 mj = matchjump[pp - mustfirst];
 215                                 cj = charjump[(int)*dp];
 216                                 dp += (cj < mj ? mj : cj);
 217                                 pp = mustlast;
 218                         }
 219                         if (pp != mustfirst)
 220                                 return (REG_NOMATCH);
 221                 } else {
 222                         for (dp = start; dp < stop; dp++)
 223                                 if (*dp == g->must[0] &&
 224                                     stop - dp >= g->mlen &&
 225                                     memcmp(dp, g->must, (size_t)g->mlen) == 0)
 226                                         break;
 227                         if (dp == stop)         /* we didn't find g->must */
 228                                 return (REG_NOMATCH);
 229                 }
 230         }
 231 
 232         /* match struct setup */
 233         m->g = g;
 234         m->eflags = eflags;
 235         m->pmatch = NULL;
 236         m->lastpos = NULL;
 237         m->offp = string;
 238         m->beginp = start;
 239         m->endp = stop;
 240         STATESETUP(m, 4);
 241         SETUP(m->st);
 242         SETUP(m->fresh);
 243         SETUP(m->tmp);
 244         SETUP(m->empty);
 245         CLEAR(m->empty);
 246         ZAPSTATE(&m->mbs);
 247 
 248         /* Adjust start according to moffset, to speed things up */
 249         if (dp != NULL && g->moffset > -1)
 250                 start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
 251 
 252         SP("mloop", m->st, *start);
 253 
 254         /* this loop does only one repetition except for backrefs */
 255         for (;;) {
 256                 endp = fast(m, start, stop, gf, gl);
 257                 if (endp == NULL) {             /* a miss */
 258                         if (m->pmatch != NULL)
 259                                 free((char *)m->pmatch);
 260                         if (m->lastpos != NULL)
 261                                 free((char *)m->lastpos);
 262                         STATETEARDOWN(m);
 263                         return (REG_NOMATCH);
 264                 }
 265                 if (nmatch == 0 && !g->backrefs)
 266                         break;          /* no further info needed */
 267 
 268                 /* where? */
 269                 assert(m->coldp != NULL);
 270                 for (;;) {
 271                         NOTE("finding start");
 272                         endp = slow(m, m->coldp, stop, gf, gl);
 273                         if (endp != NULL)
 274                                 break;
 275                         assert(m->coldp < m->endp);
 276                         m->coldp += XMBRTOWC(NULL, m->coldp,
 277                             m->endp - m->coldp, &m->mbs, 0);
 278                 }
 279                 if (nmatch == 1 && !g->backrefs)
 280                         break;          /* no further info needed */
 281 
 282                 /* oh my, it wants the subexpressions... */
 283                 if (m->pmatch == NULL)
 284                         m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
 285                             sizeof (regmatch_t));
 286                 if (m->pmatch == NULL) {
 287                         STATETEARDOWN(m);
 288                         return (REG_ESPACE);
 289                 }
 290                 for (i = 1; i <= m->g->nsub; i++)
 291                         m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
 292                 if (!g->backrefs && !(m->eflags&REG_BACKR)) {
 293                         NOTE("dissecting");
 294                         dp = dissect(m, m->coldp, endp, gf, gl);
 295                 } else {
 296                         if (g->nplus > 0 && m->lastpos == NULL)
 297                                 m->lastpos = malloc((g->nplus+1) *
 298                                     sizeof (const char *));
 299                         if (g->nplus > 0 && m->lastpos == NULL) {
 300                                 free(m->pmatch);
 301                                 STATETEARDOWN(m);
 302                                 return (REG_ESPACE);
 303                         }
 304                         NOTE("backref dissect");
 305                         dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
 306                 }
 307                 if (dp != NULL)
 308                         break;
 309 
 310                 /* uh-oh... we couldn't find a subexpression-level match */
 311                 assert(g->backrefs); /* must be back references doing it */
 312                 assert(g->nplus == 0 || m->lastpos != NULL);
 313                 for (;;) {
 314                         if (dp != NULL || endp <= m->coldp)
 315                                 break;          /* defeat */
 316                         NOTE("backoff");
 317                         endp = slow(m, m->coldp, endp-1, gf, gl);
 318                         if (endp == NULL)
 319                                 break;          /* defeat */
 320                         /* try it on a shorter possibility */
 321 #ifndef NDEBUG
 322                         for (i = 1; i <= m->g->nsub; i++) {
 323                                 assert(m->pmatch[i].rm_so == -1);
 324                                 assert(m->pmatch[i].rm_eo == -1);
 325                         }
 326 #endif
 327                         NOTE("backoff dissect");
 328                         dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
 329                 }
 330                 assert(dp == NULL || dp == endp);
 331                 if (dp != NULL)         /* found a shorter one */
 332                         break;
 333 
 334                 /* despite initial appearances, there is no match here */
 335                 NOTE("false alarm");
 336                 /* recycle starting later */
 337                 start = m->coldp + XMBRTOWC(NULL, m->coldp,
 338                     stop - m->coldp, &m->mbs, 0);
 339                 assert(start <= stop);
 340         }
 341 
 342         /* fill in the details if requested */
 343         if (nmatch > 0) {
 344                 pmatch[0].rm_so = m->coldp - m->offp;
 345                 pmatch[0].rm_eo = endp - m->offp;
 346         }
 347         if (nmatch > 1) {
 348                 assert(m->pmatch != NULL);
 349                 for (i = 1; i < nmatch; i++)
 350                         if (i <= m->g->nsub)
 351                                 pmatch[i] = m->pmatch[i];
 352                         else {
 353                                 pmatch[i].rm_so = -1;
 354                                 pmatch[i].rm_eo = -1;
 355                         }
 356         }
 357 
 358         if (m->pmatch != NULL)
 359                 free((char *)m->pmatch);
 360         if (m->lastpos != NULL)
 361                 free((char *)m->lastpos);
 362         STATETEARDOWN(m);
 363         return (0);
 364 }
 365 
 366 /*
 367  * dissect - figure out what matched what, no back references
 368  */
 369 static const char *
 370 dissect(struct match *m, const char *start, const char *stop, sopno startst,
 371     sopno stopst)
 372 {
 373         int i;
 374         sopno ss;               /* start sop of current subRE */
 375         sopno es;               /* end sop of current subRE */
 376         const char *sp;         /* start of string matched by it */
 377         const char *stp;        /* string matched by it cannot pass here */
 378         const char *rest;       /* start of rest of string */
 379         const char *tail;       /* string unmatched by rest of RE */
 380         sopno ssub;             /* start sop of subsubRE */
 381         sopno esub;             /* end sop of subsubRE */
 382         const char *ssp;        /* start of string matched by subsubRE */
 383         const char *sep;        /* end of string matched by subsubRE */
 384         const char *oldssp;     /* previous ssp */
 385         const char *dp __unused;
 386 
 387         AT("diss", start, stop, startst, stopst);
 388         sp = start;
 389         for (ss = startst; ss < stopst; ss = es) {
 390                 /* identify end of subRE */
 391                 es = ss;
 392                 switch (OP(m->g->strip[es])) {
 393                 case OPLUS_:
 394                 case OQUEST_:
 395                         es += OPND(m->g->strip[es]);
 396                         break;
 397                 case OCH_:
 398                         while (OP(m->g->strip[es]) != O_CH)
 399                                 es += OPND(m->g->strip[es]);
 400                         break;
 401                 }
 402                 es++;
 403 
 404                 /* figure out what it matched */
 405                 switch (OP(m->g->strip[ss])) {
 406                 case OEND:
 407                         assert(0);
 408                         break;
 409                 case OCHAR:
 410                         sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
 411                         break;
 412                 case OBOL:
 413                 case OEOL:
 414                 case OBOW:
 415                 case OEOW:
 416                         break;
 417                 case OANY:
 418                 case OANYOF:
 419                         sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
 420                         break;
 421                 case OBACK_:
 422                 case O_BACK:
 423                         assert(0);
 424                         break;
 425                 /* cases where length of match is hard to find */
 426                 case OQUEST_:
 427                         stp = stop;
 428                         for (;;) {
 429                                 /* how long could this one be? */
 430                                 rest = slow(m, sp, stp, ss, es);
 431                                 assert(rest != NULL);   /* it did match */
 432                                 /* could the rest match the rest? */
 433                                 tail = slow(m, rest, stop, es, stopst);
 434                                 if (tail == stop)
 435                                         break;          /* yes! */
 436                                 /* no -- try a shorter match for this one */
 437                                 stp = rest - 1;
 438                                 assert(stp >= sp);   /* it did work */
 439                         }
 440                         ssub = ss + 1;
 441                         esub = es - 1;
 442                         /* did innards match? */
 443                         if (slow(m, sp, rest, ssub, esub) != NULL) {
 444                                 dp = dissect(m, sp, rest, ssub, esub);
 445                                 assert(dp == rest);
 446 #if defined(__lint)
 447                                 (void) dp;
 448 #endif
 449                         } else          /* no */
 450                                 assert(sp == rest);
 451                         sp = rest;
 452                         break;
 453                 case OPLUS_:
 454                         stp = stop;
 455                         for (;;) {
 456                                 /* how long could this one be? */
 457                                 rest = slow(m, sp, stp, ss, es);
 458                                 assert(rest != NULL);   /* it did match */
 459                                 /* could the rest match the rest? */
 460                                 tail = slow(m, rest, stop, es, stopst);
 461                                 if (tail == stop)
 462                                         break;          /* yes! */
 463                                 /* no -- try a shorter match for this one */
 464                                 stp = rest - 1;
 465                                 assert(stp >= sp);   /* it did work */
 466                         }
 467                         ssub = ss + 1;
 468                         esub = es - 1;
 469                         ssp = sp;
 470                         oldssp = ssp;
 471                         for (;;) {      /* find last match of innards */
 472                                 sep = slow(m, ssp, rest, ssub, esub);
 473                                 if (sep == NULL || sep == ssp)
 474                                         break;  /* failed or matched null */
 475                                 oldssp = ssp;   /* on to next try */
 476                                 ssp = sep;
 477                         }
 478                         if (sep == NULL) {
 479                                 /* last successful match */
 480                                 sep = ssp;
 481                                 ssp = oldssp;
 482                         }
 483                         assert(sep == rest);    /* must exhaust substring */
 484                         assert(slow(m, ssp, sep, ssub, esub) == rest);
 485                         dp = dissect(m, ssp, sep, ssub, esub);
 486                         assert(dp == sep);
 487                         sp = rest;
 488                         break;
 489                 case OCH_:
 490                         stp = stop;
 491                         for (;;) {
 492                                 /* how long could this one be? */
 493                                 rest = slow(m, sp, stp, ss, es);
 494                                 assert(rest != NULL);   /* it did match */
 495                                 /* could the rest match the rest? */
 496                                 tail = slow(m, rest, stop, es, stopst);
 497                                 if (tail == stop)
 498                                         break;          /* yes! */
 499                                 /* no -- try a shorter match for this one */
 500                                 stp = rest - 1;
 501                                 assert(stp >= sp);   /* it did work */
 502                         }
 503                         ssub = ss + 1;
 504                         esub = ss + OPND(m->g->strip[ss]) - 1;
 505                         assert(OP(m->g->strip[esub]) == OOR1);
 506                         for (;;) {      /* find first matching branch */
 507                                 if (slow(m, sp, rest, ssub, esub) == rest)
 508                                         break;  /* it matched all of it */
 509                                 /* that one missed, try next one */
 510                                 assert(OP(m->g->strip[esub]) == OOR1);
 511                                 esub++;
 512                                 assert(OP(m->g->strip[esub]) == OOR2);
 513                                 ssub = esub + 1;
 514                                 esub += OPND(m->g->strip[esub]);
 515                                 if (OP(m->g->strip[esub]) == OOR2)
 516                                         esub--;
 517                                 else
 518                                         assert(OP(m->g->strip[esub]) == O_CH);
 519                         }
 520                         dp = dissect(m, sp, rest, ssub, esub);
 521                         assert(dp == rest);
 522                         sp = rest;
 523                         break;
 524                 case O_PLUS:
 525                 case O_QUEST:
 526                 case OOR1:
 527                 case OOR2:
 528                 case O_CH:
 529                         assert(0);
 530                         break;
 531                 case OLPAREN:
 532                         i = OPND(m->g->strip[ss]);
 533                         assert(0 < i && i <= m->g->nsub);
 534                         m->pmatch[i].rm_so = sp - m->offp;
 535                         break;
 536                 case ORPAREN:
 537                         i = OPND(m->g->strip[ss]);
 538                         assert(0 < i && i <= m->g->nsub);
 539                         m->pmatch[i].rm_eo = sp - m->offp;
 540                         break;
 541                 default:                /* uh oh */
 542                         assert(0);
 543                         break;
 544                 }
 545         }
 546 
 547         assert(sp == stop);
 548         return (sp);
 549 }
 550 
 551 /*
 552  * backref - figure out what matched what, figuring in back references
 553  */
 554 static const char *
 555 backref(struct match *m, const char *start, const char *stop, sopno startst,
 556     sopno stopst, sopno lev,            /* PLUS nesting level */
 557     int rec)
 558 {
 559         int i;
 560         sopno ss;               /* start sop of current subRE */
 561         const char *sp;         /* start of string matched by it */
 562         sopno ssub;             /* start sop of subsubRE */
 563         sopno esub;             /* end sop of subsubRE */
 564         const char *ssp;        /* start of string matched by subsubRE */
 565         const char *dp;
 566         size_t len;
 567         int hard;
 568         sop s;
 569         regoff_t offsave;
 570         cset *cs;
 571         wint_t wc;
 572 
 573         AT("back", start, stop, startst, stopst);
 574         sp = start;
 575 
 576         /* get as far as we can with easy stuff */
 577         hard = 0;
 578         for (ss = startst; !hard && ss < stopst; ss++)
 579                 switch (OP(s = m->g->strip[ss])) {
 580                 case OCHAR:
 581                         if (sp == stop)
 582                                 return (NULL);
 583                         sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
 584                         if (wc != OPND(s))
 585                                 return (NULL);
 586                         break;
 587                 case OANY:
 588                         if (sp == stop)
 589                                 return (NULL);
 590                         sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
 591                         if (wc == BADCHAR)
 592                                 return (NULL);
 593                         break;
 594                 case OANYOF:
 595                         if (sp == stop)
 596                                 return (NULL);
 597                         cs = &m->g->sets[OPND(s)];
 598                         sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
 599                         if (wc == BADCHAR || !CHIN(cs, wc))
 600                                 return (NULL);
 601                         break;
 602                 case OBOL:
 603                         if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
 604                             (sp > m->offp && sp < m->endp &&
 605                             *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE))) {
 606                                 break;
 607                         }
 608                         return (NULL);
 609                 case OEOL:
 610                         if ((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
 611                             (sp < m->endp && *sp == '\n' &&
 612                             (m->g->cflags&REG_NEWLINE))) {
 613                                 break;
 614                         }
 615                         return (NULL);
 616                 case OBOW:
 617                         if (sp < m->endp && ISWORD(*sp) &&
 618                             ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
 619                             (sp > m->offp && !ISWORD(*(sp-1))))) {
 620                                 break;
 621                         }
 622                         return (NULL);
 623                 case OEOW:
 624                         if (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
 625                             (sp < m->endp && *sp == '\n' &&
 626                             (m->g->cflags&REG_NEWLINE)) ||
 627                             (sp < m->endp && !ISWORD(*sp))) &&
 628                             (sp > m->beginp && ISWORD(*(sp-1)))) {
 629                                 break;
 630                         }
 631                         return (NULL);
 632                 case O_QUEST:
 633                         break;
 634                 case OOR1:      /* matches null but needs to skip */
 635                         ss++;
 636                         s = m->g->strip[ss];
 637                         do {
 638                                 assert(OP(s) == OOR2);
 639                                 ss += OPND(s);
 640                         } while (OP(s = m->g->strip[ss]) != O_CH);
 641                         /* note that the ss++ gets us past the O_CH */
 642                         break;
 643                 default:        /* have to make a choice */
 644                         hard = 1;
 645                         break;
 646                 }
 647         if (!hard) {            /* that was it! */
 648                 if (sp != stop)
 649                         return (NULL);
 650                 return (sp);
 651         }
 652         ss--;                   /* adjust for the for's final increment */
 653 
 654         /* the hard stuff */
 655         AT("hard", sp, stop, ss, stopst);
 656         s = m->g->strip[ss];
 657         switch (OP(s)) {
 658         case OBACK_:            /* the vilest depths */
 659                 i = OPND(s);
 660                 assert(0 < i && i <= m->g->nsub);
 661                 if (m->pmatch[i].rm_eo == -1)
 662                         return (NULL);
 663                 assert(m->pmatch[i].rm_so != -1);
 664                 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
 665                 if (len == 0 && rec++ > MAX_RECURSION)
 666                         return (NULL);
 667                 assert(stop - m->beginp >= len);
 668                 if (sp > stop - len)
 669                         return (NULL);  /* not enough left to match */
 670                 ssp = m->offp + m->pmatch[i].rm_so;
 671                 if (memcmp(sp, ssp, len) != 0)
 672                         return (NULL);
 673                 while (m->g->strip[ss] != SOP(O_BACK, i))
 674                         ss++;
 675                 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec));
 676         case OQUEST_:           /* to null or not */
 677                 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
 678                 if (dp != NULL)
 679                         return (dp);    /* not */
 680                 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
 681         case OPLUS_:
 682                 assert(m->lastpos != NULL);
 683                 assert(lev+1 <= m->g->nplus);
 684                 m->lastpos[lev+1] = sp;
 685                 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec));
 686         case O_PLUS:
 687                 if (sp == m->lastpos[lev])   /* last pass matched null */
 688                         return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
 689                 /* try another pass */
 690                 m->lastpos[lev] = sp;
 691                 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
 692                 if (dp == NULL)
 693                         return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
 694                 return (dp);
 695         case OCH_:              /* find the right one, if any */
 696                 ssub = ss + 1;
 697                 esub = ss + OPND(s) - 1;
 698                 assert(OP(m->g->strip[esub]) == OOR1);
 699                 for (;;) {      /* find first matching branch */
 700                         dp = backref(m, sp, stop, ssub, esub, lev, rec);
 701                         if (dp != NULL)
 702                                 return (dp);
 703                         /* that one missed, try next one */
 704                         if (OP(m->g->strip[esub]) == O_CH)
 705                                 return (NULL);  /* there is none */
 706                         esub++;
 707                         assert(OP(m->g->strip[esub]) == OOR2);
 708                         ssub = esub + 1;
 709                         esub += OPND(m->g->strip[esub]);
 710                         if (OP(m->g->strip[esub]) == OOR2)
 711                                 esub--;
 712                         else
 713                                 assert(OP(m->g->strip[esub]) == O_CH);
 714                 }
 715                 /* NOTREACHED */
 716                 break;
 717         case OLPAREN:           /* must undo assignment if rest fails */
 718                 i = OPND(s);
 719                 assert(0 < i && i <= m->g->nsub);
 720                 offsave = m->pmatch[i].rm_so;
 721                 m->pmatch[i].rm_so = sp - m->offp;
 722                 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
 723                 if (dp != NULL)
 724                         return (dp);
 725                 m->pmatch[i].rm_so = offsave;
 726                 return (NULL);
 727         case ORPAREN:           /* must undo assignment if rest fails */
 728                 i = OPND(s);
 729                 assert(0 < i && i <= m->g->nsub);
 730                 offsave = m->pmatch[i].rm_eo;
 731                 m->pmatch[i].rm_eo = sp - m->offp;
 732                 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
 733                 if (dp != NULL)
 734                         return (dp);
 735                 m->pmatch[i].rm_eo = offsave;
 736                 return (NULL);
 737         default:                /* uh oh */
 738                 assert(0);
 739                 break;
 740         }
 741 
 742         /* "can't happen" */
 743         assert(0);
 744         return (NULL);
 745 }
 746 
 747 /*
 748  * fast - step through the string at top speed
 749  */
 750 static const char *
 751 fast(struct match *m, const char *start, const char *stop, sopno startst,
 752     sopno stopst)
 753 {
 754         states st = m->st;
 755         states fresh = m->fresh;
 756         states tmp = m->tmp;
 757         const char *p = start;
 758         wint_t c;
 759         wint_t lastc;           /* previous c */
 760         wint_t flagch;
 761         int i;
 762         const char *coldp;      /* last p after which no match was underway */
 763         size_t clen;
 764 
 765         CLEAR(st);
 766         SET1(st, startst);
 767         SP("fast", st, *p);
 768         st = step(m->g, startst, stopst, st, NOTHING, st);
 769         ASSIGN(fresh, st);
 770         SP("start", st, *p);
 771         coldp = NULL;
 772         if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
 773                 c = OUT;
 774         else {
 775                 /*
 776                  * XXX Wrong if the previous character was multi-byte.
 777                  * Newline never is (in encodings supported by FreeBSD),
 778                  * so this only breaks the ISWORD tests below.
 779                  */
 780                 c = (uch)*(start - 1);
 781         }
 782         for (;;) {
 783                 /* next character */
 784                 lastc = c;
 785                 if (p == m->endp) {
 786                         clen = 0;
 787                         c = OUT;
 788                 } else
 789                         clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
 790                 if (EQ(st, fresh))
 791                         coldp = p;
 792 
 793                 /* is there an EOL and/or BOL between lastc and c? */
 794                 flagch = '\0';
 795                 i = 0;
 796                 if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
 797                     (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
 798                         flagch = BOL;
 799                         i = m->g->nbol;
 800                 }
 801                 if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
 802                     (c == OUT && !(m->eflags&REG_NOTEOL))) {
 803                         flagch = (flagch == BOL) ? BOLEOL : EOL;
 804                         i += m->g->neol;
 805                 }
 806                 if (i != 0) {
 807                         for (; i > 0; i--)
 808                                 st = step(m->g, startst, stopst, st,
 809                                     flagch, st);
 810                         SP("boleol", st, c);
 811                 }
 812 
 813                 /* how about a word boundary? */
 814                 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
 815                     (c != OUT && ISWORD(c))) {
 816                         flagch = BOW;
 817                 }
 818                 if ((lastc != OUT && ISWORD(lastc)) &&
 819                     (flagch == EOL || (c != OUT && !ISWORD(c)))) {
 820                         flagch = EOW;
 821                 }
 822                 if (flagch == BOW || flagch == EOW) {
 823                         st = step(m->g, startst, stopst, st, flagch, st);
 824                         SP("boweow", st, c);
 825                 }
 826 
 827                 /* are we done? */
 828                 if (ISSET(st, stopst) || p == stop || clen > stop - p)
 829                         break;          /* NOTE BREAK OUT */
 830 
 831                 /* no, we must deal with this character */
 832                 ASSIGN(tmp, st);
 833                 ASSIGN(st, fresh);
 834                 assert(c != OUT);
 835                 st = step(m->g, startst, stopst, tmp, c, st);
 836                 SP("aft", st, c);
 837                 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
 838                 p += clen;
 839         }
 840 
 841         assert(coldp != NULL);
 842         m->coldp = coldp;
 843         if (ISSET(st, stopst))
 844                 return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
 845         else
 846                 return (NULL);
 847 }
 848 
 849 /*
 850  * slow - step through the string more deliberately
 851  */
 852 static const char *
 853 slow(struct match *m, const char *start, const char *stop, sopno startst,
 854     sopno stopst)
 855 {
 856         states st = m->st;
 857         states empty = m->empty;
 858         states tmp = m->tmp;
 859         const char *p = start;
 860         wint_t c;
 861         wint_t lastc;           /* previous c */
 862         wint_t flagch;
 863         int i;
 864         const char *matchp;     /* last p at which a match ended */
 865         size_t clen;
 866 
 867         AT("slow", start, stop, startst, stopst);
 868         CLEAR(st);
 869         SET1(st, startst);
 870         SP("sstart", st, *p);
 871         st = step(m->g, startst, stopst, st, NOTHING, st);
 872         matchp = NULL;
 873         if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
 874                 c = OUT;
 875         else {
 876                 /*
 877                  * XXX Wrong if the previous character was multi-byte.
 878                  * Newline never is (in encodings supported by FreeBSD),
 879                  * so this only breaks the ISWORD tests below.
 880                  */
 881                 c = (uch)*(start - 1);
 882         }
 883         for (;;) {
 884                 /* next character */
 885                 lastc = c;
 886                 if (p == m->endp) {
 887                         c = OUT;
 888                         clen = 0;
 889                 } else
 890                         clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
 891 
 892                 /* is there an EOL and/or BOL between lastc and c? */
 893                 flagch = '\0';
 894                 i = 0;
 895                 if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
 896                     (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
 897                         flagch = BOL;
 898                         i = m->g->nbol;
 899                 }
 900                 if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
 901                     (c == OUT && !(m->eflags&REG_NOTEOL))) {
 902                         flagch = (flagch == BOL) ? BOLEOL : EOL;
 903                         i += m->g->neol;
 904                 }
 905                 if (i != 0) {
 906                         for (; i > 0; i--)
 907                                 st = step(m->g, startst, stopst, st,
 908                                     flagch, st);
 909                         SP("sboleol", st, c);
 910                 }
 911 
 912                 /* how about a word boundary? */
 913                 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
 914                     (c != OUT && ISWORD(c))) {
 915                         flagch = BOW;
 916                 }
 917                 if ((lastc != OUT && ISWORD(lastc)) &&
 918                     (flagch == EOL || (c != OUT && !ISWORD(c)))) {
 919                         flagch = EOW;
 920                 }
 921                 if (flagch == BOW || flagch == EOW) {
 922                         st = step(m->g, startst, stopst, st, flagch, st);
 923                         SP("sboweow", st, c);
 924                 }
 925 
 926                 /* are we done? */
 927                 if (ISSET(st, stopst))
 928                         matchp = p;
 929                 if (EQ(st, empty) || p == stop || clen > stop - p)
 930                         break;          /* NOTE BREAK OUT */
 931 
 932                 /* no, we must deal with this character */
 933                 ASSIGN(tmp, st);
 934                 ASSIGN(st, empty);
 935                 assert(c != OUT);
 936                 st = step(m->g, startst, stopst, tmp, c, st);
 937                 SP("saft", st, c);
 938                 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
 939                 p += clen;
 940         }
 941 
 942         return (matchp);
 943 }
 944 
 945 
 946 /*
 947  * step - map set of states reachable before char to set reachable after
 948  */
 949 static states
 950 step(struct re_guts *g,
 951     sopno start,        /* start state within strip */
 952     sopno stop,         /* state after stop state within strip */
 953     states bef,         /* states reachable before */
 954     wint_t ch,          /* character or NONCHAR code */
 955     states aft)         /* states already known reachable after */
 956 {
 957         cset *cs;
 958         sop s;
 959         sopno pc;
 960         onestate here;          /* note, macros know this name */
 961         sopno look;
 962         int i;
 963 
 964         for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
 965                 s = g->strip[pc];
 966                 switch (OP(s)) {
 967                 case OEND:
 968                         assert(pc == stop-1);
 969                         break;
 970                 case OCHAR:
 971                         /* only characters can match */
 972                         assert(!NONCHAR(ch) || ch != OPND(s));
 973                         if (ch == OPND(s))
 974                                 FWD(aft, bef, 1);
 975                         break;
 976                 case OBOL:
 977                         if (ch == BOL || ch == BOLEOL)
 978                                 FWD(aft, bef, 1);
 979                         break;
 980                 case OEOL:
 981                         if (ch == EOL || ch == BOLEOL)
 982                                 FWD(aft, bef, 1);
 983                         break;
 984                 case OBOW:
 985                         if (ch == BOW)
 986                                 FWD(aft, bef, 1);
 987                         break;
 988                 case OEOW:
 989                         if (ch == EOW)
 990                                 FWD(aft, bef, 1);
 991                         break;
 992                 case OANY:
 993                         if (!NONCHAR(ch))
 994                                 FWD(aft, bef, 1);
 995                         break;
 996                 case OANYOF:
 997                         cs = &g->sets[OPND(s)];
 998                         if (!NONCHAR(ch) && CHIN(cs, ch))
 999                                 FWD(aft, bef, 1);
1000                         break;
1001                 case OBACK_:            /* ignored here */
1002                 case O_BACK:
1003                         FWD(aft, aft, 1);
1004                         break;
1005                 case OPLUS_:            /* forward, this is just an empty */
1006                         FWD(aft, aft, 1);
1007                         break;
1008                 case O_PLUS:            /* both forward and back */
1009                         FWD(aft, aft, 1);
1010                         i = ISSETBACK(aft, OPND(s));
1011                         BACK(aft, aft, OPND(s));
1012                         if (!i && ISSETBACK(aft, OPND(s))) {
1013                                 /* oho, must reconsider loop body */
1014                                 pc -= OPND(s) + 1;
1015                                 INIT(here, pc);
1016                         }
1017                         break;
1018                 case OQUEST_:           /* two branches, both forward */
1019                         FWD(aft, aft, 1);
1020                         FWD(aft, aft, OPND(s));
1021                         break;
1022                 case O_QUEST:           /* just an empty */
1023                         FWD(aft, aft, 1);
1024                         break;
1025                 case OLPAREN:           /* not significant here */
1026                 case ORPAREN:
1027                         FWD(aft, aft, 1);
1028                         break;
1029                 case OCH_:              /* mark the first two branches */
1030                         FWD(aft, aft, 1);
1031                         assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1032                         FWD(aft, aft, OPND(s));
1033                         break;
1034                 case OOR1:              /* done a branch, find the O_CH */
1035                         if (ISSTATEIN(aft, here)) {
1036                                 for (look = 1;
1037                                     OP(s = g->strip[pc+look]) != O_CH;
1038                                     look += OPND(s))
1039                                         assert(OP(s) == OOR2);
1040                                 FWD(aft, aft, look + 1);
1041                         }
1042                         break;
1043                 case OOR2:              /* propagate OCH_'s marking */
1044                         FWD(aft, aft, 1);
1045                         if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1046                                 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1047                                 FWD(aft, aft, OPND(s));
1048                         }
1049                         break;
1050                 case O_CH:              /* just empty */
1051                         FWD(aft, aft, 1);
1052                         break;
1053                 default:                /* ooooops... */
1054                         assert(0);
1055                         break;
1056                 }
1057         }
1058 
1059         return (aft);
1060 }
1061 
1062 #ifdef REDEBUG
1063 /*
1064  * print - print a set of states
1065  */
1066 static void
1067 print(struct match *m, const char *caption, states st, int ch, FILE *d)
1068 {
1069         struct re_guts *g = m->g;
1070         sopno i;
1071         int first = 1;
1072 
1073         if (!(m->eflags&REG_TRACE))
1074                 return;
1075 
1076         (void) fprintf(d, "%s", caption);
1077         if (ch != '\0')
1078                 (void) fprintf(d, " %s", pchar(ch));
1079         for (i = 0; i < g->nstates; i++)
1080                 if (ISSET(st, i)) {
1081                         (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1082                         first = 0;
1083                 }
1084         (void) fprintf(d, "\n");
1085 }
1086 
1087 /*
1088  * at - print current situation
1089  */
1090 static void
1091 at(struct match *m, const char *title, const char *start, const char *stop,
1092     sopno startst, sopno stopst)
1093 {
1094         if (!(m->eflags&REG_TRACE))
1095                 return;
1096 
1097         (void) printf("%s %s-", title, pchar(*start));
1098         (void) printf("%s ", pchar(*stop));
1099         (void) printf("%ld-%ld\n", (long)startst, (long)stopst);
1100 }
1101 
1102 #ifndef PCHARDONE
1103 #define PCHARDONE       /* never again */
1104 /*
1105  * pchar - make a character printable
1106  *
1107  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1108  * duplicate here avoids having a debugging-capable regexec.o tied to
1109  * a matching debug.o, and this is convenient.  It all disappears in
1110  * the non-debug compilation anyway, so it doesn't matter much.
1111  */
1112 static const char *
1113 pchar(int ch)
1114 {
1115         static char pbuf[10];
1116 
1117         if (isprint((uch)ch) || ch == ' ')
1118                 (void) sprintf(pbuf, "%c", ch);
1119         else
1120                 (void) sprintf(pbuf, "\\%o", ch);
1121         return (pbuf);
1122 }
1123 #endif
1124 #endif
1125 
1126 #undef  matcher
1127 #undef  fast
1128 #undef  slow
1129 #undef  dissect
1130 #undef  backref
1131 #undef  step
1132 #undef  print
1133 #undef  at
1134 #undef  match