1 /*
   2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   3  * Copyright 2012 Milan Jurik. All rights reserved.
   4  * Copyright (c) 2016 by Delphix. All rights reserved.
   5  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   6  * Copyright (c) 1992, 1993, 1994
   7  *      The Regents of the University of California.  All rights reserved.
   8  *
   9  * This code is derived from software contributed to Berkeley by
  10  * Henry Spencer.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36 
  37 #include <stdbool.h>
  38 
  39 /*
  40  * The matching engine and friends.  This file is #included by regexec.c
  41  * after suitable #defines of a variety of macros used herein, so that
  42  * different state representations can be used without duplicating masses
  43  * of code.
  44  */
  45 
  46 #ifdef SNAMES
  47 #define matcher smatcher
  48 #define walk    swalk
  49 #define dissect sdissect
  50 #define backref sbackref
  51 #define step    sstep
  52 #define print   sprint
  53 #define at      sat
  54 #define match   smat
  55 #endif
  56 #ifdef LNAMES
  57 #define matcher lmatcher
  58 #define walk    lwalk
  59 #define dissect ldissect
  60 #define backref lbackref
  61 #define step    lstep
  62 #define print   lprint
  63 #define at      lat
  64 #define match   lmat
  65 #endif
  66 #ifdef MNAMES
  67 #define matcher mmatcher
  68 #define walk    mwalk
  69 #define dissect mdissect
  70 #define backref mbackref
  71 #define step    mstep
  72 #define print   mprint
  73 #define at      mat
  74 #define match   mmat
  75 #endif
  76 
  77 /* another structure passed up and down to avoid zillions of parameters */
  78 struct match {
  79         struct re_guts *g;
  80         int eflags;
  81         regmatch_t *pmatch;     /* [nsub+1] (0 element unused) */
  82         const char *offp;       /* offsets work from here */
  83         const char *beginp;     /* start of string -- virtual NUL precedes */
  84         const char *endp;       /* end of string -- virtual NUL here */
  85         const char *coldp;      /* can be no match starting before here */
  86         const char **lastpos;   /* [nplus+1] */
  87         STATEVARS;
  88         states st;              /* current states */
  89         states fresh;           /* states for a fresh start */
  90         states tmp;             /* temporary */
  91         states empty;           /* empty set of states */
  92         mbstate_t mbs;          /* multibyte conversion state */
  93 };
  94 
  95 /* ========= begin header generated by ./mkh ========= */
  96 #ifdef __cplusplus
  97 extern "C" {
  98 #endif
  99 
 100 /* === engine.c === */
 101 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int);
 102 static const char *dissect(struct match *, const char *, const char *,
 103     sopno, sopno);
 104 static const char *backref(struct match *, const char *, const char *, sopno,
 105     sopno, sopno, int);
 106 static const char *walk(struct match *m, const char *start, const char *stop,
 107     sopno startst, sopno stopst, bool fast);
 108 static states step(struct re_guts *, sopno, sopno, states, wint_t, states);
 109 #define MAX_RECURSION   100
 110 #define BOL     (OUT-1)
 111 #define EOL     (BOL-1)
 112 #define BOLEOL  (BOL-2)
 113 #define NOTHING (BOL-3)
 114 #define BOW     (BOL-4)
 115 #define EOW     (BOL-5)
 116 #define BADCHAR (BOL-6)
 117 #define NONCHAR(c)      ((c) <= OUT)
 118 #ifdef REDEBUG
 119 static void print(struct match *, const char *, states, int, FILE *);
 120 #endif
 121 #ifdef REDEBUG
 122 static void at(struct match *, const char *, const char *, const char *,
 123     sopno, sopno);
 124 #endif
 125 #ifdef REDEBUG
 126 static const char *pchar(int ch);
 127 #endif
 128 
 129 #ifdef __cplusplus
 130 }
 131 #endif
 132 /* ========= end header generated by ./mkh ========= */
 133 
 134 #ifdef REDEBUG
 135 #define SP(t, s, c)     print(m, t, s, c, stdout)
 136 #define AT(t, p1, p2, s1, s2)   at(m, t, p1, p2, s1, s2)
 137 #define NOTE(str)       { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
 138 #else
 139 #define SP(t, s, c)     /* nothing */
 140 #define AT(t, p1, p2, s1, s2)   /* nothing */
 141 #define NOTE(s) /* nothing */
 142 #endif
 143 
 144 /*
 145  * matcher - the actual matching engine
 146  */
 147 static int                      /* 0 success, REG_NOMATCH failure */
 148 matcher(struct re_guts *g, const char *string, size_t nmatch,
 149     regmatch_t pmatch[], int eflags)
 150 {
 151         const char *endp;
 152         size_t i;
 153         struct match mv;
 154         struct match *m = &mv;
 155         const char *dp = NULL;
 156         const sopno gf = g->firststate+1;    /* +1 for OEND */
 157         const sopno gl = g->laststate;
 158         const char *start;
 159         const char *stop;
 160         /* Boyer-Moore algorithms variables */
 161         const char *pp;
 162         int cj, mj;
 163         const char *mustfirst;
 164         const char *mustlast;
 165         int *matchjump;
 166         int *charjump;
 167 
 168         /* simplify the situation where possible */
 169         if (g->cflags&REG_NOSUB)
 170                 nmatch = 0;
 171 
 172         if (eflags&REG_STARTEND) {
 173                 start = string + pmatch[0].rm_so;
 174                 stop = string + pmatch[0].rm_eo;
 175         } else {
 176                 start = string;
 177                 stop = start + strlen(start);
 178         }
 179 
 180         if (stop < start)
 181                 return (REG_EFATAL);
 182 
 183         /* prescreening; this does wonders for this rather slow code */
 184         if (g->must != NULL) {
 185                 if (g->charjump != NULL && g->matchjump != NULL) {
 186                         mustfirst = g->must;
 187                         mustlast = g->must + g->mlen - 1;
 188                         charjump = g->charjump;
 189                         matchjump = g->matchjump;
 190                         pp = mustlast;
 191                         for (dp = start+g->mlen-1; dp < stop; ) {
 192                                 /* Fast skip non-matches */
 193                                 while (dp < stop && charjump[(int)*dp])
 194                                         dp += charjump[(int)*dp];
 195 
 196                                 if (dp >= stop)
 197                                         break;
 198 
 199                                 /* Greedy matcher */
 200                                 /*
 201                                  * We depend on not being used for
 202                                  * for strings of length 1
 203                                  */
 204                                 while (*--dp == *--pp && pp != mustfirst)
 205                                         ;
 206 
 207                                 if (*dp == *pp)
 208                                         break;
 209 
 210                                 /* Jump to next possible match */
 211                                 mj = matchjump[pp - mustfirst];
 212                                 cj = charjump[(int)*dp];
 213                                 dp += (cj < mj ? mj : cj);
 214                                 pp = mustlast;
 215                         }
 216                         if (pp != mustfirst)
 217                                 return (REG_NOMATCH);
 218                 } else {
 219                         for (dp = start; dp < stop; dp++)
 220                                 if (*dp == g->must[0] &&
 221                                     stop - dp >= g->mlen &&
 222                                     memcmp(dp, g->must, (size_t)g->mlen) == 0)
 223                                         break;
 224                         if (dp == stop)         /* we didn't find g->must */
 225                                 return (REG_NOMATCH);
 226                 }
 227         }
 228 
 229         /* match struct setup */
 230         m->g = g;
 231         m->eflags = eflags;
 232         m->pmatch = NULL;
 233         m->lastpos = NULL;
 234         m->offp = string;
 235         m->beginp = start;
 236         m->endp = stop;
 237         STATESETUP(m, 4);
 238         SETUP(m->st);
 239         SETUP(m->fresh);
 240         SETUP(m->tmp);
 241         SETUP(m->empty);
 242         CLEAR(m->empty);
 243         ZAPSTATE(&m->mbs);
 244 
 245         /* Adjust start according to moffset, to speed things up */
 246         if (dp != NULL && g->moffset > -1)
 247                 start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
 248 
 249         SP("mloop", m->st, *start);
 250 
 251         /* this loop does only one repetition except for backrefs */
 252         for (;;) {
 253                 endp = walk(m, start, stop, gf, gl, true);
 254                 if (endp == NULL) {             /* a miss */
 255                         if (m->pmatch != NULL)
 256                                 free((char *)m->pmatch);
 257                         if (m->lastpos != NULL)
 258                                 free((char *)m->lastpos);
 259                         STATETEARDOWN(m);
 260                         return (REG_NOMATCH);
 261                 }
 262                 if (nmatch == 0 && !g->backrefs)
 263                         break;          /* no further info needed */
 264 
 265                 /* where? */
 266                 assert(m->coldp != NULL);
 267                 for (;;) {
 268                         NOTE("finding start");
 269                         endp = walk(m, m->coldp, stop, gf, gl, false);
 270                         if (endp != NULL)
 271                                 break;
 272                         assert(m->coldp < m->endp);
 273                         m->coldp += XMBRTOWC(NULL, m->coldp,
 274                             m->endp - m->coldp, &m->mbs, 0);
 275                 }
 276                 if (nmatch == 1 && !g->backrefs)
 277                         break;          /* no further info needed */
 278 
 279                 /* oh my, it wants the subexpressions... */
 280                 if (m->pmatch == NULL)
 281                         m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
 282                             sizeof (regmatch_t));
 283                 if (m->pmatch == NULL) {
 284                         STATETEARDOWN(m);
 285                         return (REG_ESPACE);
 286                 }
 287                 for (i = 1; i <= m->g->nsub; i++)
 288                         m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
 289                 if (!g->backrefs && !(m->eflags&REG_BACKR)) {
 290                         NOTE("dissecting");
 291                         dp = dissect(m, m->coldp, endp, gf, gl);
 292                 } else {
 293                         if (g->nplus > 0 && m->lastpos == NULL)
 294                                 m->lastpos = malloc((g->nplus+1) *
 295                                     sizeof (const char *));
 296                         if (g->nplus > 0 && m->lastpos == NULL) {
 297                                 free(m->pmatch);
 298                                 STATETEARDOWN(m);
 299                                 return (REG_ESPACE);
 300                         }
 301                         NOTE("backref dissect");
 302                         dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
 303                 }
 304                 if (dp != NULL)
 305                         break;
 306 
 307                 /* uh-oh... we couldn't find a subexpression-level match */
 308                 assert(g->backrefs); /* must be back references doing it */
 309                 assert(g->nplus == 0 || m->lastpos != NULL);
 310                 for (;;) {
 311                         if (dp != NULL || endp <= m->coldp)
 312                                 break;          /* defeat */
 313                         NOTE("backoff");
 314                         endp = walk(m, m->coldp, endp-1, gf, gl, false);
 315                         if (endp == NULL)
 316                                 break;          /* defeat */
 317                         /* try it on a shorter possibility */
 318 #ifndef NDEBUG
 319                         for (i = 1; i <= m->g->nsub; i++) {
 320                                 assert(m->pmatch[i].rm_so == -1);
 321                                 assert(m->pmatch[i].rm_eo == -1);
 322                         }
 323 #endif
 324                         NOTE("backoff dissect");
 325                         dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
 326                 }
 327                 assert(dp == NULL || dp == endp);
 328                 if (dp != NULL)         /* found a shorter one */
 329                         break;
 330 
 331                 /* despite initial appearances, there is no match here */
 332                 NOTE("false alarm");
 333                 /* recycle starting later */
 334                 start = m->coldp + XMBRTOWC(NULL, m->coldp,
 335                     stop - m->coldp, &m->mbs, 0);
 336                 assert(start <= stop);
 337         }
 338 
 339         /* fill in the details if requested */
 340         if (nmatch > 0) {
 341                 pmatch[0].rm_so = m->coldp - m->offp;
 342                 pmatch[0].rm_eo = endp - m->offp;
 343         }
 344         if (nmatch > 1) {
 345                 assert(m->pmatch != NULL);
 346                 for (i = 1; i < nmatch; i++)
 347                         if (i <= m->g->nsub)
 348                                 pmatch[i] = m->pmatch[i];
 349                         else {
 350                                 pmatch[i].rm_so = -1;
 351                                 pmatch[i].rm_eo = -1;
 352                         }
 353         }
 354 
 355         if (m->pmatch != NULL)
 356                 free((char *)m->pmatch);
 357         if (m->lastpos != NULL)
 358                 free((char *)m->lastpos);
 359         STATETEARDOWN(m);
 360         return (0);
 361 }
 362 
 363 /*
 364  * dissect - figure out what matched what, no back references
 365  */
 366 static const char *
 367 dissect(struct match *m, const char *start, const char *stop, sopno startst,
 368     sopno stopst)
 369 {
 370         int i;
 371         sopno ss;               /* start sop of current subRE */
 372         sopno es;               /* end sop of current subRE */
 373         const char *sp;         /* start of string matched by it */
 374         const char *stp;        /* string matched by it cannot pass here */
 375         const char *rest;       /* start of rest of string */
 376         const char *tail;       /* string unmatched by rest of RE */
 377         sopno ssub;             /* start sop of subsubRE */
 378         sopno esub;             /* end sop of subsubRE */
 379         const char *ssp;        /* start of string matched by subsubRE */
 380         const char *sep;        /* end of string matched by subsubRE */
 381         const char *oldssp;     /* previous ssp */
 382         const char *dp;
 383 
 384         (void) dp;
 385         AT("diss", start, stop, startst, stopst);
 386         sp = start;
 387         for (ss = startst; ss < stopst; ss = es) {
 388                 /* identify end of subRE */
 389                 es = ss;
 390                 switch (OP(m->g->strip[es])) {
 391                 case OPLUS_:
 392                 case OQUEST_:
 393                         es += OPND(m->g->strip[es]);
 394                         break;
 395                 case OCH_:
 396                         while (OP(m->g->strip[es]) != (sop)O_CH)
 397                                 es += OPND(m->g->strip[es]);
 398                         break;
 399                 }
 400                 es++;
 401 
 402                 /* figure out what it matched */
 403                 switch (OP(m->g->strip[ss])) {
 404                 case OEND:
 405                         assert(0);
 406                         break;
 407                 case OCHAR:
 408                         sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
 409                         break;
 410                 case OBOL:
 411                 case OEOL:
 412                 case OBOW:
 413                 case OEOW:
 414                         break;
 415                 case OANY:
 416                 case OANYOF:
 417                         sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
 418                         break;
 419                 case OBACK_:
 420                 case O_BACK:
 421                         assert(0);
 422                         break;
 423                 /* cases where length of match is hard to find */
 424                 case OQUEST_:
 425                         stp = stop;
 426                         for (;;) {
 427                                 /* how long could this one be? */
 428                                 rest = walk(m, sp, stp, ss, es, false);
 429                                 assert(rest != NULL);   /* it did match */
 430                                 /* could the rest match the rest? */
 431                                 tail = walk(m, rest, stop, es, stopst, false);
 432                                 if (tail == stop)
 433                                         break;          /* yes! */
 434                                 /* no -- try a shorter match for this one */
 435                                 stp = rest - 1;
 436                                 assert(stp >= sp);   /* it did work */
 437                         }
 438                         ssub = ss + 1;
 439                         esub = es - 1;
 440                         /* did innards match? */
 441                         if (walk(m, sp, rest, ssub, esub, false) != NULL) {
 442                                 dp = dissect(m, sp, rest, ssub, esub);
 443                                 assert(dp == rest);
 444                         } else          /* no */
 445                                 assert(sp == rest);
 446                         sp = rest;
 447                         break;
 448                 case OPLUS_:
 449                         stp = stop;
 450                         for (;;) {
 451                                 /* how long could this one be? */
 452                                 rest = walk(m, sp, stp, ss, es, false);
 453                                 assert(rest != NULL);   /* it did match */
 454                                 /* could the rest match the rest? */
 455                                 tail = walk(m, rest, stop, es, stopst, false);
 456                                 if (tail == stop)
 457                                         break;          /* yes! */
 458                                 /* no -- try a shorter match for this one */
 459                                 stp = rest - 1;
 460                                 assert(stp >= sp);   /* it did work */
 461                         }
 462                         ssub = ss + 1;
 463                         esub = es - 1;
 464                         ssp = sp;
 465                         oldssp = ssp;
 466                         for (;;) {      /* find last match of innards */
 467                                 sep = walk(m, ssp, rest, ssub, esub, false);
 468                                 if (sep == NULL || sep == ssp)
 469                                         break;  /* failed or matched null */
 470                                 oldssp = ssp;   /* on to next try */
 471                                 ssp = sep;
 472                         }
 473                         if (sep == NULL) {
 474                                 /* last successful match */
 475                                 sep = ssp;
 476                                 ssp = oldssp;
 477                         }
 478                         assert(sep == rest);    /* must exhaust substring */
 479                         assert(walk(m, ssp, sep, ssub, esub, false) == rest);
 480                         dp = dissect(m, ssp, sep, ssub, esub);
 481                         assert(dp == sep);
 482                         sp = rest;
 483                         break;
 484                 case OCH_:
 485                         stp = stop;
 486                         for (;;) {
 487                                 /* how long could this one be? */
 488                                 rest = walk(m, sp, stp, ss, es, false);
 489                                 assert(rest != NULL);   /* it did match */
 490                                 /* could the rest match the rest? */
 491                                 tail = walk(m, rest, stop, es, stopst, false);
 492                                 if (tail == stop)
 493                                         break;          /* yes! */
 494                                 /* no -- try a shorter match for this one */
 495                                 stp = rest - 1;
 496                                 assert(stp >= sp);   /* it did work */
 497                         }
 498                         ssub = ss + 1;
 499                         esub = ss + OPND(m->g->strip[ss]) - 1;
 500                         assert(OP(m->g->strip[esub]) == OOR1);
 501                         for (;;) {      /* find first matching branch */
 502                                 if (walk(m, sp, rest, ssub, esub,
 503                                     false) == rest)
 504                                         break;  /* it matched all of it */
 505                                 /* that one missed, try next one */
 506                                 assert(OP(m->g->strip[esub]) == OOR1);
 507                                 esub++;
 508                                 assert(OP(m->g->strip[esub]) == OOR2);
 509                                 ssub = esub + 1;
 510                                 esub += OPND(m->g->strip[esub]);
 511                                 if (OP(m->g->strip[esub]) == (sop)OOR2)
 512                                         esub--;
 513                                 else
 514                                         assert(OP(m->g->strip[esub]) == O_CH);
 515                         }
 516                         dp = dissect(m, sp, rest, ssub, esub);
 517                         assert(dp == rest);
 518                         sp = rest;
 519                         break;
 520                 case O_PLUS:
 521                 case O_QUEST:
 522                 case OOR1:
 523                 case OOR2:
 524                 case O_CH:
 525                         assert(0);
 526                         break;
 527                 case OLPAREN:
 528                         i = OPND(m->g->strip[ss]);
 529                         assert(0 < i && i <= m->g->nsub);
 530                         m->pmatch[i].rm_so = sp - m->offp;
 531                         break;
 532                 case ORPAREN:
 533                         i = OPND(m->g->strip[ss]);
 534                         assert(0 < i && i <= m->g->nsub);
 535                         m->pmatch[i].rm_eo = sp - m->offp;
 536                         break;
 537                 default:                /* uh oh */
 538                         assert(0);
 539                         break;
 540                 }
 541         }
 542 
 543         assert(sp == stop);
 544         return (sp);
 545 }
 546 
 547 /*
 548  * backref - figure out what matched what, figuring in back references
 549  */
 550 static const char *
 551 backref(struct match *m, const char *start, const char *stop, sopno startst,
 552     sopno stopst, sopno lev,            /* PLUS nesting level */
 553     int rec)
 554 {
 555         int i;
 556         sopno ss;               /* start sop of current subRE */
 557         const char *sp;         /* start of string matched by it */
 558         sopno ssub;             /* start sop of subsubRE */
 559         sopno esub;             /* end sop of subsubRE */
 560         const char *ssp;        /* start of string matched by subsubRE */
 561         const char *dp;
 562         size_t len;
 563         int hard;
 564         sop s;
 565         regoff_t offsave;
 566         cset *cs;
 567         wint_t wc;
 568 
 569         AT("back", start, stop, startst, stopst);
 570         sp = start;
 571 
 572         /* get as far as we can with easy stuff */
 573         hard = 0;
 574         for (ss = startst; !hard && ss < stopst; ss++)
 575                 switch (OP(s = m->g->strip[ss])) {
 576                 case OCHAR:
 577                         if (sp == stop)
 578                                 return (NULL);
 579                         sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
 580                         if (wc != OPND(s))
 581                                 return (NULL);
 582                         break;
 583                 case OANY:
 584                         if (sp == stop)
 585                                 return (NULL);
 586                         sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
 587                         if (wc == BADCHAR)
 588                                 return (NULL);
 589                         break;
 590                 case OANYOF:
 591                         if (sp == stop)
 592                                 return (NULL);
 593                         cs = &m->g->sets[OPND(s)];
 594                         sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
 595                         if (wc == BADCHAR || !CHIN(cs, wc))
 596                                 return (NULL);
 597                         break;
 598                 case OBOL:
 599                         if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
 600                             (sp > m->offp && sp < m->endp &&
 601                             *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE))) {
 602                                 break;
 603                         }
 604                         return (NULL);
 605                 case OEOL:
 606                         if ((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
 607                             (sp < m->endp && *sp == '\n' &&
 608                             (m->g->cflags&REG_NEWLINE))) {
 609                                 break;
 610                         }
 611                         return (NULL);
 612                 case OBOW:
 613                         if (sp < m->endp && ISWORD(*sp) &&
 614                             ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
 615                             (sp > m->offp && !ISWORD(*(sp-1))))) {
 616                                 break;
 617                         }
 618                         return (NULL);
 619                 case OEOW:
 620                         if (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
 621                             (sp < m->endp && *sp == '\n' &&
 622                             (m->g->cflags&REG_NEWLINE)) ||
 623                             (sp < m->endp && !ISWORD(*sp))) &&
 624                             (sp > m->beginp && ISWORD(*(sp-1)))) {
 625                                 break;
 626                         }
 627                         return (NULL);
 628                 case O_QUEST:
 629                         break;
 630                 case OOR1:      /* matches null but needs to skip */
 631                         ss++;
 632                         s = m->g->strip[ss];
 633                         do {
 634                                 assert(OP(s) == OOR2);
 635                                 ss += OPND(s);
 636                         } while (OP(s = m->g->strip[ss]) != (sop)O_CH);
 637                         /* note that the ss++ gets us past the O_CH */
 638                         break;
 639                 default:        /* have to make a choice */
 640                         hard = 1;
 641                         break;
 642                 }
 643         if (!hard) {            /* that was it! */
 644                 if (sp != stop)
 645                         return (NULL);
 646                 return (sp);
 647         }
 648         ss--;                   /* adjust for the for's final increment */
 649 
 650         /* the hard stuff */
 651         AT("hard", sp, stop, ss, stopst);
 652         s = m->g->strip[ss];
 653         switch (OP(s)) {
 654         case OBACK_:            /* the vilest depths */
 655                 i = OPND(s);
 656                 assert(0 < i && i <= m->g->nsub);
 657                 if (m->pmatch[i].rm_eo == -1)
 658                         return (NULL);
 659                 assert(m->pmatch[i].rm_so != -1);
 660                 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
 661                 if (len == 0 && rec++ > MAX_RECURSION)
 662                         return (NULL);
 663                 assert(stop - m->beginp >= len);
 664                 if (sp > stop - len)
 665                         return (NULL);  /* not enough left to match */
 666                 ssp = m->offp + m->pmatch[i].rm_so;
 667                 if (memcmp(sp, ssp, len) != 0)
 668                         return (NULL);
 669                 while (m->g->strip[ss] != (sop)SOP(O_BACK, i))
 670                         ss++;
 671                 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec));
 672         case OQUEST_:           /* to null or not */
 673                 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
 674                 if (dp != NULL)
 675                         return (dp);    /* not */
 676                 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
 677         case OPLUS_:
 678                 assert(m->lastpos != NULL);
 679                 assert(lev+1 <= m->g->nplus);
 680                 m->lastpos[lev+1] = sp;
 681                 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec));
 682         case O_PLUS:
 683                 if (sp == m->lastpos[lev])   /* last pass matched null */
 684                         return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
 685                 /* try another pass */
 686                 m->lastpos[lev] = sp;
 687                 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
 688                 if (dp == NULL)
 689                         return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
 690                 return (dp);
 691         case OCH_:              /* find the right one, if any */
 692                 ssub = ss + 1;
 693                 esub = ss + OPND(s) - 1;
 694                 assert(OP(m->g->strip[esub]) == OOR1);
 695                 for (;;) {      /* find first matching branch */
 696                         dp = backref(m, sp, stop, ssub, esub, lev, rec);
 697                         if (dp != NULL)
 698                                 return (dp);
 699                         /* that one missed, try next one */
 700                         if (OP(m->g->strip[esub]) == (sop)O_CH)
 701                                 return (NULL);  /* there is none */
 702                         esub++;
 703                         assert(OP(m->g->strip[esub]) == (sop)OOR2);
 704                         ssub = esub + 1;
 705                         esub += OPND(m->g->strip[esub]);
 706                         if (OP(m->g->strip[esub]) == (sop)OOR2)
 707                                 esub--;
 708                         else
 709                                 assert(OP(m->g->strip[esub]) == O_CH);
 710                 }
 711                 /* NOTREACHED */
 712                 break;
 713         case OLPAREN:           /* must undo assignment if rest fails */
 714                 i = OPND(s);
 715                 assert(0 < i && i <= m->g->nsub);
 716                 offsave = m->pmatch[i].rm_so;
 717                 m->pmatch[i].rm_so = sp - m->offp;
 718                 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
 719                 if (dp != NULL)
 720                         return (dp);
 721                 m->pmatch[i].rm_so = offsave;
 722                 return (NULL);
 723         case ORPAREN:           /* must undo assignment if rest fails */
 724                 i = OPND(s);
 725                 assert(0 < i && i <= m->g->nsub);
 726                 offsave = m->pmatch[i].rm_eo;
 727                 m->pmatch[i].rm_eo = sp - m->offp;
 728                 dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
 729                 if (dp != NULL)
 730                         return (dp);
 731                 m->pmatch[i].rm_eo = offsave;
 732                 return (NULL);
 733         default:                /* uh oh */
 734                 assert(0);
 735                 break;
 736         }
 737 
 738         /* "can't happen" */
 739         assert(0);
 740         return (NULL);
 741 }
 742 
 743 /*
 744  * Step through the string either quickly or slowly.  Returns where it ended
 745  * or NULL.
 746  */
 747 static const char *
 748 walk(struct match *m, const char *start, const char *stop, sopno startst,
 749     sopno stopst, bool fast)
 750 {
 751         states st = m->st;
 752         states fresh = m->fresh;
 753         states empty = m->empty;
 754         states tmp = m->tmp;
 755         const char *p = start;
 756         wint_t c;
 757         wint_t lastc;           /* previous c */
 758         wint_t flagch;
 759         int i;
 760         const char *matchp;     /* last p at which a match ended */
 761         size_t clen;
 762 
 763         AT("slow", start, stop, startst, stopst);
 764         CLEAR(st);
 765         SET1(st, startst);
 766         SP("sstart", st, *p);
 767         st = step(m->g, startst, stopst, st, NOTHING, st);
 768         if (fast)
 769                 ASSIGN(fresh, st);
 770         matchp = NULL;
 771         if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
 772                 c = OUT;
 773         else {
 774                 /*
 775                  * XXX Wrong if the previous character was multi-byte.
 776                  * Newline never is (in encodings supported by FreeBSD),
 777                  * so this only breaks the ISWORD tests below.
 778                  */
 779                 c = (uch)*(start - 1);
 780         }
 781         for (;;) {
 782                 /* next character */
 783                 lastc = c;
 784                 if (p == m->endp) {
 785                         c = OUT;
 786                         clen = 0;
 787                 } else
 788                         clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
 789 
 790                 if (fast && EQ(st, fresh))
 791                         matchp = p;
 792 
 793                 /* is there an EOL and/or BOL between lastc and c? */
 794                 flagch = '\0';
 795                 i = 0;
 796                 if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
 797                     (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
 798                         flagch = BOL;
 799                         i = m->g->nbol;
 800                 }
 801                 if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
 802                     (c == OUT && !(m->eflags&REG_NOTEOL))) {
 803                         flagch = (flagch == BOL) ? BOLEOL : EOL;
 804                         i += m->g->neol;
 805                 }
 806                 if (i != 0) {
 807                         for (; i > 0; i--)
 808                                 st = step(m->g, startst, stopst, st,
 809                                     flagch, st);
 810                         SP("sboleol", st, c);
 811                 }
 812 
 813                 /* how about a word boundary? */
 814                 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
 815                     (c != OUT && ISWORD(c))) {
 816                         flagch = BOW;
 817                 }
 818                 if ((lastc != OUT && ISWORD(lastc)) &&
 819                     (flagch == EOL || (c != OUT && !ISWORD(c)))) {
 820                         flagch = EOW;
 821                 }
 822                 if (flagch == BOW || flagch == EOW) {
 823                         st = step(m->g, startst, stopst, st, flagch, st);
 824                         SP("sboweow", st, c);
 825                 }
 826 
 827                 /* are we done? */
 828                 if (ISSET(st, stopst)) {
 829                         if (fast)
 830                                 break;
 831                         else
 832                                 matchp = p;
 833                 }
 834                 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
 835                         break;          /* NOTE BREAK OUT */
 836 
 837                 /* no, we must deal with this character */
 838                 ASSIGN(tmp, st);
 839                 if (fast)
 840                         ASSIGN(st, fresh);
 841                 else
 842                         ASSIGN(st, empty);
 843                 assert(c != OUT);
 844                 st = step(m->g, startst, stopst, tmp, c, st);
 845                 SP("saft", st, c);
 846                 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
 847                 p += clen;
 848         }
 849 
 850         if (fast) {
 851                 assert(matchp != NULL);
 852                 m->coldp = matchp;
 853                 if (ISSET(st, stopst))
 854                         return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
 855                 else
 856                         return (NULL);
 857         } else
 858                 return (matchp);
 859 }
 860 
 861 /*
 862  * step - map set of states reachable before char to set reachable after
 863  */
 864 static states
 865 step(struct re_guts *g,
 866     sopno start,        /* start state within strip */
 867     sopno stop,         /* state after stop state within strip */
 868     states bef,         /* states reachable before */
 869     wint_t ch,          /* character or NONCHAR code */
 870     states aft)         /* states already known reachable after */
 871 {
 872         cset *cs;
 873         sop s;
 874         sopno pc;
 875         onestate here;          /* note, macros know this name */
 876         sopno look;
 877         int i;
 878 
 879         for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
 880                 s = g->strip[pc];
 881                 switch (OP(s)) {
 882                 case OEND:
 883                         assert(pc == stop-1);
 884                         break;
 885                 case OCHAR:
 886                         /* only characters can match */
 887                         assert(!NONCHAR(ch) || ch != OPND(s));
 888                         if (ch == OPND(s))
 889                                 FWD(aft, bef, 1);
 890                         break;
 891                 case OBOL:
 892                         if (ch == BOL || ch == BOLEOL)
 893                                 FWD(aft, bef, 1);
 894                         break;
 895                 case OEOL:
 896                         if (ch == EOL || ch == BOLEOL)
 897                                 FWD(aft, bef, 1);
 898                         break;
 899                 case OBOW:
 900                         if (ch == BOW)
 901                                 FWD(aft, bef, 1);
 902                         break;
 903                 case OEOW:
 904                         if (ch == EOW)
 905                                 FWD(aft, bef, 1);
 906                         break;
 907                 case OANY:
 908                         if (!NONCHAR(ch))
 909                                 FWD(aft, bef, 1);
 910                         break;
 911                 case OANYOF:
 912                         cs = &g->sets[OPND(s)];
 913                         if (!NONCHAR(ch) && CHIN(cs, ch))
 914                                 FWD(aft, bef, 1);
 915                         break;
 916                 case OBACK_:            /* ignored here */
 917                 case O_BACK:
 918                         FWD(aft, aft, 1);
 919                         break;
 920                 case OPLUS_:            /* forward, this is just an empty */
 921                         FWD(aft, aft, 1);
 922                         break;
 923                 case O_PLUS:            /* both forward and back */
 924                         FWD(aft, aft, 1);
 925                         i = ISSETBACK(aft, OPND(s));
 926                         BACK(aft, aft, OPND(s));
 927                         if (!i && ISSETBACK(aft, OPND(s))) {
 928                                 /* oho, must reconsider loop body */
 929                                 pc -= OPND(s) + 1;
 930                                 INIT(here, pc);
 931                         }
 932                         break;
 933                 case OQUEST_:           /* two branches, both forward */
 934                         FWD(aft, aft, 1);
 935                         FWD(aft, aft, OPND(s));
 936                         break;
 937                 case O_QUEST:           /* just an empty */
 938                         FWD(aft, aft, 1);
 939                         break;
 940                 case OLPAREN:           /* not significant here */
 941                 case ORPAREN:
 942                         FWD(aft, aft, 1);
 943                         break;
 944                 case OCH_:              /* mark the first two branches */
 945                         FWD(aft, aft, 1);
 946                         assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
 947                         FWD(aft, aft, OPND(s));
 948                         break;
 949                 case OOR1:              /* done a branch, find the O_CH */
 950                         if (ISSTATEIN(aft, here)) {
 951                                 for (look = 1;
 952                                     OP(s = g->strip[pc+look]) != (sop)O_CH;
 953                                     look += OPND(s))
 954                                         assert(OP(s) == (sop)OOR2);
 955                                 FWD(aft, aft, look + 1);
 956                         }
 957                         break;
 958                 case OOR2:              /* propagate OCH_'s marking */
 959                         FWD(aft, aft, 1);
 960                         if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) {
 961                                 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
 962                                 FWD(aft, aft, OPND(s));
 963                         }
 964                         break;
 965                 case O_CH:              /* just empty */
 966                         FWD(aft, aft, 1);
 967                         break;
 968                 default:                /* ooooops... */
 969                         assert(0);
 970                         break;
 971                 }
 972         }
 973 
 974         return (aft);
 975 }
 976 
 977 #ifdef REDEBUG
 978 /*
 979  * print - print a set of states
 980  */
 981 static void
 982 print(struct match *m, const char *caption, states st, int ch, FILE *d)
 983 {
 984         struct re_guts *g = m->g;
 985         sopno i;
 986         int first = 1;
 987 
 988         if (!(m->eflags&REG_TRACE))
 989                 return;
 990 
 991         (void) fprintf(d, "%s", caption);
 992         if (ch != '\0')
 993                 (void) fprintf(d, " %s", pchar(ch));
 994         for (i = 0; i < g->nstates; i++)
 995                 if (ISSET(st, i)) {
 996                         (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
 997                         first = 0;
 998                 }
 999         (void) fprintf(d, "\n");
1000 }
1001 
1002 /*
1003  * at - print current situation
1004  */
1005 static void
1006 at(struct match *m, const char *title, const char *start, const char *stop,
1007     sopno startst, sopno stopst)
1008 {
1009         if (!(m->eflags&REG_TRACE))
1010                 return;
1011 
1012         (void) printf("%s %s-", title, pchar(*start));
1013         (void) printf("%s ", pchar(*stop));
1014         (void) printf("%ld-%ld\n", (long)startst, (long)stopst);
1015 }
1016 
1017 #ifndef PCHARDONE
1018 #define PCHARDONE       /* never again */
1019 /*
1020  * pchar - make a character printable
1021  *
1022  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1023  * duplicate here avoids having a debugging-capable regexec.o tied to
1024  * a matching debug.o, and this is convenient.  It all disappears in
1025  * the non-debug compilation anyway, so it doesn't matter much.
1026  */
1027 static const char *
1028 pchar(int ch)
1029 {
1030         static char pbuf[10];
1031 
1032         if (isprint((uch)ch) || ch == ' ')
1033                 (void) sprintf(pbuf, "%c", ch);
1034         else
1035                 (void) sprintf(pbuf, "\\%o", ch);
1036         return (pbuf);
1037 }
1038 #endif
1039 #endif
1040 
1041 #undef  matcher
1042 #undef  walk
1043 #undef  dissect
1044 #undef  backref
1045 #undef  step
1046 #undef  print
1047 #undef  at
1048 #undef  match