1 /* 2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Milan Jurik. All rights reserved. 4 * Copyright (c) 2016 by Delphix. All rights reserved. 5 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 6 * Copyright (c) 1992, 1993, 1994 7 * The Regents of the University of California. All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Henry Spencer. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #include <stdbool.h> 38 39 /* 40 * The matching engine and friends. This file is #included by regexec.c 41 * after suitable #defines of a variety of macros used herein, so that 42 * different state representations can be used without duplicating masses 43 * of code. 44 */ 45 46 #ifdef SNAMES 47 #define matcher smatcher 48 #define walk swalk 49 #define dissect sdissect 50 #define backref sbackref 51 #define step sstep 52 #define print sprint 53 #define at sat 54 #define match smat 55 #endif 56 #ifdef LNAMES 57 #define matcher lmatcher 58 #define walk lwalk 59 #define dissect ldissect 60 #define backref lbackref 61 #define step lstep 62 #define print lprint 63 #define at lat 64 #define match lmat 65 #endif 66 #ifdef MNAMES 67 #define matcher mmatcher 68 #define walk mwalk 69 #define dissect mdissect 70 #define backref mbackref 71 #define step mstep 72 #define print mprint 73 #define at mat 74 #define match mmat 75 #endif 76 77 /* another structure passed up and down to avoid zillions of parameters */ 78 struct match { 79 struct re_guts *g; 80 int eflags; 81 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 82 const char *offp; /* offsets work from here */ 83 const char *beginp; /* start of string -- virtual NUL precedes */ 84 const char *endp; /* end of string -- virtual NUL here */ 85 const char *coldp; /* can be no match starting before here */ 86 const char **lastpos; /* [nplus+1] */ 87 STATEVARS; 88 states st; /* current states */ 89 states fresh; /* states for a fresh start */ 90 states tmp; /* temporary */ 91 states empty; /* empty set of states */ 92 mbstate_t mbs; /* multibyte conversion state */ 93 }; 94 95 /* ========= begin header generated by ./mkh ========= */ 96 #ifdef __cplusplus 97 extern "C" { 98 #endif 99 100 /* === engine.c === */ 101 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 102 static const char *dissect(struct match *, const char *, const char *, 103 sopno, sopno); 104 static const char *backref(struct match *, const char *, const char *, sopno, 105 sopno, sopno, int); 106 static const char *walk(struct match *m, const char *start, const char *stop, 107 sopno startst, sopno stopst, bool fast); 108 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 109 #define MAX_RECURSION 100 110 #define BOL (OUT-1) 111 #define EOL (BOL-1) 112 #define BOLEOL (BOL-2) 113 #define NOTHING (BOL-3) 114 #define BOW (BOL-4) 115 #define EOW (BOL-5) 116 #define BADCHAR (BOL-6) 117 #define NONCHAR(c) ((c) <= OUT) 118 #ifdef REDEBUG 119 static void print(struct match *, const char *, states, int, FILE *); 120 #endif 121 #ifdef REDEBUG 122 static void at(struct match *, const char *, const char *, const char *, 123 sopno, sopno); 124 #endif 125 #ifdef REDEBUG 126 static const char *pchar(int ch); 127 #endif 128 129 #ifdef __cplusplus 130 } 131 #endif 132 /* ========= end header generated by ./mkh ========= */ 133 134 #ifdef REDEBUG 135 #define SP(t, s, c) print(m, t, s, c, stdout) 136 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 137 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 138 #else 139 #define SP(t, s, c) /* nothing */ 140 #define AT(t, p1, p2, s1, s2) /* nothing */ 141 #define NOTE(s) /* nothing */ 142 #endif 143 144 /* 145 * matcher - the actual matching engine 146 */ 147 static int /* 0 success, REG_NOMATCH failure */ 148 matcher(struct re_guts *g, const char *string, size_t nmatch, 149 regmatch_t pmatch[], int eflags) 150 { 151 const char *endp; 152 size_t i; 153 struct match mv; 154 struct match *m = &mv; 155 const char *dp = NULL; 156 const sopno gf = g->firststate+1; /* +1 for OEND */ 157 const sopno gl = g->laststate; 158 const char *start; 159 const char *stop; 160 /* Boyer-Moore algorithms variables */ 161 const char *pp; 162 int cj, mj; 163 const char *mustfirst; 164 const char *mustlast; 165 int *matchjump; 166 int *charjump; 167 168 /* simplify the situation where possible */ 169 if (g->cflags®_NOSUB) 170 nmatch = 0; 171 172 if (eflags®_STARTEND) { 173 start = string + pmatch[0].rm_so; 174 stop = string + pmatch[0].rm_eo; 175 } else { 176 start = string; 177 stop = start + strlen(start); 178 } 179 180 if (stop < start) 181 return (REG_EFATAL); 182 183 /* prescreening; this does wonders for this rather slow code */ 184 if (g->must != NULL) { 185 if (g->charjump != NULL && g->matchjump != NULL) { 186 mustfirst = g->must; 187 mustlast = g->must + g->mlen - 1; 188 charjump = g->charjump; 189 matchjump = g->matchjump; 190 pp = mustlast; 191 for (dp = start+g->mlen-1; dp < stop; ) { 192 /* Fast skip non-matches */ 193 while (dp < stop && charjump[(int)*dp]) 194 dp += charjump[(int)*dp]; 195 196 if (dp >= stop) 197 break; 198 199 /* Greedy matcher */ 200 /* 201 * We depend on not being used for 202 * for strings of length 1 203 */ 204 while (*--dp == *--pp && pp != mustfirst) 205 ; 206 207 if (*dp == *pp) 208 break; 209 210 /* Jump to next possible match */ 211 mj = matchjump[pp - mustfirst]; 212 cj = charjump[(int)*dp]; 213 dp += (cj < mj ? mj : cj); 214 pp = mustlast; 215 } 216 if (pp != mustfirst) 217 return (REG_NOMATCH); 218 } else { 219 for (dp = start; dp < stop; dp++) 220 if (*dp == g->must[0] && 221 stop - dp >= g->mlen && 222 memcmp(dp, g->must, (size_t)g->mlen) == 0) 223 break; 224 if (dp == stop) /* we didn't find g->must */ 225 return (REG_NOMATCH); 226 } 227 } 228 229 /* match struct setup */ 230 m->g = g; 231 m->eflags = eflags; 232 m->pmatch = NULL; 233 m->lastpos = NULL; 234 m->offp = string; 235 m->beginp = start; 236 m->endp = stop; 237 STATESETUP(m, 4); 238 SETUP(m->st); 239 SETUP(m->fresh); 240 SETUP(m->tmp); 241 SETUP(m->empty); 242 CLEAR(m->empty); 243 ZAPSTATE(&m->mbs); 244 245 /* Adjust start according to moffset, to speed things up */ 246 if (dp != NULL && g->moffset > -1) 247 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 248 249 SP("mloop", m->st, *start); 250 251 /* this loop does only one repetition except for backrefs */ 252 for (;;) { 253 endp = walk(m, start, stop, gf, gl, true); 254 if (endp == NULL) { /* a miss */ 255 if (m->pmatch != NULL) 256 free((char *)m->pmatch); 257 if (m->lastpos != NULL) 258 free((char *)m->lastpos); 259 STATETEARDOWN(m); 260 return (REG_NOMATCH); 261 } 262 if (nmatch == 0 && !g->backrefs) 263 break; /* no further info needed */ 264 265 /* where? */ 266 assert(m->coldp != NULL); 267 for (;;) { 268 NOTE("finding start"); 269 endp = walk(m, m->coldp, stop, gf, gl, false); 270 if (endp != NULL) 271 break; 272 assert(m->coldp < m->endp); 273 m->coldp += XMBRTOWC(NULL, m->coldp, 274 m->endp - m->coldp, &m->mbs, 0); 275 } 276 if (nmatch == 1 && !g->backrefs) 277 break; /* no further info needed */ 278 279 /* oh my, it wants the subexpressions... */ 280 if (m->pmatch == NULL) 281 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 282 sizeof (regmatch_t)); 283 if (m->pmatch == NULL) { 284 STATETEARDOWN(m); 285 return (REG_ESPACE); 286 } 287 for (i = 1; i <= m->g->nsub; i++) 288 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 289 if (!g->backrefs && !(m->eflags®_BACKR)) { 290 NOTE("dissecting"); 291 dp = dissect(m, m->coldp, endp, gf, gl); 292 } else { 293 if (g->nplus > 0 && m->lastpos == NULL) 294 m->lastpos = malloc((g->nplus+1) * 295 sizeof (const char *)); 296 if (g->nplus > 0 && m->lastpos == NULL) { 297 free(m->pmatch); 298 STATETEARDOWN(m); 299 return (REG_ESPACE); 300 } 301 NOTE("backref dissect"); 302 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 303 } 304 if (dp != NULL) 305 break; 306 307 /* uh-oh... we couldn't find a subexpression-level match */ 308 assert(g->backrefs); /* must be back references doing it */ 309 assert(g->nplus == 0 || m->lastpos != NULL); 310 for (;;) { 311 if (dp != NULL || endp <= m->coldp) 312 break; /* defeat */ 313 NOTE("backoff"); 314 endp = walk(m, m->coldp, endp-1, gf, gl, false); 315 if (endp == NULL) 316 break; /* defeat */ 317 /* try it on a shorter possibility */ 318 #ifndef NDEBUG 319 for (i = 1; i <= m->g->nsub; i++) { 320 assert(m->pmatch[i].rm_so == -1); 321 assert(m->pmatch[i].rm_eo == -1); 322 } 323 #endif 324 NOTE("backoff dissect"); 325 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 326 } 327 assert(dp == NULL || dp == endp); 328 if (dp != NULL) /* found a shorter one */ 329 break; 330 331 /* despite initial appearances, there is no match here */ 332 NOTE("false alarm"); 333 /* recycle starting later */ 334 start = m->coldp + XMBRTOWC(NULL, m->coldp, 335 stop - m->coldp, &m->mbs, 0); 336 assert(start <= stop); 337 } 338 339 /* fill in the details if requested */ 340 if (nmatch > 0) { 341 pmatch[0].rm_so = m->coldp - m->offp; 342 pmatch[0].rm_eo = endp - m->offp; 343 } 344 if (nmatch > 1) { 345 assert(m->pmatch != NULL); 346 for (i = 1; i < nmatch; i++) 347 if (i <= m->g->nsub) 348 pmatch[i] = m->pmatch[i]; 349 else { 350 pmatch[i].rm_so = -1; 351 pmatch[i].rm_eo = -1; 352 } 353 } 354 355 if (m->pmatch != NULL) 356 free((char *)m->pmatch); 357 if (m->lastpos != NULL) 358 free((char *)m->lastpos); 359 STATETEARDOWN(m); 360 return (0); 361 } 362 363 /* 364 * dissect - figure out what matched what, no back references 365 */ 366 static const char * 367 dissect(struct match *m, const char *start, const char *stop, sopno startst, 368 sopno stopst) 369 { 370 int i; 371 sopno ss; /* start sop of current subRE */ 372 sopno es; /* end sop of current subRE */ 373 const char *sp; /* start of string matched by it */ 374 const char *stp; /* string matched by it cannot pass here */ 375 const char *rest; /* start of rest of string */ 376 const char *tail; /* string unmatched by rest of RE */ 377 sopno ssub; /* start sop of subsubRE */ 378 sopno esub; /* end sop of subsubRE */ 379 const char *ssp; /* start of string matched by subsubRE */ 380 const char *sep; /* end of string matched by subsubRE */ 381 const char *oldssp; /* previous ssp */ 382 const char *dp; 383 384 (void) dp; 385 AT("diss", start, stop, startst, stopst); 386 sp = start; 387 for (ss = startst; ss < stopst; ss = es) { 388 /* identify end of subRE */ 389 es = ss; 390 switch (OP(m->g->strip[es])) { 391 case OPLUS_: 392 case OQUEST_: 393 es += OPND(m->g->strip[es]); 394 break; 395 case OCH_: 396 while (OP(m->g->strip[es]) != (sop)O_CH) 397 es += OPND(m->g->strip[es]); 398 break; 399 } 400 es++; 401 402 /* figure out what it matched */ 403 switch (OP(m->g->strip[ss])) { 404 case OEND: 405 assert(0); 406 break; 407 case OCHAR: 408 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 409 break; 410 case OBOL: 411 case OEOL: 412 case OBOW: 413 case OEOW: 414 break; 415 case OANY: 416 case OANYOF: 417 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 418 break; 419 case OBACK_: 420 case O_BACK: 421 assert(0); 422 break; 423 /* cases where length of match is hard to find */ 424 case OQUEST_: 425 stp = stop; 426 for (;;) { 427 /* how long could this one be? */ 428 rest = walk(m, sp, stp, ss, es, false); 429 assert(rest != NULL); /* it did match */ 430 /* could the rest match the rest? */ 431 tail = walk(m, rest, stop, es, stopst, false); 432 if (tail == stop) 433 break; /* yes! */ 434 /* no -- try a shorter match for this one */ 435 stp = rest - 1; 436 assert(stp >= sp); /* it did work */ 437 } 438 ssub = ss + 1; 439 esub = es - 1; 440 /* did innards match? */ 441 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 442 dp = dissect(m, sp, rest, ssub, esub); 443 assert(dp == rest); 444 } else /* no */ 445 assert(sp == rest); 446 sp = rest; 447 break; 448 case OPLUS_: 449 stp = stop; 450 for (;;) { 451 /* how long could this one be? */ 452 rest = walk(m, sp, stp, ss, es, false); 453 assert(rest != NULL); /* it did match */ 454 /* could the rest match the rest? */ 455 tail = walk(m, rest, stop, es, stopst, false); 456 if (tail == stop) 457 break; /* yes! */ 458 /* no -- try a shorter match for this one */ 459 stp = rest - 1; 460 assert(stp >= sp); /* it did work */ 461 } 462 ssub = ss + 1; 463 esub = es - 1; 464 ssp = sp; 465 oldssp = ssp; 466 for (;;) { /* find last match of innards */ 467 sep = walk(m, ssp, rest, ssub, esub, false); 468 if (sep == NULL || sep == ssp) 469 break; /* failed or matched null */ 470 oldssp = ssp; /* on to next try */ 471 ssp = sep; 472 } 473 if (sep == NULL) { 474 /* last successful match */ 475 sep = ssp; 476 ssp = oldssp; 477 } 478 assert(sep == rest); /* must exhaust substring */ 479 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 480 dp = dissect(m, ssp, sep, ssub, esub); 481 assert(dp == sep); 482 sp = rest; 483 break; 484 case OCH_: 485 stp = stop; 486 for (;;) { 487 /* how long could this one be? */ 488 rest = walk(m, sp, stp, ss, es, false); 489 assert(rest != NULL); /* it did match */ 490 /* could the rest match the rest? */ 491 tail = walk(m, rest, stop, es, stopst, false); 492 if (tail == stop) 493 break; /* yes! */ 494 /* no -- try a shorter match for this one */ 495 stp = rest - 1; 496 assert(stp >= sp); /* it did work */ 497 } 498 ssub = ss + 1; 499 esub = ss + OPND(m->g->strip[ss]) - 1; 500 assert(OP(m->g->strip[esub]) == OOR1); 501 for (;;) { /* find first matching branch */ 502 if (walk(m, sp, rest, ssub, esub, 503 false) == rest) 504 break; /* it matched all of it */ 505 /* that one missed, try next one */ 506 assert(OP(m->g->strip[esub]) == OOR1); 507 esub++; 508 assert(OP(m->g->strip[esub]) == OOR2); 509 ssub = esub + 1; 510 esub += OPND(m->g->strip[esub]); 511 if (OP(m->g->strip[esub]) == (sop)OOR2) 512 esub--; 513 else 514 assert(OP(m->g->strip[esub]) == O_CH); 515 } 516 dp = dissect(m, sp, rest, ssub, esub); 517 assert(dp == rest); 518 sp = rest; 519 break; 520 case O_PLUS: 521 case O_QUEST: 522 case OOR1: 523 case OOR2: 524 case O_CH: 525 assert(0); 526 break; 527 case OLPAREN: 528 i = OPND(m->g->strip[ss]); 529 assert(0 < i && i <= m->g->nsub); 530 m->pmatch[i].rm_so = sp - m->offp; 531 break; 532 case ORPAREN: 533 i = OPND(m->g->strip[ss]); 534 assert(0 < i && i <= m->g->nsub); 535 m->pmatch[i].rm_eo = sp - m->offp; 536 break; 537 default: /* uh oh */ 538 assert(0); 539 break; 540 } 541 } 542 543 assert(sp == stop); 544 return (sp); 545 } 546 547 /* 548 * backref - figure out what matched what, figuring in back references 549 */ 550 static const char * 551 backref(struct match *m, const char *start, const char *stop, sopno startst, 552 sopno stopst, sopno lev, /* PLUS nesting level */ 553 int rec) 554 { 555 int i; 556 sopno ss; /* start sop of current subRE */ 557 const char *sp; /* start of string matched by it */ 558 sopno ssub; /* start sop of subsubRE */ 559 sopno esub; /* end sop of subsubRE */ 560 const char *ssp; /* start of string matched by subsubRE */ 561 const char *dp; 562 size_t len; 563 int hard; 564 sop s; 565 regoff_t offsave; 566 cset *cs; 567 wint_t wc; 568 569 AT("back", start, stop, startst, stopst); 570 sp = start; 571 572 /* get as far as we can with easy stuff */ 573 hard = 0; 574 for (ss = startst; !hard && ss < stopst; ss++) 575 switch (OP(s = m->g->strip[ss])) { 576 case OCHAR: 577 if (sp == stop) 578 return (NULL); 579 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 580 if (wc != OPND(s)) 581 return (NULL); 582 break; 583 case OANY: 584 if (sp == stop) 585 return (NULL); 586 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 587 if (wc == BADCHAR) 588 return (NULL); 589 break; 590 case OANYOF: 591 if (sp == stop) 592 return (NULL); 593 cs = &m->g->sets[OPND(s)]; 594 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 595 if (wc == BADCHAR || !CHIN(cs, wc)) 596 return (NULL); 597 break; 598 case OBOL: 599 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 600 (sp > m->offp && sp < m->endp && 601 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) { 602 break; 603 } 604 return (NULL); 605 case OEOL: 606 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 607 (sp < m->endp && *sp == '\n' && 608 (m->g->cflags®_NEWLINE))) { 609 break; 610 } 611 return (NULL); 612 case OBOW: 613 if (sp < m->endp && ISWORD(*sp) && 614 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 615 (sp > m->offp && !ISWORD(*(sp-1))))) { 616 break; 617 } 618 return (NULL); 619 case OEOW: 620 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 621 (sp < m->endp && *sp == '\n' && 622 (m->g->cflags®_NEWLINE)) || 623 (sp < m->endp && !ISWORD(*sp))) && 624 (sp > m->beginp && ISWORD(*(sp-1)))) { 625 break; 626 } 627 return (NULL); 628 case O_QUEST: 629 break; 630 case OOR1: /* matches null but needs to skip */ 631 ss++; 632 s = m->g->strip[ss]; 633 do { 634 assert(OP(s) == OOR2); 635 ss += OPND(s); 636 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 637 /* note that the ss++ gets us past the O_CH */ 638 break; 639 default: /* have to make a choice */ 640 hard = 1; 641 break; 642 } 643 if (!hard) { /* that was it! */ 644 if (sp != stop) 645 return (NULL); 646 return (sp); 647 } 648 ss--; /* adjust for the for's final increment */ 649 650 /* the hard stuff */ 651 AT("hard", sp, stop, ss, stopst); 652 s = m->g->strip[ss]; 653 switch (OP(s)) { 654 case OBACK_: /* the vilest depths */ 655 i = OPND(s); 656 assert(0 < i && i <= m->g->nsub); 657 if (m->pmatch[i].rm_eo == -1) 658 return (NULL); 659 assert(m->pmatch[i].rm_so != -1); 660 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 661 if (len == 0 && rec++ > MAX_RECURSION) 662 return (NULL); 663 assert(stop - m->beginp >= len); 664 if (sp > stop - len) 665 return (NULL); /* not enough left to match */ 666 ssp = m->offp + m->pmatch[i].rm_so; 667 if (memcmp(sp, ssp, len) != 0) 668 return (NULL); 669 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 670 ss++; 671 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 672 case OQUEST_: /* to null or not */ 673 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 674 if (dp != NULL) 675 return (dp); /* not */ 676 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 677 case OPLUS_: 678 assert(m->lastpos != NULL); 679 assert(lev+1 <= m->g->nplus); 680 m->lastpos[lev+1] = sp; 681 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 682 case O_PLUS: 683 if (sp == m->lastpos[lev]) /* last pass matched null */ 684 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 685 /* try another pass */ 686 m->lastpos[lev] = sp; 687 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 688 if (dp == NULL) 689 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 690 return (dp); 691 case OCH_: /* find the right one, if any */ 692 ssub = ss + 1; 693 esub = ss + OPND(s) - 1; 694 assert(OP(m->g->strip[esub]) == OOR1); 695 for (;;) { /* find first matching branch */ 696 dp = backref(m, sp, stop, ssub, esub, lev, rec); 697 if (dp != NULL) 698 return (dp); 699 /* that one missed, try next one */ 700 if (OP(m->g->strip[esub]) == (sop)O_CH) 701 return (NULL); /* there is none */ 702 esub++; 703 assert(OP(m->g->strip[esub]) == (sop)OOR2); 704 ssub = esub + 1; 705 esub += OPND(m->g->strip[esub]); 706 if (OP(m->g->strip[esub]) == (sop)OOR2) 707 esub--; 708 else 709 assert(OP(m->g->strip[esub]) == O_CH); 710 } 711 /* NOTREACHED */ 712 break; 713 case OLPAREN: /* must undo assignment if rest fails */ 714 i = OPND(s); 715 assert(0 < i && i <= m->g->nsub); 716 offsave = m->pmatch[i].rm_so; 717 m->pmatch[i].rm_so = sp - m->offp; 718 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 719 if (dp != NULL) 720 return (dp); 721 m->pmatch[i].rm_so = offsave; 722 return (NULL); 723 case ORPAREN: /* must undo assignment if rest fails */ 724 i = OPND(s); 725 assert(0 < i && i <= m->g->nsub); 726 offsave = m->pmatch[i].rm_eo; 727 m->pmatch[i].rm_eo = sp - m->offp; 728 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 729 if (dp != NULL) 730 return (dp); 731 m->pmatch[i].rm_eo = offsave; 732 return (NULL); 733 default: /* uh oh */ 734 assert(0); 735 break; 736 } 737 738 /* "can't happen" */ 739 assert(0); 740 return (NULL); 741 } 742 743 /* 744 * Step through the string either quickly or slowly. Returns where it ended 745 * or NULL. 746 */ 747 static const char * 748 walk(struct match *m, const char *start, const char *stop, sopno startst, 749 sopno stopst, bool fast) 750 { 751 states st = m->st; 752 states fresh = m->fresh; 753 states empty = m->empty; 754 states tmp = m->tmp; 755 const char *p = start; 756 wint_t c; 757 wint_t lastc; /* previous c */ 758 wint_t flagch; 759 int i; 760 const char *matchp; /* last p at which a match ended */ 761 size_t clen; 762 763 AT("slow", start, stop, startst, stopst); 764 CLEAR(st); 765 SET1(st, startst); 766 SP("sstart", st, *p); 767 st = step(m->g, startst, stopst, st, NOTHING, st); 768 if (fast) 769 ASSIGN(fresh, st); 770 matchp = NULL; 771 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 772 c = OUT; 773 else { 774 /* 775 * XXX Wrong if the previous character was multi-byte. 776 * Newline never is (in encodings supported by FreeBSD), 777 * so this only breaks the ISWORD tests below. 778 */ 779 c = (uch)*(start - 1); 780 } 781 for (;;) { 782 /* next character */ 783 lastc = c; 784 if (p == m->endp) { 785 c = OUT; 786 clen = 0; 787 } else 788 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 789 790 if (fast && EQ(st, fresh)) 791 matchp = p; 792 793 /* is there an EOL and/or BOL between lastc and c? */ 794 flagch = '\0'; 795 i = 0; 796 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 797 (lastc == OUT && !(m->eflags®_NOTBOL))) { 798 flagch = BOL; 799 i = m->g->nbol; 800 } 801 if ((c == '\n' && m->g->cflags®_NEWLINE) || 802 (c == OUT && !(m->eflags®_NOTEOL))) { 803 flagch = (flagch == BOL) ? BOLEOL : EOL; 804 i += m->g->neol; 805 } 806 if (i != 0) { 807 for (; i > 0; i--) 808 st = step(m->g, startst, stopst, st, 809 flagch, st); 810 SP("sboleol", st, c); 811 } 812 813 /* how about a word boundary? */ 814 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 815 (c != OUT && ISWORD(c))) { 816 flagch = BOW; 817 } 818 if ((lastc != OUT && ISWORD(lastc)) && 819 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 820 flagch = EOW; 821 } 822 if (flagch == BOW || flagch == EOW) { 823 st = step(m->g, startst, stopst, st, flagch, st); 824 SP("sboweow", st, c); 825 } 826 827 /* are we done? */ 828 if (ISSET(st, stopst)) { 829 if (fast) 830 break; 831 else 832 matchp = p; 833 } 834 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 835 break; /* NOTE BREAK OUT */ 836 837 /* no, we must deal with this character */ 838 ASSIGN(tmp, st); 839 if (fast) 840 ASSIGN(st, fresh); 841 else 842 ASSIGN(st, empty); 843 assert(c != OUT); 844 st = step(m->g, startst, stopst, tmp, c, st); 845 SP("saft", st, c); 846 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 847 p += clen; 848 } 849 850 if (fast) { 851 assert(matchp != NULL); 852 m->coldp = matchp; 853 if (ISSET(st, stopst)) 854 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 855 else 856 return (NULL); 857 } else 858 return (matchp); 859 } 860 861 /* 862 * step - map set of states reachable before char to set reachable after 863 */ 864 static states 865 step(struct re_guts *g, 866 sopno start, /* start state within strip */ 867 sopno stop, /* state after stop state within strip */ 868 states bef, /* states reachable before */ 869 wint_t ch, /* character or NONCHAR code */ 870 states aft) /* states already known reachable after */ 871 { 872 cset *cs; 873 sop s; 874 sopno pc; 875 onestate here; /* note, macros know this name */ 876 sopno look; 877 int i; 878 879 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 880 s = g->strip[pc]; 881 switch (OP(s)) { 882 case OEND: 883 assert(pc == stop-1); 884 break; 885 case OCHAR: 886 /* only characters can match */ 887 assert(!NONCHAR(ch) || ch != OPND(s)); 888 if (ch == OPND(s)) 889 FWD(aft, bef, 1); 890 break; 891 case OBOL: 892 if (ch == BOL || ch == BOLEOL) 893 FWD(aft, bef, 1); 894 break; 895 case OEOL: 896 if (ch == EOL || ch == BOLEOL) 897 FWD(aft, bef, 1); 898 break; 899 case OBOW: 900 if (ch == BOW) 901 FWD(aft, bef, 1); 902 break; 903 case OEOW: 904 if (ch == EOW) 905 FWD(aft, bef, 1); 906 break; 907 case OANY: 908 if (!NONCHAR(ch)) 909 FWD(aft, bef, 1); 910 break; 911 case OANYOF: 912 cs = &g->sets[OPND(s)]; 913 if (!NONCHAR(ch) && CHIN(cs, ch)) 914 FWD(aft, bef, 1); 915 break; 916 case OBACK_: /* ignored here */ 917 case O_BACK: 918 FWD(aft, aft, 1); 919 break; 920 case OPLUS_: /* forward, this is just an empty */ 921 FWD(aft, aft, 1); 922 break; 923 case O_PLUS: /* both forward and back */ 924 FWD(aft, aft, 1); 925 i = ISSETBACK(aft, OPND(s)); 926 BACK(aft, aft, OPND(s)); 927 if (!i && ISSETBACK(aft, OPND(s))) { 928 /* oho, must reconsider loop body */ 929 pc -= OPND(s) + 1; 930 INIT(here, pc); 931 } 932 break; 933 case OQUEST_: /* two branches, both forward */ 934 FWD(aft, aft, 1); 935 FWD(aft, aft, OPND(s)); 936 break; 937 case O_QUEST: /* just an empty */ 938 FWD(aft, aft, 1); 939 break; 940 case OLPAREN: /* not significant here */ 941 case ORPAREN: 942 FWD(aft, aft, 1); 943 break; 944 case OCH_: /* mark the first two branches */ 945 FWD(aft, aft, 1); 946 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 947 FWD(aft, aft, OPND(s)); 948 break; 949 case OOR1: /* done a branch, find the O_CH */ 950 if (ISSTATEIN(aft, here)) { 951 for (look = 1; 952 OP(s = g->strip[pc+look]) != (sop)O_CH; 953 look += OPND(s)) 954 assert(OP(s) == (sop)OOR2); 955 FWD(aft, aft, look + 1); 956 } 957 break; 958 case OOR2: /* propagate OCH_'s marking */ 959 FWD(aft, aft, 1); 960 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 961 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 962 FWD(aft, aft, OPND(s)); 963 } 964 break; 965 case O_CH: /* just empty */ 966 FWD(aft, aft, 1); 967 break; 968 default: /* ooooops... */ 969 assert(0); 970 break; 971 } 972 } 973 974 return (aft); 975 } 976 977 #ifdef REDEBUG 978 /* 979 * print - print a set of states 980 */ 981 static void 982 print(struct match *m, const char *caption, states st, int ch, FILE *d) 983 { 984 struct re_guts *g = m->g; 985 sopno i; 986 int first = 1; 987 988 if (!(m->eflags®_TRACE)) 989 return; 990 991 (void) fprintf(d, "%s", caption); 992 if (ch != '\0') 993 (void) fprintf(d, " %s", pchar(ch)); 994 for (i = 0; i < g->nstates; i++) 995 if (ISSET(st, i)) { 996 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 997 first = 0; 998 } 999 (void) fprintf(d, "\n"); 1000 } 1001 1002 /* 1003 * at - print current situation 1004 */ 1005 static void 1006 at(struct match *m, const char *title, const char *start, const char *stop, 1007 sopno startst, sopno stopst) 1008 { 1009 if (!(m->eflags®_TRACE)) 1010 return; 1011 1012 (void) printf("%s %s-", title, pchar(*start)); 1013 (void) printf("%s ", pchar(*stop)); 1014 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1015 } 1016 1017 #ifndef PCHARDONE 1018 #define PCHARDONE /* never again */ 1019 /* 1020 * pchar - make a character printable 1021 * 1022 * Is this identical to regchar() over in debug.c? Well, yes. But a 1023 * duplicate here avoids having a debugging-capable regexec.o tied to 1024 * a matching debug.o, and this is convenient. It all disappears in 1025 * the non-debug compilation anyway, so it doesn't matter much. 1026 */ 1027 static const char * 1028 pchar(int ch) 1029 { 1030 static char pbuf[10]; 1031 1032 if (isprint((uch)ch) || ch == ' ') 1033 (void) sprintf(pbuf, "%c", ch); 1034 else 1035 (void) sprintf(pbuf, "\\%o", ch); 1036 return (pbuf); 1037 } 1038 #endif 1039 #endif 1040 1041 #undef matcher 1042 #undef walk 1043 #undef dissect 1044 #undef backref 1045 #undef step 1046 #undef print 1047 #undef at 1048 #undef match