Print this page
9083 replace regex implementation with tre

*** 1,225 **** /* ! * Copyright 2010 Nexenta Systems, Inc. All rights reserved. ! * Copyright (c) 1992, 1993, 1994 Henry Spencer. ! * Copyright (c) 1992, 1993, 1994 ! * The Regents of the University of California. All rights reserved. * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * ! * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ! * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ! * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ! * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ! * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ! * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ! * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ! * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ! * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ! * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ! * SUCH DAMAGE. */ /* ! * the outer shell of regexec() ! * ! * This file includes engine.c three times, after muchos fiddling with the ! * macros that code uses. This lets the same code operate on two different ! * representations for state sets and characters. ! */ ! #include "lint.h" ! #include "file64.h" ! #include <sys/types.h> ! #include <stdio.h> #include <stdlib.h> #include <string.h> - #include <limits.h> - #include <ctype.h> - #include <regex.h> #include <wchar.h> #include <wctype.h> - #include <note.h> - #include <assert.h> ! #include "utils.h" ! #include "regex2.h" ! /* we want _NOTE, but not NOTE (which collides with our own use) */ ! #undef NOTE ! ! static size_t ! xmbrtowc(wint_t *wi, const char *s, size_t n, mbstate_t *mbs, wint_t dummy) { ! size_t nr; ! wchar_t wc; ! nr = mbrtowc(&wc, s, n, mbs); ! if (wi != NULL) ! *wi = wc; ! if (nr == 0) ! return (1); ! else if (nr == (size_t)-1 || nr == (size_t)-2) { ! (void) memset(mbs, 0, sizeof (*mbs)); ! if (wi != NULL) ! *wi = dummy; ! return (1); ! } else ! return (nr); } ! static size_t ! xmbrtowc_dummy(wint_t *wi, const char *s, size_t n, mbstate_t *mbs, ! wint_t dummy) { ! _NOTE(ARGUNUSED(n)); ! _NOTE(ARGUNUSED(mbs)); ! _NOTE(ARGUNUSED(dummy)); ! if (wi != NULL) ! *wi = (unsigned char)*s; ! return (1); ! } ! /* macros for manipulating states, small version */ ! #define states long ! #define states1 states /* for later use in regexec() decision */ ! #define CLEAR(v) ((v) = 0) ! #define SET0(v, n) ((v) &= ~((unsigned long)1 << (n))) ! #define SET1(v, n) ((v) |= (unsigned long)1 << (n)) ! #define ISSET(v, n) (((v) & ((unsigned long)1 << (n))) != 0) ! #define ASSIGN(d, s) ((d) = (s)) ! #define EQ(a, b) ((a) == (b)) ! #define STATEVARS long dummy /* dummy version */ ! #define STATESETUP(m, n) /* nothing */ ! #define STATETEARDOWN(m) /* nothing */ ! #define SETUP(v) ((v) = 0) ! #define onestate long ! #define INIT(o, n) ((o) = (unsigned long)1 << (n)) ! #define INC(o) ((o) <<= 1) ! #define ISSTATEIN(v, o) (((v) & (o)) != 0) ! /* some abbreviations; note that some of these know variable names! */ ! /* do "if I'm here, I can also be there" etc without branches */ ! #define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n)) ! #define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n)) ! #define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0) ! /* no multibyte support */ ! #define XMBRTOWC xmbrtowc_dummy ! #define ZAPSTATE(mbs) ((void)(mbs)) ! /* function names */ ! #define SNAMES /* engine.c looks after details */ ! #include "engine.c" ! /* now undo things */ ! #undef states ! #undef CLEAR ! #undef SET0 ! #undef SET1 ! #undef ISSET ! #undef ASSIGN ! #undef EQ ! #undef STATEVARS ! #undef STATESETUP ! #undef STATETEARDOWN ! #undef SETUP ! #undef onestate ! #undef INIT ! #undef INC ! #undef ISSTATEIN ! #undef FWD ! #undef BACK ! #undef ISSETBACK ! #undef SNAMES ! #undef XMBRTOWC ! #undef ZAPSTATE ! /* macros for manipulating states, large version */ ! #define states char * ! #define CLEAR(v) (void) memset(v, 0, m->g->nstates) ! #define SET0(v, n) ((v)[n] = 0) ! #define SET1(v, n) ((v)[n] = 1) ! #define ISSET(v, n) ((v)[n]) ! #define ASSIGN(d, s) (void) memcpy(d, s, m->g->nstates) ! #define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0) ! #define STATEVARS long vn; char *space ! #define STATESETUP(m, nv) { (m)->space = malloc((nv)*(m)->g->nstates); \ ! if ((m)->space == NULL) \ ! return (REG_ESPACE); \ ! (m)->vn = 0; } ! #define STATETEARDOWN(m) { free((m)->space); } ! #define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates]) ! #define onestate long ! #define INIT(o, n) ((o) = (n)) ! #define INC(o) ((o)++) ! #define ISSTATEIN(v, o) ((v)[o]) ! /* some abbreviations; note that some of these know variable names! */ ! /* do "if I'm here, I can also be there" etc without branches */ ! #define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here]) ! #define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here]) ! #define ISSETBACK(v, n) ((v)[here - (n)]) ! /* no multibyte support */ ! #define XMBRTOWC xmbrtowc_dummy ! #define ZAPSTATE(mbs) ((void)(mbs)) ! /* function names */ ! #define LNAMES /* flag */ ! #include "engine.c" ! /* multibyte character & large states version */ ! #undef LNAMES ! #undef XMBRTOWC ! #undef ZAPSTATE ! #define XMBRTOWC xmbrtowc ! #define ZAPSTATE(mbs) (void) memset((mbs), 0, sizeof (*(mbs))) ! #define MNAMES ! #include "engine.c" /* ! * regexec - interface for matching ! * ! * We put this here so we can exploit knowledge of the state representation ! * when choosing which matcher to call. Also, by this point the matchers ! * have been prototyped. ! */ ! int /* 0 success, REG_NOMATCH failure */ ! regexec(const regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD string, ! size_t nmatch, regmatch_t pmatch[_RESTRICT_KYWD], int eflags) { ! struct re_guts *g = preg->re_g; ! #ifdef REDEBUG ! #define GOODFLAGS(f) (f) #else ! #ifdef REG_STARTEND ! #define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND)) ! #else ! #define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL)) #endif ! #endif ! if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) ! return (REG_BADPAT); ! assert(!(g->iflags&BAD)); ! if (g->iflags&BAD) /* backstop for no-debug case */ ! return (REG_BADPAT); ! eflags = GOODFLAGS(eflags); ! if (MB_CUR_MAX > 1) ! return (mmatcher(g, string, nmatch, pmatch, eflags)); ! else if (g->nstates <= CHAR_BIT*sizeof (states1) && !(eflags&REG_LARGE)) ! return (smatcher(g, string, nmatch, pmatch, eflags)); ! else ! return (lmatcher(g, string, nmatch, pmatch, eflags)); } --- 1,303 ---- /* ! * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> ! * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. + * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * ! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ! * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ! * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* ! tre_regexec.c - TRE POSIX compatible matching functions (and more). ! */ ! ! #include <assert.h> ! #include <limits.h> #include <stdlib.h> #include <string.h> #include <wchar.h> #include <wctype.h> ! #include "tre-internal.h" ! #include "tre-match-utils.h" ! #include "tre.h" ! /* For each tre_last_matched_t in the lm array, find the last matched branch by ! comparing the touch value of the cmp_tag's. For all other branches, reset ! the corresponding tags. If reset_all is non-zero, reset all tags in all ! branches. Recurse into the nested last matched structures, clearing tags as ! apprpriate. */ ! static void ! tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm, ! int n, int start_tag, int reset_all) { ! int max, i, reset; ! tre_last_matched_branch_t *b; ! DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n", ! n, start_tag, reset_all)); ! for (; n-- > 0; lm++) ! { ! if (lm->n_branches == 1) ! { ! b = lm->branches; ! if (start_tag > 0) ! { ! DPRINT((" b->cmp_tag=%d %d <? %d\n", b->cmp_tag, ! tre_tag_touch_get(tags, b->cmp_tag), ! tre_tag_touch_get(tags, start_tag))); ! reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < ! tre_tag_touch_get(tags, start_tag)); ! } ! else ! reset = 0; ! ! if (reset) ! { ! int *t; ! ! for (i = b->n_tags, t = b->tags; i > 0; i--, t++) ! { ! DPRINT((" Resetting t%d\n", *t)); ! tre_tag_reset(tags, *t); ! } ! } ! if (b->n_last_matched > 0) ! tre_reset_last_matched_branches(tags, b->last_matched, ! b->n_last_matched, ! lm->start_tag, reset); ! } ! else ! { ! if (!reset_all) ! { ! #ifdef TRE_DEBUG ! int last; ! #endif /* TRE_DEBUG */ ! max = 0; ! for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) ! { ! int t = b->cmp_tag; ! int touch = tre_tag_touch_get(tags, t); ! if (touch > max) ! { ! max = touch; ! #ifdef TRE_DEBUG ! last = t; ! #endif /* TRE_DEBUG */ ! } ! } ! DPRINT((" Last touched end tag t%d=%d\n", last, max)); ! } ! ! for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) ! { ! reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max); ! if (reset) ! { ! int j; ! int *t; ! ! for (j = b->n_tags, t = b->tags; j > 0; j--, t++) ! { ! DPRINT((" Resetting t%d\n", *t)); ! tre_tag_reset(tags, *t); ! } ! } ! if (b->n_last_matched > 0) ! tre_reset_last_matched_branches(tags, b->last_matched, ! b->n_last_matched, ! lm->start_tag, reset); ! } ! } ! } } ! /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match ! endpoint values. */ ! reg_errcode_t ! tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, ! const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo) { ! unsigned int i; ! if (cflags & REG_NOSUB) return REG_OK; ! i = 0; ! if (match_eo >= 0 && intags) ! { ! const tre_tag_t *tags = intags; ! tre_submatch_data_t *submatch_data; ! if (tnfa->last_matched_branch && ! tnfa->last_matched_branch->n_last_matched > 0) ! { ! tre_tag_t *t; ! t = malloc(sizeof(*t) * tnfa->num_tags); ! if (!t) return REG_ESPACE; ! memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t)); ! tre_reset_last_matched_branches(t, ! tnfa->last_matched_branch->last_matched, ! tnfa->last_matched_branch->n_last_matched, ! 0, 0); ! tags = t; ! } ! /* Construct submatch offsets from the tags. */ ! DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); ! submatch_data = tnfa->submatch_data; ! while (i < tnfa->num_submatches && i < nmatch) ! { ! if (submatch_data[i].so_tag == tnfa->end_tag) ! pmatch[i].rm_so = match_eo; ! else ! pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag); ! if (submatch_data[i].eo_tag == tnfa->end_tag) ! pmatch[i].rm_eo = match_eo; ! else ! pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag); ! /* If either of the endpoints were not used, this submatch ! was not part of the match. */ ! if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) ! pmatch[i].rm_so = pmatch[i].rm_eo = -1; ! DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i, ! submatch_data[i].so_tag, pmatch[i].rm_so, ! submatch_data[i].eo_tag, pmatch[i].rm_eo)); ! i++; ! } ! if (tags != intags) free(__DECONST(tre_tag_t *,tags)); ! } ! while (i < nmatch) ! { ! pmatch[i].rm_so = -1; ! pmatch[i].rm_eo = -1; ! i++; ! } ! return REG_OK; ! } /* ! Wrapper functions for POSIX compatible regexp matching. ! */ ! ! int ! tre_have_backrefs(const regex_t *preg) { ! tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; ! return tnfa->have_backrefs; ! } ! ! static int ! tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, ! tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], ! int eflags) ! { ! reg_errcode_t status; ! tre_tag_t *tags = NULL; ! int eo; ! size_t offset = 0, count = 0; ! if (tnfa->num_tags > 0 && nmatch > 0) ! { ! tags = malloc(sizeof(*tags) * tnfa->num_tags); ! if (tags == NULL) ! return REG_ESPACE; ! } ! ! if ( ! (eflags & REG_STARTEND) && pmatch) ! { ! if (pmatch->rm_so < 0) ! return REG_INVARG; ! if (len == (size_t)-1) ! { ! if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo) ! return REG_INVARG; ! len = pmatch->rm_eo - pmatch->rm_so; ! } ! count = offset = pmatch->rm_so; ! if (type == STR_WIDE) offset *= sizeof(wchar_t); ! } ! ! /* Dispatch to the appropriate matcher. */ ! #ifdef REG_BACKTRACKING_MATCHER ! if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) #else ! if (tnfa->have_backrefs) #endif ! { ! /* The regex has back references, use the backtracking matcher. */ ! status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type, ! tags, eflags, &eo); ! } ! else ! { ! /* Exact matching, no back references, use the parallel matcher. */ ! status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type, ! tags, eflags, &eo); ! } ! if (status == REG_OK) ! { ! /* A match was found, so fill the submatch registers. */ ! status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); ! /* If doing REG_STARTEND, adjust the pmatch array (we can't build ! this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls ! tre_fill_pmatch itself). */ ! if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && ! (eflags & REG_STARTEND) && pmatch && nmatch > 0) ! { ! size_t i; ! regmatch_t *p; ! for (i = nmatch, p = pmatch; i > 0; p++, i--) ! { ! if (p->rm_so >= 0) p->rm_so += count; ! if (p->rm_eo >= 0) p->rm_eo += count; ! } ! } ! } ! if (tags) ! free(tags); ! return status; ! } ! int ! tre_regnexec(const regex_t *preg, const char *str, size_t len, ! size_t nmatch, regmatch_t pmatch[], int eflags) ! { ! tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; ! tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; ! ! if (preg->re_magic != RE_MAGIC) return REG_BADPAT; ! ! return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); ! } ! ! int ! regexec(const regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD str, ! size_t nmatch, regmatch_t pmatch[_RESTRICT_KYWD], int eflags) ! { ! return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); }