1 /* 2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 tre_regexec.c - TRE POSIX compatible matching functions (and more). 31 */ 32 33 #include <assert.h> 34 #include <limits.h> 35 #include <stdlib.h> 36 #include <string.h> 37 #include <wchar.h> 38 #include <wctype.h> 39 40 #include "tre-internal.h" 41 #include "tre-match-utils.h" 42 #include "tre.h" 43 44 /* For each tre_last_matched_t in the lm array, find the last matched branch by 45 comparing the touch value of the cmp_tag's. For all other branches, reset 46 the corresponding tags. If reset_all is non-zero, reset all tags in all 47 branches. Recurse into the nested last matched structures, clearing tags as 48 apprpriate. */ 49 static void 50 tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm, 51 int n, int start_tag, int reset_all) 52 { 53 int max, i, reset; 54 tre_last_matched_branch_t *b; 55 56 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n", 57 n, start_tag, reset_all)); 58 for (; n-- > 0; lm++) 59 { 60 if (lm->n_branches == 1) 61 { 62 b = lm->branches; 63 if (start_tag > 0) 64 { 65 DPRINT((" b->cmp_tag=%d %d <? %d\n", b->cmp_tag, 66 tre_tag_touch_get(tags, b->cmp_tag), 67 tre_tag_touch_get(tags, start_tag))); 68 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < 69 tre_tag_touch_get(tags, start_tag)); 70 } 71 else 72 reset = 0; 73 74 if (reset) 75 { 76 int *t; 77 78 for (i = b->n_tags, t = b->tags; i > 0; i--, t++) 79 { 80 DPRINT((" Resetting t%d\n", *t)); 81 tre_tag_reset(tags, *t); 82 } 83 } 84 if (b->n_last_matched > 0) 85 tre_reset_last_matched_branches(tags, b->last_matched, 86 b->n_last_matched, 87 lm->start_tag, reset); 88 } 89 else 90 { 91 if (!reset_all) 92 { 93 #ifdef TRE_DEBUG 94 int last; 95 #endif /* TRE_DEBUG */ 96 max = 0; 97 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) 98 { 99 int t = b->cmp_tag; 100 int touch = tre_tag_touch_get(tags, t); 101 if (touch > max) 102 { 103 max = touch; 104 #ifdef TRE_DEBUG 105 last = t; 106 #endif /* TRE_DEBUG */ 107 } 108 } 109 DPRINT((" Last touched end tag t%d=%d\n", last, max)); 110 } 111 112 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++) 113 { 114 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max); 115 if (reset) 116 { 117 int j; 118 int *t; 119 120 for (j = b->n_tags, t = b->tags; j > 0; j--, t++) 121 { 122 DPRINT((" Resetting t%d\n", *t)); 123 tre_tag_reset(tags, *t); 124 } 125 } 126 if (b->n_last_matched > 0) 127 tre_reset_last_matched_branches(tags, b->last_matched, 128 b->n_last_matched, 129 lm->start_tag, reset); 130 } 131 } 132 } 133 } 134 135 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match 136 endpoint values. */ 137 reg_errcode_t 138 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 139 const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo) 140 { 141 unsigned int i; 142 143 if (cflags & REG_NOSUB) return REG_OK; 144 145 i = 0; 146 if (match_eo >= 0 && intags) 147 { 148 const tre_tag_t *tags = intags; 149 tre_submatch_data_t *submatch_data; 150 151 if (tnfa->last_matched_branch && 152 tnfa->last_matched_branch->n_last_matched > 0) 153 { 154 tre_tag_t *t; 155 t = malloc(sizeof(*t) * tnfa->num_tags); 156 if (!t) return REG_ESPACE; 157 memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t)); 158 tre_reset_last_matched_branches(t, 159 tnfa->last_matched_branch->last_matched, 160 tnfa->last_matched_branch->n_last_matched, 161 0, 0); 162 tags = t; 163 } 164 /* Construct submatch offsets from the tags. */ 165 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); 166 submatch_data = tnfa->submatch_data; 167 while (i < tnfa->num_submatches && i < nmatch) 168 { 169 if (submatch_data[i].so_tag == tnfa->end_tag) 170 pmatch[i].rm_so = match_eo; 171 else 172 pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag); 173 174 if (submatch_data[i].eo_tag == tnfa->end_tag) 175 pmatch[i].rm_eo = match_eo; 176 else 177 pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag); 178 179 /* If either of the endpoints were not used, this submatch 180 was not part of the match. */ 181 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) 182 pmatch[i].rm_so = pmatch[i].rm_eo = -1; 183 184 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i, 185 submatch_data[i].so_tag, pmatch[i].rm_so, 186 submatch_data[i].eo_tag, pmatch[i].rm_eo)); 187 i++; 188 } 189 if (tags != intags) free(__DECONST(tre_tag_t *,tags)); 190 } 191 192 while (i < nmatch) 193 { 194 pmatch[i].rm_so = -1; 195 pmatch[i].rm_eo = -1; 196 i++; 197 } 198 199 return REG_OK; 200 } 201 202 /* 203 Wrapper functions for POSIX compatible regexp matching. 204 */ 205 206 int 207 tre_have_backrefs(const regex_t *preg) 208 { 209 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 210 return tnfa->have_backrefs; 211 } 212 213 static int 214 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, 215 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], 216 int eflags) 217 { 218 reg_errcode_t status; 219 tre_tag_t *tags = NULL; 220 int eo; 221 size_t offset = 0, count = 0; 222 if (tnfa->num_tags > 0 && nmatch > 0) 223 { 224 tags = malloc(sizeof(*tags) * tnfa->num_tags); 225 if (tags == NULL) 226 return REG_ESPACE; 227 } 228 229 if ( 230 (eflags & REG_STARTEND) && pmatch) 231 { 232 if (pmatch->rm_so < 0) 233 return REG_INVARG; 234 if (len == (size_t)-1) 235 { 236 if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo) 237 return REG_INVARG; 238 len = pmatch->rm_eo - pmatch->rm_so; 239 } 240 count = offset = pmatch->rm_so; 241 if (type == STR_WIDE) offset *= sizeof(wchar_t); 242 } 243 244 /* Dispatch to the appropriate matcher. */ 245 #ifdef REG_BACKTRACKING_MATCHER 246 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) 247 #else 248 if (tnfa->have_backrefs) 249 #endif 250 { 251 /* The regex has back references, use the backtracking matcher. */ 252 status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type, 253 tags, eflags, &eo); 254 } 255 else 256 { 257 /* Exact matching, no back references, use the parallel matcher. */ 258 status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type, 259 tags, eflags, &eo); 260 } 261 262 if (status == REG_OK) 263 { 264 /* A match was found, so fill the submatch registers. */ 265 status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); 266 /* If doing REG_STARTEND, adjust the pmatch array (we can't build 267 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls 268 tre_fill_pmatch itself). */ 269 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) && 270 (eflags & REG_STARTEND) && pmatch && nmatch > 0) 271 { 272 size_t i; 273 regmatch_t *p; 274 for (i = nmatch, p = pmatch; i > 0; p++, i--) 275 { 276 if (p->rm_so >= 0) p->rm_so += count; 277 if (p->rm_eo >= 0) p->rm_eo += count; 278 } 279 } 280 } 281 if (tags) 282 free(tags); 283 return status; 284 } 285 286 int 287 tre_regnexec(const regex_t *preg, const char *str, size_t len, 288 size_t nmatch, regmatch_t pmatch[], int eflags) 289 { 290 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; 291 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS; 292 293 if (preg->re_magic != RE_MAGIC) return REG_BADPAT; 294 295 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); 296 } 297 298 int 299 regexec(const regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD str, 300 size_t nmatch, regmatch_t pmatch[_RESTRICT_KYWD], int eflags) 301 { 302 return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags); 303 }