1 /* 2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 tre-match-utils.h - TRE matcher helper definitions 31 */ 32 33 #include "tre-internal.h" 34 35 #define str_source ((const tre_str_source*)string) 36 37 /* 38 * Because all multibyte encodings are exclusively single-shift encoding, 39 * with the shift codes having the high bit set, we can make an optimization 40 * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character 41 * is detected, and just do a direct copy for ASCII characters. 42 */ 43 #define GET_NEXT_WCHAR() \ 44 do { \ 45 prev_c = next_c; \ 46 switch (type) \ 47 { \ 48 case STR_BYTE: \ 49 pos++; \ 50 if (len >= 0 && pos >= len) \ 51 next_c = '\0'; \ 52 else \ 53 next_c = (unsigned char)(*str_byte++); \ 54 break; \ 55 case STR_WIDE: \ 56 pos++; \ 57 if (len >= 0 && pos >= len) \ 58 next_c = L'\0'; \ 59 else \ 60 next_c = *str_wide++; \ 61 break; \ 62 case STR_MBS: \ 63 pos += pos_add_next; \ 64 if (__builtin_expect(len >= 0 && pos >= len, 0)) \ 65 { \ 66 next_c = L'\0'; \ 67 pos_add_next = 1; \ 68 } \ 69 else if (__builtin_expect(!(*str_byte & 0x80), 1)) \ 70 { \ 71 next_c = (unsigned char)(*str_byte++); \ 72 pos_add_next = 1; \ 73 } \ 74 else \ 75 { \ 76 size_t w; \ 77 int max; \ 78 if (len >= 0) \ 79 max = len - pos; \ 80 else \ 81 max = 32; \ 82 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \ 83 tnfa->loc); \ 84 if (w == (size_t)-1 || w == (size_t)-2) \ 85 return REG_NOMATCH; \ 86 if (w == 0 && len >= 0) \ 87 { \ 88 pos_add_next = 1; \ 89 next_c = 0; \ 90 str_byte++; \ 91 } \ 92 else \ 93 { \ 94 pos_add_next = w; \ 95 str_byte += w; \ 96 } \ 97 } \ 98 break; \ 99 } \ 100 } while(/*CONSTCOND*/0) 101 102 /* Assumes tre_tnfa_t *tnfa in scope */ 103 #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc)) 104 105 #define CHECK_ASSERTIONS(assertions) \ 106 (((assertions & ASSERT_AT_BOL) \ 107 && (pos > 0 || reg_notbol) \ 108 && (prev_c != L'\n' || !reg_newline)) \ 109 || ((assertions & ASSERT_AT_EOL) \ 110 && (next_c != L'\0' || reg_noteol) \ 111 && (next_c != L'\n' || !reg_newline)) \ 112 || ((assertions & ASSERT_AT_BOW) \ 113 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ 114 || ((assertions & ASSERT_AT_EOW) \ 115 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ 116 || ((assertions & ASSERT_AT_WB) \ 117 && (pos != 0 && next_c != L'\0' \ 118 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ 119 || ((assertions & ASSERT_AT_WB_NEG) \ 120 && (pos == 0 || next_c == L'\0' \ 121 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) 122 123 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ 124 ((trans_i->assertions & ASSERT_BRACKET_MATCH) \ 125 && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \ 126 tnfa)) 127 128 inline static int 129 tre_tag_get(const tre_tag_t *tags, int i) 130 { 131 tags += i; 132 return tags->count > 0 ? tags->value : -1; 133 } 134 135 inline static void 136 tre_tag_set(tre_tag_t *tags, int i, int val, int touch) 137 { 138 tags += i; 139 if (tags->count++ == 0) 140 tags->first = val; 141 tags->value = val; 142 tags->touch = touch; 143 } 144 145 inline static void 146 tre_tag_reset(tre_tag_t *tags, int i) 147 { 148 tags[i].count = 0; 149 } 150 151 inline static int 152 tre_tag_touch_get(const tre_tag_t *tags, int i) 153 { 154 return tags[i].touch; 155 } 156 157 #ifdef TRE_DEBUG 158 inline static void 159 tre_print_tags(const tre_tag_t *tags, int num_tags) 160 { 161 int i; 162 for (i = 0; i < num_tags; i++, tags++) 163 { 164 switch(tags->count) 165 { 166 case 0: 167 DPRINT(("%d:(0,-1)", i)); 168 break; 169 case 1: 170 DPRINT(("%d:(1,%d)", i, tags->first)); 171 break; 172 default: 173 DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first, 174 tags->value)); 175 break; 176 } 177 if (i < (num_tags - 1)) 178 DPRINT((" ")); 179 } 180 } 181 182 inline static void 183 tre_print_tags_all(const tre_tag_t *tags, int num_tags) 184 { 185 int i; 186 for (i = 0; i < num_tags; i++, tags++) 187 { 188 switch(tags->count) 189 { 190 case 0: 191 DPRINT(("%d:(0,-1)/%d", i, tags->touch)); 192 break; 193 case 1: 194 DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch)); 195 break; 196 default: 197 DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first, 198 tags->value, tags->touch)); 199 break; 200 } 201 if (i < (num_tags - 1)) 202 DPRINT((" ")); 203 } 204 } 205 #endif /* TRE_DEBUG */ 206 207 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal 208 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */ 209 inline static int 210 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1, 211 const tre_tag_t *tags2) 212 { 213 const tre_tag_t *t1, *t2; 214 215 t1 = tags1 + start; 216 t2 = tags2 + start; 217 /* We need both start tags to be set */ 218 if (t1->count == 0 || t2->count == 0) 219 return 0; 220 221 /* The start tags must be equal */ 222 if (t1->value != t2->value) 223 return 0; 224 225 t1 = tags1 + end; 226 t2 = tags2 + end; 227 /* For the end tags, we prefer set over unset, because unset means that 228 * the end tag is still growing */ 229 if (t1->count == 0) 230 { 231 /* if t2 is set, t1 loses since it is unset */ 232 if (t2->count != 0) 233 return -1; 234 } 235 /* if t2 not set, t1 wins since it is set */ 236 else if (t2->count == 0) 237 return 1; 238 239 /* least current value wins */ 240 return t2->value - t1->value; 241 } 242 243 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare 244 * (t1 loses if < 0, t1 wins if > 0) */ 245 inline static int 246 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1, 247 const tre_tag_t *t2) 248 { 249 int diff; 250 251 t1 += i; 252 t2 += i; 253 switch (dir) 254 { 255 case TRE_TAG_MINIMIZE: 256 /* least current value wins (because tags are initialized to all zeros, 257 * unset wins over set; also, tre_minimal_tag_order() will have already 258 * been run, which checks for being unset) */ 259 return t2->value - t1->value; 260 261 case TRE_TAG_MAXIMIZE: 262 /* prefer set */ 263 if (t1->count == 0) 264 { 265 /* if neither t1 and t2 are set, try next tag */ 266 if (t2->count == 0) 267 return 0; 268 /* t2 is set, t1 loses since it is unset */ 269 return -1; 270 } 271 /* if t2 not set, t1 wins since it is set */ 272 else if (t2->count == 0) 273 return 1; 274 /* greatest initial value wins */ 275 if ((diff = t1->first - t2->first) != 0) 276 return diff; 277 /* least number of times the tag was set, wins */ 278 if ((diff = t2->count - t1->count) != 0) 279 return diff; 280 /* if the tags were only set once, they only have initial values */ 281 if (t1->count == 1) 282 return 0; 283 /* greatest current value wins */ 284 return t1->value - t2->value; 285 286 case TRE_TAG_LEFT_MAXIMIZE: 287 /* prefer set */ 288 if (t1->count == 0) 289 { 290 /* if neither t1 and t2 are set, try next tag */ 291 if (t2->count == 0) 292 return 0; 293 /* t2 is set, t1 loses since it is unset */ 294 return -1; 295 } 296 /* if t2 not set, t1 wins since it is set */ 297 else if (t2->count == 0) 298 return 1; 299 /* least initial value wins */ 300 if ((diff = t2->first - t1->first) != 0) 301 return diff; 302 /* least number of times the tag was set, wins */ 303 if ((diff = t2->count - t1->count) != 0) 304 return diff; 305 /* if the tags were only set once, they only have initial values */ 306 if (t1->count == 1) 307 return 0; 308 /* greatest current value wins */ 309 return t1->value - t2->value; 310 311 default: 312 /* Shouldn't happen: only assert if TRE_DEBUG defined */ 313 assert(0); 314 break; 315 } 316 return 0; 317 } 318 319 #ifdef TRE_DEBUG 320 #define _MORE_DEBUGGING 321 #endif /* TRE_DEBUG */ 322 323 /* Returns 1 if `t1' wins `t2', 0 otherwise. */ 324 inline static int 325 #ifdef _MORE_DEBUGGING 326 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 327 const tre_tag_t *t1, const tre_tag_t *t2) 328 #else /* !_MORE_DEBUGGING */ 329 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 330 const tre_tag_t *t1, const tre_tag_t *t2) 331 #endif /* !_MORE_DEBUGGING */ 332 { 333 int i, ret; 334 335 for (i = 0; i < num_tags; i++) 336 { 337 if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0) 338 return (ret > 0); 339 } 340 /* assert(0);*/ 341 return 0; 342 } 343 344 #ifdef _MORE_DEBUGGING 345 inline static int 346 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, 347 const tre_tag_t *t1, const tre_tag_t *t2) 348 { 349 int ret = _tre_tag_order(num_tags, tag_directions, t1, t2); 350 DPRINT(("tre_tag_order: ")); 351 tre_print_tags(t1, num_tags); 352 DPRINT((" %s ", ret ? "wins" : "doesn't win")); 353 tre_print_tags(t2, num_tags); 354 DPRINT(("\n")); 355 return ret; 356 } 357 #endif /* _MORE_DEBUGGING */ 358 359 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len); 360 361 inline static int 362 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc, 363 const tre_tnfa_t * __restrict tnfa) 364 { 365 int match = 0; 366 int i; 367 tre_bracket_match_t *b; 368 tre_cint_t uc, lc; 369 int we, ue, le, got_equiv = 0; 370 int icase = ((tnfa->cflags & REG_ICASE) != 0); 371 372 DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase)); 373 if (icase) 374 { 375 if (tre_islower_l(wc, tnfa->loc)) 376 { 377 lc = wc; 378 uc = tre_toupper_l(wc, tnfa->loc); 379 } 380 else if (tre_isupper_l(wc, tnfa->loc)) 381 { 382 uc = wc; 383 lc = tre_tolower_l(wc, tnfa->loc); 384 } 385 else 386 { 387 icase = 0; 388 } 389 } 390 for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches; 391 i++, b++) 392 { 393 switch (b->type) 394 { 395 case TRE_BRACKET_MATCH_TYPE_CHAR: 396 if (icase) 397 match = (b->value == uc || b->value == lc); 398 else 399 match = (b->value == wc); 400 break; 401 case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN: 402 { 403 tre_cint_t start = b->value, end; 404 if (++i >= list->num_bracket_matches || 405 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END) 406 { 407 DPRINT(("tre_bracket_match: no following range end\n")); 408 assert(0); 409 goto error; 410 } 411 end = b->value; 412 if (!got_equiv) 413 { 414 if (icase) 415 { 416 ue = __collate_equiv_value(tnfa->loc, &uc, 1); 417 le = __collate_equiv_value(tnfa->loc, &lc, 1); 418 } 419 else 420 we = __collate_equiv_value(tnfa->loc, &wc, 1); 421 got_equiv = 1; 422 } 423 if (icase) 424 match = ((start <= ue && ue <= end) || 425 (start <= le && le <= end)); 426 else 427 match = (start <= we && we <= end); 428 break; 429 } 430 case TRE_BRACKET_MATCH_TYPE_RANGE_END: 431 DPRINT(("tre_bracket_match: range end without preceeding start\n")); 432 assert(0); 433 break; 434 case TRE_BRACKET_MATCH_TYPE_CLASS: 435 if (icase) 436 match = (tre_isctype_l(uc, b->value, tnfa->loc) || 437 tre_isctype_l(lc, b->value, tnfa->loc)); 438 else 439 match = (tre_isctype_l(wc, b->value, tnfa->loc)); 440 break; 441 case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE: 442 if (!got_equiv) 443 { 444 if (icase) 445 { 446 ue = __collate_equiv_value(tnfa->loc, &uc, 1); 447 le = __collate_equiv_value(tnfa->loc, &lc, 1); 448 } 449 else 450 we = __collate_equiv_value(tnfa->loc, &wc, 1); 451 got_equiv = 1; 452 } 453 if (icase) 454 match = (b->value == ue || b->value == le); 455 else 456 match = (b->value == we); 457 break; 458 default: 459 DPRINT(("tre_bracket_match: unknown type %d\n", b->type)); 460 assert(0); 461 break; 462 } 463 if (match) 464 break; 465 } 466 error: 467 if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) { 468 if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0; 469 match = !match; 470 } 471 return match; 472 }