1 /*
2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 tre-match-utils.h - TRE matcher helper definitions
31 */
32
33 #include "tre-internal.h"
34
35 #define str_source ((const tre_str_source*)string)
36
37 /*
38 * Because all multibyte encodings are exclusively single-shift encoding,
39 * with the shift codes having the high bit set, we can make an optimization
40 * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
41 * is detected, and just do a direct copy for ASCII characters.
42 */
43 #define GET_NEXT_WCHAR() \
44 do { \
45 prev_c = next_c; \
46 switch (type) \
47 { \
48 case STR_BYTE: \
49 pos++; \
50 if (len >= 0 && pos >= len) \
51 next_c = '\0'; \
52 else \
53 next_c = (unsigned char)(*str_byte++); \
54 break; \
55 case STR_WIDE: \
56 pos++; \
57 if (len >= 0 && pos >= len) \
58 next_c = L'\0'; \
59 else \
60 next_c = *str_wide++; \
61 break; \
62 case STR_MBS: \
63 pos += pos_add_next; \
64 if (__builtin_expect(len >= 0 && pos >= len, 0)) \
65 { \
66 next_c = L'\0'; \
67 pos_add_next = 1; \
68 } \
69 else if (__builtin_expect(!(*str_byte & 0x80), 1)) \
70 { \
71 next_c = (unsigned char)(*str_byte++); \
72 pos_add_next = 1; \
73 } \
74 else \
75 { \
76 size_t w; \
77 int max; \
78 if (len >= 0) \
79 max = len - pos; \
80 else \
81 max = 32; \
82 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate, \
83 tnfa->loc); \
84 if (w == (size_t)-1 || w == (size_t)-2) \
85 return REG_NOMATCH; \
86 if (w == 0 && len >= 0) \
87 { \
88 pos_add_next = 1; \
89 next_c = 0; \
90 str_byte++; \
91 } \
92 else \
93 { \
94 pos_add_next = w; \
95 str_byte += w; \
96 } \
97 } \
98 break; \
99 } \
100 } while(/*CONSTCOND*/0)
101
102 /* Assumes tre_tnfa_t *tnfa in scope */
103 #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
104
105 #define CHECK_ASSERTIONS(assertions) \
106 (((assertions & ASSERT_AT_BOL) \
107 && (pos > 0 || reg_notbol) \
108 && (prev_c != L'\n' || !reg_newline)) \
109 || ((assertions & ASSERT_AT_EOL) \
110 && (next_c != L'\0' || reg_noteol) \
111 && (next_c != L'\n' || !reg_newline)) \
112 || ((assertions & ASSERT_AT_BOW) \
113 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
114 || ((assertions & ASSERT_AT_EOW) \
115 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
116 || ((assertions & ASSERT_AT_WB) \
117 && (pos != 0 && next_c != L'\0' \
118 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
119 || ((assertions & ASSERT_AT_WB_NEG) \
120 && (pos == 0 || next_c == L'\0' \
121 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
122
123 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
124 ((trans_i->assertions & ASSERT_BRACKET_MATCH) \
125 && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c, \
126 tnfa))
127
128 inline static int
129 tre_tag_get(const tre_tag_t *tags, int i)
130 {
131 tags += i;
132 return tags->count > 0 ? tags->value : -1;
133 }
134
135 inline static void
136 tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
137 {
138 tags += i;
139 if (tags->count++ == 0)
140 tags->first = val;
141 tags->value = val;
142 tags->touch = touch;
143 }
144
145 inline static void
146 tre_tag_reset(tre_tag_t *tags, int i)
147 {
148 tags[i].count = 0;
149 }
150
151 inline static int
152 tre_tag_touch_get(const tre_tag_t *tags, int i)
153 {
154 return tags[i].touch;
155 }
156
157 #ifdef TRE_DEBUG
158 inline static void
159 tre_print_tags(const tre_tag_t *tags, int num_tags)
160 {
161 int i;
162 for (i = 0; i < num_tags; i++, tags++)
163 {
164 switch(tags->count)
165 {
166 case 0:
167 DPRINT(("%d:(0,-1)", i));
168 break;
169 case 1:
170 DPRINT(("%d:(1,%d)", i, tags->first));
171 break;
172 default:
173 DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
174 tags->value));
175 break;
176 }
177 if (i < (num_tags - 1))
178 DPRINT((" "));
179 }
180 }
181
182 inline static void
183 tre_print_tags_all(const tre_tag_t *tags, int num_tags)
184 {
185 int i;
186 for (i = 0; i < num_tags; i++, tags++)
187 {
188 switch(tags->count)
189 {
190 case 0:
191 DPRINT(("%d:(0,-1)/%d", i, tags->touch));
192 break;
193 case 1:
194 DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
195 break;
196 default:
197 DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
198 tags->value, tags->touch));
199 break;
200 }
201 if (i < (num_tags - 1))
202 DPRINT((" "));
203 }
204 }
205 #endif /* TRE_DEBUG */
206
207 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
208 * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
209 inline static int
210 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
211 const tre_tag_t *tags2)
212 {
213 const tre_tag_t *t1, *t2;
214
215 t1 = tags1 + start;
216 t2 = tags2 + start;
217 /* We need both start tags to be set */
218 if (t1->count == 0 || t2->count == 0)
219 return 0;
220
221 /* The start tags must be equal */
222 if (t1->value != t2->value)
223 return 0;
224
225 t1 = tags1 + end;
226 t2 = tags2 + end;
227 /* For the end tags, we prefer set over unset, because unset means that
228 * the end tag is still growing */
229 if (t1->count == 0)
230 {
231 /* if t2 is set, t1 loses since it is unset */
232 if (t2->count != 0)
233 return -1;
234 }
235 /* if t2 not set, t1 wins since it is set */
236 else if (t2->count == 0)
237 return 1;
238
239 /* least current value wins */
240 return t2->value - t1->value;
241 }
242
243 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
244 * (t1 loses if < 0, t1 wins if > 0) */
245 inline static int
246 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
247 const tre_tag_t *t2)
248 {
249 int diff;
250
251 t1 += i;
252 t2 += i;
253 switch (dir)
254 {
255 case TRE_TAG_MINIMIZE:
256 /* least current value wins (because tags are initialized to all zeros,
257 * unset wins over set; also, tre_minimal_tag_order() will have already
258 * been run, which checks for being unset) */
259 return t2->value - t1->value;
260
261 case TRE_TAG_MAXIMIZE:
262 /* prefer set */
263 if (t1->count == 0)
264 {
265 /* if neither t1 and t2 are set, try next tag */
266 if (t2->count == 0)
267 return 0;
268 /* t2 is set, t1 loses since it is unset */
269 return -1;
270 }
271 /* if t2 not set, t1 wins since it is set */
272 else if (t2->count == 0)
273 return 1;
274 /* greatest initial value wins */
275 if ((diff = t1->first - t2->first) != 0)
276 return diff;
277 /* least number of times the tag was set, wins */
278 if ((diff = t2->count - t1->count) != 0)
279 return diff;
280 /* if the tags were only set once, they only have initial values */
281 if (t1->count == 1)
282 return 0;
283 /* greatest current value wins */
284 return t1->value - t2->value;
285
286 case TRE_TAG_LEFT_MAXIMIZE:
287 /* prefer set */
288 if (t1->count == 0)
289 {
290 /* if neither t1 and t2 are set, try next tag */
291 if (t2->count == 0)
292 return 0;
293 /* t2 is set, t1 loses since it is unset */
294 return -1;
295 }
296 /* if t2 not set, t1 wins since it is set */
297 else if (t2->count == 0)
298 return 1;
299 /* least initial value wins */
300 if ((diff = t2->first - t1->first) != 0)
301 return diff;
302 /* least number of times the tag was set, wins */
303 if ((diff = t2->count - t1->count) != 0)
304 return diff;
305 /* if the tags were only set once, they only have initial values */
306 if (t1->count == 1)
307 return 0;
308 /* greatest current value wins */
309 return t1->value - t2->value;
310
311 default:
312 /* Shouldn't happen: only assert if TRE_DEBUG defined */
313 assert(0);
314 break;
315 }
316 return 0;
317 }
318
319 #ifdef TRE_DEBUG
320 #define _MORE_DEBUGGING
321 #endif /* TRE_DEBUG */
322
323 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
324 inline static int
325 #ifdef _MORE_DEBUGGING
326 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
327 const tre_tag_t *t1, const tre_tag_t *t2)
328 #else /* !_MORE_DEBUGGING */
329 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
330 const tre_tag_t *t1, const tre_tag_t *t2)
331 #endif /* !_MORE_DEBUGGING */
332 {
333 int i, ret;
334
335 for (i = 0; i < num_tags; i++)
336 {
337 if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
338 return (ret > 0);
339 }
340 /* assert(0);*/
341 return 0;
342 }
343
344 #ifdef _MORE_DEBUGGING
345 inline static int
346 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
347 const tre_tag_t *t1, const tre_tag_t *t2)
348 {
349 int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
350 DPRINT(("tre_tag_order: "));
351 tre_print_tags(t1, num_tags);
352 DPRINT((" %s ", ret ? "wins" : "doesn't win"));
353 tre_print_tags(t2, num_tags);
354 DPRINT(("\n"));
355 return ret;
356 }
357 #endif /* _MORE_DEBUGGING */
358
359 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
360
361 inline static int
362 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
363 const tre_tnfa_t * __restrict tnfa)
364 {
365 int match = 0;
366 int i;
367 tre_bracket_match_t *b;
368 tre_cint_t uc, lc;
369 int we, ue, le, got_equiv = 0;
370 int icase = ((tnfa->cflags & REG_ICASE) != 0);
371
372 DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
373 if (icase)
374 {
375 if (tre_islower_l(wc, tnfa->loc))
376 {
377 lc = wc;
378 uc = tre_toupper_l(wc, tnfa->loc);
379 }
380 else if (tre_isupper_l(wc, tnfa->loc))
381 {
382 uc = wc;
383 lc = tre_tolower_l(wc, tnfa->loc);
384 }
385 else
386 {
387 icase = 0;
388 }
389 }
390 for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
391 i++, b++)
392 {
393 switch (b->type)
394 {
395 case TRE_BRACKET_MATCH_TYPE_CHAR:
396 if (icase)
397 match = (b->value == uc || b->value == lc);
398 else
399 match = (b->value == wc);
400 break;
401 case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
402 {
403 tre_cint_t start = b->value, end;
404 if (++i >= list->num_bracket_matches ||
405 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
406 {
407 DPRINT(("tre_bracket_match: no following range end\n"));
408 assert(0);
409 goto error;
410 }
411 end = b->value;
412 if (!got_equiv)
413 {
414 if (icase)
415 {
416 ue = __collate_equiv_value(tnfa->loc, &uc, 1);
417 le = __collate_equiv_value(tnfa->loc, &lc, 1);
418 }
419 else
420 we = __collate_equiv_value(tnfa->loc, &wc, 1);
421 got_equiv = 1;
422 }
423 if (icase)
424 match = ((start <= ue && ue <= end) ||
425 (start <= le && le <= end));
426 else
427 match = (start <= we && we <= end);
428 break;
429 }
430 case TRE_BRACKET_MATCH_TYPE_RANGE_END:
431 DPRINT(("tre_bracket_match: range end without preceeding start\n"));
432 assert(0);
433 break;
434 case TRE_BRACKET_MATCH_TYPE_CLASS:
435 if (icase)
436 match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
437 tre_isctype_l(lc, b->value, tnfa->loc));
438 else
439 match = (tre_isctype_l(wc, b->value, tnfa->loc));
440 break;
441 case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
442 if (!got_equiv)
443 {
444 if (icase)
445 {
446 ue = __collate_equiv_value(tnfa->loc, &uc, 1);
447 le = __collate_equiv_value(tnfa->loc, &lc, 1);
448 }
449 else
450 we = __collate_equiv_value(tnfa->loc, &wc, 1);
451 got_equiv = 1;
452 }
453 if (icase)
454 match = (b->value == ue || b->value == le);
455 else
456 match = (b->value == we);
457 break;
458 default:
459 DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
460 assert(0);
461 break;
462 }
463 if (match)
464 break;
465 }
466 error:
467 if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
468 if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
469 match = !match;
470 }
471 return match;
472 }