1 /*
2 * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 tre-parse.c - Regexp parser
31 */
32
33 /*
34 This parser is just a simple recursive descent parser for POSIX.2
35 regexps. The parser supports both the obsolete default syntax and
36 the "extended" syntax, and some nonstandard extensions.
37 */
38
39 #include <assert.h>
40 #include <errno.h>
41 #include <limits.h>
42 #include <stddef.h>
43 #include <string.h>
44
45 #include "malloc.h"
46 #include "tre-mem.h"
47 #include "tre-ast.h"
48 #include "tre-stack.h"
49 #include "tre-parse.h"
50
51 #include "../locale/collate.h"
52
53 /* BSD compatibility:
54 Before looking up a collating symbol, check if the name matches in
55 the character names (cnames) array; if so, use the corresponding
56 character.
57
58 Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
59 the implementation choice that for ERE, a non-numeric character following
60 a left brace that would normally be a bound, causes the left brace to be
61 literal. */
62 #define BSD_COMPATIBILITY
63 #ifdef BSD_COMPATIBILITY
64 #include "cname.h"
65 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
66 #endif /* BSD_COMPATIBILITY */
67
68 /* Characters with special meanings in regexp syntax. */
69 #define CHAR_PIPE L'|'
70 #define CHAR_LPAREN L'('
71 #define CHAR_RPAREN L')'
72 #define CHAR_LBRACE L'{'
73 #define CHAR_RBRACE L'}'
74 #define CHAR_LBRACKET L'['
75 #define CHAR_RBRACKET L']'
76 #define CHAR_MINUS L'-'
77 #define CHAR_STAR L'*'
78 #define CHAR_QUESTIONMARK L'?'
79 #define CHAR_PLUS L'+'
80 #define CHAR_PERIOD L'.'
81 #define CHAR_COLON L':'
82 #define CHAR_EQUAL L'='
83 #define CHAR_COMMA L','
84 #define CHAR_CARET L'^'
85 #define CHAR_DOLLAR L'$'
86 #define CHAR_BACKSLASH L'\\'
87 #define CHAR_HASH L'#'
88 #define CHAR_TILDE L'~'
89
90 /* Some macros for expanding \w, \s, etc. */
91 static const struct tre_macro_struct {
92 const char c;
93 const char *expansion;
94 } tre_macros[] =
95 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
96 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
97 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
98 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
99 { 0, NULL }
100 };
101
102 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
103 must have at least `len' items. Sets buf[0] to zero if the there
104 is no match in `tre_macros'. */
105 static void
106 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
107 tre_char_t *buf, size_t buf_len)
108 {
109 int i;
110
111 buf[0] = 0;
112 if (regex >= regex_end)
113 return;
114
115 for (i = 0; tre_macros[i].expansion; i++)
116 {
117 if (tre_macros[i].c == *regex)
118 {
119 unsigned int j;
120 DPRINT(("Expanding macro '%c' => '%s'\n",
121 tre_macros[i].c, tre_macros[i].expansion));
122 for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
123 buf[j] = tre_macros[i].expansion[j];
124 buf[j] = 0;
125 break;
126 }
127 }
128 }
129
130 static reg_errcode_t
131 tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
132 tre_bracket_match_list_t **items)
133 {
134 reg_errcode_t status = REG_OK;
135 tre_bracket_match_list_t *array = *items;
136 int i = array->num_bracket_matches;
137 /* Allocate more space if necessary. */
138 if (i >= *max_i)
139 {
140 tre_bracket_match_list_t *new_items;
141 DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
142 /* If the array is already 1024 items large, give up -- there's
143 probably an error in the regexp (e.g. not a '\0' terminated
144 string and missing ']') */
145 if (*max_i >= 1024)
146 return REG_ESPACE;
147 *max_i *= 2;
148 new_items = realloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
149 if (new_items == NULL)
150 return REG_ESPACE;
151 *items = array = new_items;
152 }
153 array->bracket_matches[i].type = type;
154 array->bracket_matches[i].value = val;
155 array->num_bracket_matches++;
156 return status;
157 }
158
159 #define REST(re) (int)(ctx->re_end - (re)), (re)
160
161 #define START_COLLATING_SYMBOLS 16
162 #define MAX_COLLATING_SYMBOL_LEN 4
163
164 typedef struct {
165 const tre_char_t *start;
166 int len;
167 } tre_collating_symbol;
168
169 #ifdef BSD_COMPATIBILITY
170 static wchar_t
171 tre_search_cnames(const wchar_t *name, size_t len)
172 {
173 size_t low = 0;
174 size_t high = NCNAMES - 1;
175 size_t cur;
176 int cmp;
177
178 while(low <= high)
179 {
180 cur = (low + high) / 2;
181 cmp = wcsncmp(name, cnames[cur].name, len);
182 if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
183 if (cmp > 0) low = cur + 1;
184 else high = cur - 1;
185 }
186 return (wchar_t)-1;
187 }
188 #endif /* BSD_COMPATIBILITY */
189
190 /* Scan the contents of a bracket expression, and create a
191 * tre_bracket_match_list_t encoding the bracket expression. If during
192 * the scan, multi-character collating symbols are detected, switch
193 * into a mode to collect those MCCSs into a tre_collating_symbol
194 * list and pass them back. tre_parse_bracket will use that to
195 * create a new string composed of a union of the bracket expression
196 * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
197 * call tre_parse (recursive) to parse that new string (which will
198 * call tre_parse_bracket and tre_parse_bracket_items again. */
199 static reg_errcode_t
200 tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
201 int *items_size, tre_collating_symbol **result)
202 {
203 const tre_char_t *re = ctx->re;
204 const tre_char_t *re_end = ctx->re_end;
205 tre_collating_symbol *col_syms = NULL;
206 tre_collating_symbol *cp = NULL;
207 int n_col_syms = 0;
208 reg_errcode_t status;
209 int max_i = *items_size;
210 int other = 0; /* contains content other than multi-character collating
211 * symbols */
212 int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
213 tre_cint_t min, c;
214 int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
215 int collect_MCCS = 0;
216 const tre_char_t *start;
217
218 for ( ;re < re_end; re++)
219 {
220 switch (*re)
221 {
222 case CHAR_MINUS:
223 /* A first hyphen */
224 if (re == ctx->re)
225 {
226 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
227 min = CHAR_MINUS;
228 other++;
229 range = 0;
230 break;
231 }
232 /* The hyphen is the end range */
233 if (range > 0)
234 {
235 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
236 c = CHAR_MINUS;
237 goto process_end_range;
238 }
239 if (re + 1 >= re_end)
240 {
241 status = REG_EBRACK;
242 goto error;
243 }
244 /* The hyphen is at the end */
245 if (re[1] == CHAR_RBRACKET)
246 {
247 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
248 c = CHAR_MINUS;
249 goto process_begin_range;
250 }
251 /* Two ranges are not allowed to share an endpoint, or begin
252 * range is illegal. */
253 if (range < 0)
254 {
255 status = REG_ERANGE;
256 goto error;
257 }
258 range = 1; /* Expect end range */
259 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
260 break;
261
262 case CHAR_LBRACKET:
263 if (re + 1 >= re_end)
264 {
265 status = REG_EBRACK;
266 goto error;
267 }
268 switch (re[1])
269 {
270 case CHAR_PERIOD:
271 {
272 re += 2;
273 start = re;
274 for (;; re++)
275 {
276 if (re >= re_end)
277 {
278 status = REG_ECOLLATE;
279 goto error;
280 }
281 if (*re == CHAR_PERIOD)
282 {
283 if (re + 1 >= re_end)
284 {
285 status = REG_ECOLLATE;
286 goto error;
287 }
288 /* Found end */
289 if (re[1] == CHAR_RBRACKET)
290 {
291 DPRINT(("tre_parse_bracket: collating "
292 "symbol: '%.*" STRF "'\n",
293 REST(start - 2)));
294 /* Empty name */
295 if (re == start)
296 {
297 status = REG_ECOLLATE;
298 goto error;
299 }
300 #ifdef BSD_COMPATIBILITY
301 /* Check if the name is in cnames; if so, use
302 the corresponding code */
303 c = tre_search_cnames(start, re - start);
304 if (c != (wchar_t)-1)
305 {
306 re++;
307 goto process_single_character;
308 }
309 #endif /* BSD_COMPATIBILITY */
310 /* Verify this is a known sequence */
311 if (__collate_equiv_value(ctx->loc, start,
312 re - start) <= 0)
313 {
314 status = REG_ECOLLATE;
315 goto error;
316 }
317 /* Process single character collating symbols */
318 if (re - start == 1)
319 {
320 c = *start;
321 re++;
322 goto process_single_character;
323 }
324 /* Inverted MCCSs are undefined */
325 if (invert)
326 {
327 status = REG_ECOLLATE;
328 goto error;
329 }
330 /* Can't have MCCSs as an endpoint to a range */
331 if (range > 0)
332 {
333 status = REG_ERANGE;
334 goto error;
335 }
336 range = -1;
337 /* Switch into MCCS collection mode (if not
338 * already there */
339 #if TRE_DEBUG
340 if (!collect_MCCS)
341 {
342 collect_MCCS = 1;
343 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
344 }
345 #else /* !TRE_DEBUG */
346 collect_MCCS = 1;
347 #endif /* !TRE_DEBUG */
348 /* Allocate a memory block the first time */
349 if (!cp)
350 {
351 if ((col_syms = malloc(sizeof(*col_syms) *
352 (START_COLLATING_SYMBOLS + 2)))
353 == NULL)
354 return REG_ESPACE;
355 cp = col_syms + 1;
356 n_col_syms = START_COLLATING_SYMBOLS;
357 }
358 /* Enlarge the memory block is more is needed */
359 if ((cp - col_syms) - 1 >= n_col_syms)
360 {
361 int i = n_col_syms;
362 tre_collating_symbol *tmp =
363 realloc(col_syms, sizeof(*col_syms) *
364 ((n_col_syms *= 2) + 2));
365 if (tmp == NULL)
366 {
367 free(col_syms);
368 return REG_ESPACE;
369 }
370 DPRINT(("tre_list_collating_symbols: "
371 "Enlarging col_syms to %d\n",
372 n_col_syms));
373 col_syms = tmp;
374 cp = col_syms + i + 1;
375 }
376 cp->start = start;
377 cp->len = re - start;
378 cp++;
379 re++;
380 break;
381 }
382 }
383 }
384 break;
385 }
386
387 case CHAR_EQUAL:
388 case CHAR_COLON:
389 {
390 /* Process equivalence and character classes */
391 tre_char_t kind = re[1];
392
393 /* Can't have a class as an endpoint to a range */
394 if (range > 0)
395 {
396 status = REG_ERANGE;
397 goto error;
398 }
399 if (!collect_MCCS && range == 0)
400 {
401 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
402 min, &max_i, items);
403 if (status != REG_OK)
404 goto error;
405 }
406 range = -1;
407 re += 2;
408 start = re;
409 for (;; re++)
410 {
411 if (re >= re_end)
412 {
413 status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
414 goto error;
415 }
416 if (*re == kind)
417 {
418 if (re + 1 >= re_end)
419 {
420 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
421 REG_ECTYPE;
422 goto error;
423 }
424 /* Found end */
425 if (re[1] == CHAR_RBRACKET)
426 {
427 if (re == start)
428 {
429 /* Empty class name */
430 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
431 REG_ECTYPE;
432 goto error;
433 }
434 /* Process equivalence class */
435 if (kind == CHAR_EQUAL)
436 {
437 int equiv;
438
439 DPRINT(("tre_parse_bracket: equivalence: '%.*"
440 STRF "'\n", REST(start - 2)));
441
442 /* While we find the collation value even for
443 multi-character collating elements , we
444 don't (yet) match any collation values
445 against multi-character sequences. We'd have
446 to enumerate those multi-character sequences
447 and like multi-character collating symbols,
448 create a union of those sequences with the
449 rest of the bracket expression. While
450 doable, a bracket expression matching
451 multiple characters, that doesn't explicitly
452 contain multi-character sequences, might
453 be unexpected, so we punt for now. */
454 if ((equiv = __collate_equiv_value(ctx->loc,
455 start, re - start)) <= 0)
456 {
457 /* The standard says that if no collating
458 element if found, we use the collating
459 symbol itself. But __collate_equiv_value
460 doesn't make a distinction between
461 an element that is in a equvalence
462 class with others, or is the only member,
463 so we already know there is no collating
464 symbol. (Note that in the case of a
465 collating element whose collation value
466 is unique, matching against the
467 collating element itself, or against
468 its collation value, is equivalent.) */
469 #ifdef BSD_COMPATIBILITY
470 /* Check if the name is in cnames; if so,
471 use the corresponding code */
472 c = tre_search_cnames(start, re - start);
473 if (c != (wchar_t)-1)
474 {
475 re++;
476 goto process_single_character;
477 }
478 #endif /* BSD_COMPATIBILITY */
479 status = REG_ECOLLATE;
480 goto error;
481 }
482 if (!collect_MCCS)
483 {
484 status = tre_new_item(ctx->mem,
485 TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
486 equiv, &max_i, items);
487 if (status != REG_OK)
488 goto error;
489 }
490 }
491 else
492 {
493 /* Process character class */
494 DPRINT(("tre_parse_bracket: class: '%.*" STRF
495 "'\n", REST(start - 2)));
496 if (!collect_MCCS)
497 {
498 char tmp_str[64];
499 tre_ctype_t class;
500 int len = MIN(re - start, 63);
501 {
502 tre_char_t tmp_wcs[64];
503 wcsncpy(tmp_wcs, start, (size_t)len);
504 tmp_wcs[len] = L'\0';
505 {
506 mbstate_t state;
507 const tre_char_t *src = tmp_wcs;
508 memset(&state, '\0', sizeof(state));
509 len = wcsrtombs_l(tmp_str, &src,
510 sizeof(tmp_str), &state,
511 ctx->loc);
512 }
513 }
514 tmp_str[len] = '\0';
515 DPRINT((" class name: %s\n", tmp_str));
516 class = tre_ctype_l(tmp_str, ctx->loc);
517 if (!class)
518 {
519 status = REG_ECTYPE;
520 goto error;
521 }
522 status = tre_new_item(ctx->mem,
523 TRE_BRACKET_MATCH_TYPE_CLASS,
524 class, &max_i, items);
525 if (status != REG_OK)
526 goto error;
527 }
528 }
529 re++;
530 break;
531 }
532 }
533 }
534 other++;
535 break;
536 }
537
538 default:
539 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
540 c = CHAR_LBRACKET;
541 goto process_single_character;
542 break;
543 }
544 break;
545
546 case CHAR_RBRACKET:
547 /* A first right bracket */
548 if (re == ctx->re)
549 {
550 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
551 min = CHAR_RBRACKET;
552 range = 0;
553 other++;
554 break;
555 }
556 /* Done */
557 if (collect_MCCS)
558 {
559 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
560 REST(re)));
561 if (col_syms)
562 {
563 /* Mark the character following the right bracket. Set len
564 * to whether there are other things besides the
565 * multi-character collating symbols */
566 col_syms->start = re + 1;
567 col_syms->len = other;
568 /* Mark the end of the list */
569 cp->start = NULL;
570 }
571 *result = col_syms;
572 return REG_OK;
573 }
574 /* range > 0 is not possible, since we did a lookahead after the
575 * hyphen */
576 if (range == 0)
577 {
578 status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
579 min, &max_i, items);
580 if (status != REG_OK)
581 goto error;
582 }
583 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
584 *items_size = max_i;
585 ctx->re = re + 1;
586 return REG_OK;
587
588 default:
589 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
590 c = *re;
591 process_single_character:
592 /* Process single character */
593 if (range > 0)
594 {
595 int mine, maxe;
596
597 process_end_range:
598 /* Get collation equivalence values */
599 mine = __collate_equiv_value(ctx->loc, &min, 1);
600 maxe = __collate_equiv_value(ctx->loc, &c, 1);
601 if (maxe < mine)
602 {
603 status = REG_ERANGE;
604 goto error;
605 }
606 if (!collect_MCCS)
607 {
608 status = tre_new_item(ctx->mem,
609 TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
610 mine, &max_i, items);
611 if (status != REG_OK)
612 goto error;
613 status = tre_new_item(ctx->mem,
614 TRE_BRACKET_MATCH_TYPE_RANGE_END,
615 maxe, &max_i, items);
616 if (status != REG_OK)
617 goto error;
618 }
619 range = -1;
620 }
621 else
622 {
623 process_begin_range:
624 if (!collect_MCCS)
625 {
626 if (range == 0)
627 {
628 status = tre_new_item(ctx->mem,
629 TRE_BRACKET_MATCH_TYPE_CHAR,
630 min, &max_i, items);
631 if (status != REG_OK)
632 goto error;
633 }
634 min = c;
635 }
636 range = 0;
637 }
638 other++;
639 break;
640 }
641 }
642 status = REG_EBRACK;
643 error:
644 DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n",
645 REST(re), status));
646 if (col_syms)
647 free(col_syms);
648 return status;
649 }
650
651 #ifdef TRE_DEBUG
652 static const char *bracket_match_type_str[] = {
653 "unused",
654 "char",
655 "range begin",
656 "range end",
657 "class",
658 "equivalence value",
659 };
660 #endif /* TRE_DEBUG */
661
662 static reg_errcode_t
663 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
664 {
665 tre_ast_node_t *node;
666 reg_errcode_t status = REG_OK;
667 tre_bracket_match_list_t *items;
668 int max_i = 32;
669 tre_collating_symbol *col_syms = NULL;
670
671 /* Handle special cases [[:<:]] and [[:>:]] */
672 if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
673 && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
674 && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
675 && ctx->re[5] == CHAR_RBRACKET)
676 {
677 *result = tre_ast_new_literal(ctx->mem, ASSERTION,
678 (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
679 -1);
680 DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
681 "[[:<:]]" : "[[:>:]]"));
682 ctx->re += 6;
683 return *result ? REG_OK : REG_ESPACE;
684 }
685
686 /* Start off with an array of `max_i' elements. */
687 items = calloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
688 if (items == NULL)
689 return REG_ESPACE;
690
691 if (*ctx->re == CHAR_CARET)
692 {
693 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
694 items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
695 ctx->re++;
696 }
697
698 status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
699
700 if (status != REG_OK)
701 goto parse_bracket_done;
702
703 /* If there are collating symbols, split off the multi-character ones
704 * into a union of the bracket expression (without the collating symbols)
705 * and the multiple-character sequences. We create an equivalent input
706 * string and run tre_parse() recursively */
707 if (col_syms)
708 {
709 tre_char_t *str, *sp;
710 tre_collating_symbol *cp;
711 tre_parse_ctx_t subctx;
712
713 /* Allocate a new string. We start with the size of the original
714 * bracket expression (minus 1) and add 2 (for a leading "[" and
715 * a trailing nil; don't need a "^", since it is illegal to have
716 * inverted MCCSs). Since a multi-character collating symbols
717 * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
718 * need to worry about the new string getting too long. */
719 free(items);
720 str = malloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
721 if (str == NULL)
722 {
723 free(col_syms);
724 return REG_ESPACE;
725 }
726 sp = str;
727 if (col_syms->len > 0)
728 {
729 /* There are other items in the bracket expression besides the
730 * multi-character collating symbols, so create a new bracket
731 * expression with only those other itmes. */
732 const tre_char_t *re;
733 ptrdiff_t i;
734
735 *sp++ = '[';
736 re = ctx->re;
737 for (cp = col_syms + 1; cp->start; cp++)
738 {
739 /* The "- 2" is to account for the "[." */
740 if ((i = ((cp->start - re) - 2)) > 0)
741 {
742 memcpy(sp, re, sizeof(*sp) * i);
743 sp += i;
744 }
745 /* The "+ 2" is to account for the ".]" */
746 re = cp->start + cp->len + 2;
747 }
748 i = col_syms->start - re; /* Includes the trailing right bracket */
749 memcpy(sp, re, sizeof(*sp) * i);
750 sp += i;
751 *sp++ = '|';
752 }
753 for (cp = col_syms + 1; cp->start; cp++)
754 {
755 memcpy(sp, cp->start, sizeof(*sp) * cp->len);
756 sp += cp->len;
757 if (cp[1].start)
758 *sp++ = '|';
759 }
760 *sp = 0;
761 DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
762 str));
763
764 memcpy(&subctx, ctx, sizeof(subctx));
765 subctx.re = str;
766 subctx.len = sp - str;
767 subctx.nofirstsub = 1;
768 subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
769 status = tre_parse(&subctx);
770 free(str);
771 if (status != REG_OK)
772 {
773 free(col_syms);
774 return status;
775 }
776 ctx->re = col_syms->start;
777 ctx->position = subctx.position;
778 free(col_syms);
779 *result = subctx.result;
780 DPRINT(("tre_parse_bracket: Returning to original string\n"));
781 return REG_OK;
782 }
783
784 DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
785 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
786 if (node == NULL)
787 {
788 status = REG_ESPACE;
789 goto parse_bracket_done;
790 }
791 else
792 {
793 tre_literal_t *l = node->obj;
794 l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
795 SIZEOF_BRACKET_MATCH_LIST(items));
796 if (l->u.bracket_match_list == NULL)
797 {
798 status = REG_ESPACE;
799 goto parse_bracket_done;
800 }
801 memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
802 }
803
804 #ifdef TRE_DEBUG
805 {
806 int i;
807 tre_bracket_match_t *b;
808 DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
809 items->num_bracket_matches, items->flags));
810 for (i = 0, b = items->bracket_matches;
811 i < items->num_bracket_matches; i++, b++)
812 {
813 DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type],
814 b->value));
815 }
816 }
817 #endif /* TRE_DEBUG */
818
819 parse_bracket_done:
820 free(items);
821 ctx->position++;
822 *result = node;
823 return status;
824 }
825
826 /* Parses a positive decimal integer. Returns -1 if the string does not
827 contain a valid number. */
828 static int
829 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
830 {
831 int num;
832 tre_char_t *endptr;
833
834 errno = 0;
835 num = (int)wcstoul(*regex, (wchar_t **)&endptr, 10);
836 if (errno != 0)
837 return -1;
838 *regex = endptr;
839 return num;
840 }
841
842 static reg_errcode_t
843 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
844 {
845 int min, max;
846 const tre_char_t *r = ctx->re;
847 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
848
849 /* Parse number (minimum repetition count). */
850 min = -1;
851 if (r >= ctx->re_end)
852 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
853 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
854 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
855 return REG_EBRACE;
856 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
857 if (*r >= L'0' && *r <= L'9') {
858 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
859 min = tre_parse_int(&r, ctx->re_end);
860 if (min == -1)
861 return REG_BADBR;
862 }
863 else
864 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
865 /* For ERE, return REG_NOMATCH to signal that the lbrace should
866 be treated as a literal */
867 return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
868 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
869 return REG_BADBR;
870 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
871
872 /* Parse comma and second number (maximum repetition count). */
873 max = min;
874 if (r < ctx->re_end && *r == CHAR_COMMA)
875 {
876 r++;
877 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
878 max = tre_parse_int(&r, ctx->re_end);
879 }
880
881 /* Check that the repeat counts are sane. */
882 if ((max > 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
883 return REG_BADBR;
884
885 /*{*//* Missing }. */
886 if (r >= ctx->re_end)
887 return REG_EBRACE;
888
889 /* Empty contents of {}. */
890 if (r == ctx->re)
891 return REG_BADBR;
892
893 /* Parse the ending '}' or '\}'.*/
894 if (ctx->cflags & REG_EXTENDED)
895 {
896 if (r >= ctx->re_end || *r != CHAR_RBRACE)
897 return REG_BADBR;
898 r++;
899 /* Parse trailing '?' marking minimal repetition. */
900 if (r < ctx->re_end)
901 {
902 if (*r == CHAR_QUESTIONMARK)
903 {
904 /* Process the question mark only in enhanced mode.
905 Otherwise, the question mark is an error in ERE
906 or a literal in BRE */
907 if (ctx->cflags & REG_ENHANCED)
908 {
909 minimal = !(ctx->cflags & REG_UNGREEDY);
910 r++;
911 }
912 else return REG_BADRPT;
913 }
914 else if (*r == CHAR_STAR || *r == CHAR_PLUS)
915 {
916 /* These are reserved for future extensions. */
917 return REG_BADRPT;
918 }
919 }
920 }
921 else
922 {
923 if (r + 1 >= ctx->re_end
924 || *r != CHAR_BACKSLASH
925 || *(r + 1) != CHAR_RBRACE)
926 return REG_BADBR;
927 r += 2;
928 if (r < ctx->re_end && *r == CHAR_STAR)
929 {
930 /* This is reserved for future extensions. */
931 return REG_BADRPT;
932 }
933 }
934
935 if (minimal)
936 ctx->num_reorder_tags++;
937
938 if (!result) goto parse_bound_exit;
939 /* Create the AST node(s). */
940 /* Originally, if min == 0 && max == 0, we immediately replace the whole
941 iteration with EMPTY. This unfortunately drops any submatches, and
942 messes up setting the pmatch values (we can get tags of -1, and
943 tag values in the billions). So we leave it and process this case as
944 usual, and wait until tre_expand_ast() to replace with EMPTY */
945
946 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
947 if (!*result)
948 return REG_ESPACE;
949
950 parse_bound_exit:
951 DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
952
953 ctx->re = r;
954 return REG_OK;
955 }
956
957 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
958 non-self-contained options, like (?i), this causes ((?i)fu)bar to be
959 treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
960 Because we now set up tags for even non-capturing parenthesized
961 subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we
962 pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
963 have it restore cflags after the subexpression, we don't need to have
964 a separate PARSE_RESTORE_CFLAGS, and then after processing the
965 non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
966 This has the side-benefit of now matching the perl behavior: the RE
967 foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
968 of foo AND (?i) (bar OR zap). */
969 typedef enum {
970 PARSE_RE = 0,
971 PARSE_ATOM,
972 PARSE_MARK_FOR_SUBMATCH,
973 PARSE_BRANCH,
974 PARSE_PIECE,
975 PARSE_CATENATION,
976 PARSE_POST_CATENATION,
977 PARSE_UNION,
978 PARSE_POST_UNION,
979 PARSE_POSTFIX,
980 } tre_parse_re_stack_symbol_t;
981
982 reg_errcode_t
983 tre_parse(tre_parse_ctx_t *ctx)
984 {
985 tre_ast_node_t *result = NULL;
986 tre_parse_re_stack_symbol_t symbol;
987 reg_errcode_t status = REG_OK;
988 tre_stack_t *stack = ctx->stack;
989 int bottom = tre_stack_num_objects(stack);
990 int depth = 0;
991 int temporary_cflags = 0;
992 int bre_branch_begin;
993 #ifdef TRE_DEBUG
994 const tre_char_t *tmp_re;
995 #endif
996
997 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
998 ctx->len, ctx->re, ctx->len, ctx->cflags));
999
1000 if (ctx->len <= 0) return REG_EMPTY;
1001 if (!ctx->nofirstsub)
1002 {
1003 STACK_PUSH(stack, int, ctx->cflags);
1004 STACK_PUSH(stack, int, ctx->submatch_id);
1005 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1006 ctx->submatch_id++;
1007 }
1008 STACK_PUSH(stack, int, 0); // bre_branch_begin
1009 STACK_PUSH(stack, int, PARSE_RE);
1010 ctx->re_start = ctx->re;
1011 ctx->re_end = ctx->re + ctx->len;
1012
1013 /* The following is basically just a recursive descent parser. I use
1014 an explicit stack instead of recursive functions mostly because of
1015 two reasons: compatibility with systems which have an overflowable
1016 call stack, and efficiency (both in lines of code and speed). */
1017 while (tre_stack_num_objects(stack) > bottom)
1018 {
1019 symbol = tre_stack_pop_int(stack);
1020 switch (symbol)
1021 {
1022 case PARSE_RE:
1023 /* Parse a full regexp. A regexp is one or more branches,
1024 separated by the union operator `|'. */
1025 bre_branch_begin = tre_stack_pop_int(stack);
1026 if (!(ctx->cflags & REG_LITERAL) &&
1027 ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1028 STACK_PUSHX(stack, int, PARSE_UNION);
1029 STACK_PUSHX(stack, int, bre_branch_begin);
1030 STACK_PUSHX(stack, int, PARSE_BRANCH);
1031 break;
1032
1033 case PARSE_BRANCH:
1034 /* Parse a branch. A branch is one or more pieces, concatenated.
1035 A piece is an atom possibly followed by a postfix operator. */
1036 bre_branch_begin = tre_stack_pop_int(stack);
1037 STACK_PUSHX(stack, int, PARSE_CATENATION);
1038 STACK_PUSHX(stack, int, bre_branch_begin);
1039 STACK_PUSHX(stack, int, PARSE_PIECE);
1040 break;
1041
1042 case PARSE_PIECE:
1043 /* Parse a piece. A piece is an atom possibly followed by one
1044 or more postfix operators. */
1045 bre_branch_begin = tre_stack_pop_int(stack);
1046 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1047 STACK_PUSHX(stack, int, bre_branch_begin);
1048 STACK_PUSHX(stack, int, PARSE_ATOM);
1049 break;
1050
1051 case PARSE_CATENATION:
1052 /* If the expression has not ended, parse another piece. */
1053 {
1054 tre_char_t c;
1055 if (ctx->re >= ctx->re_end)
1056 break;
1057 c = *ctx->re;
1058 if (!(ctx->cflags & REG_LITERAL))
1059 {
1060 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1061 ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1062 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1063 *(ctx->re + 1) == CHAR_PIPE))
1064 break;
1065 if ((ctx->cflags & REG_EXTENDED
1066 && c == CHAR_RPAREN && depth > 0)
1067 || (!(ctx->cflags & REG_EXTENDED)
1068 && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1069 && *(ctx->re + 1) == CHAR_RPAREN))
1070 {
1071 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1072 return REG_EPAREN;
1073 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1074 REST(ctx->re)));
1075 depth--;
1076 if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1077 ctx->re += 2;
1078 break;
1079 }
1080 }
1081
1082 #ifdef REG_LEFT_ASSOC
1083 if (ctx->cflags & REG_LEFT_ASSOC)
1084 {
1085 /* Left associative concatenation. */
1086 STACK_PUSHX(stack, int, PARSE_CATENATION);
1087 STACK_PUSHX(stack, voidptr, result);
1088 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1089 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1090 STACK_PUSHX(stack, int, PARSE_PIECE);
1091 }
1092 else
1093 #endif /* REG_LEFT_ASSOC */
1094 {
1095 /* Default case, right associative concatenation. */
1096 STACK_PUSHX(stack, voidptr, result);
1097 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1098 STACK_PUSHX(stack, int, PARSE_CATENATION);
1099 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1100 STACK_PUSHX(stack, int, PARSE_PIECE);
1101 }
1102 break;
1103 }
1104
1105 case PARSE_POST_CATENATION:
1106 {
1107 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1108 tre_ast_node_t *tmp_node;
1109 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1110 if (!tmp_node)
1111 return REG_ESPACE;
1112 result = tmp_node;
1113 break;
1114 }
1115
1116 case PARSE_UNION:
1117 if (ctx->re >= ctx->re_end)
1118 break;
1119 if (ctx->cflags & REG_LITERAL)
1120 break;
1121 if (!(ctx->cflags & REG_EXTENDED))
1122 {
1123 if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1124 break;
1125 ctx->re++;
1126 }
1127 switch (*ctx->re)
1128 {
1129 case CHAR_PIPE:
1130 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1131 REST(ctx->re)));
1132 STACK_PUSHX(stack, int, PARSE_UNION);
1133 STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1134 STACK_PUSHX(stack, voidptr, result);
1135 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1136 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1137 indicate if this is the beginning of a BRE extended branch. */
1138 STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1139 STACK_PUSHX(stack, int, PARSE_BRANCH);
1140 ctx->re++;
1141 break;
1142
1143 case CHAR_RPAREN:
1144 ctx->re++;
1145 break;
1146
1147 default:
1148 if (!(ctx->cflags & REG_EXTENDED))
1149 ctx->re--;
1150 break;
1151 }
1152 break;
1153
1154 case PARSE_POST_UNION:
1155 {
1156 tre_ast_node_t *tmp_node;
1157 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1158 const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1159 /* error on empty expression at end of union */
1160 if (pipechar == ctx->re - 1)
1161 {
1162 return REG_EMPTY;
1163 }
1164 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1165 if (!tmp_node)
1166 return REG_ESPACE;
1167 result = tmp_node;
1168 break;
1169 }
1170
1171 case PARSE_POSTFIX:
1172 /* Parse postfix operators. */
1173 if (ctx->re >= ctx->re_end)
1174 break;
1175 if (ctx->cflags & REG_LITERAL)
1176 break;
1177 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1178 int rep_min = 0;
1179 int rep_max = -1;
1180 #ifdef TRE_DEBUG
1181 int lbrace_off;
1182 #endif
1183 switch (*ctx->re)
1184 {
1185 case CHAR_PLUS:
1186 case CHAR_QUESTIONMARK:
1187 if (!(ctx->cflags & REG_EXTENDED))
1188 break;
1189 /*FALLTHROUGH*/
1190 case CHAR_STAR:
1191 {
1192 tre_ast_node_t *tmp_node;
1193 #ifdef TRE_DEBUG
1194 const char *tstr = "star";
1195 tmp_re = ctx->re;
1196 #endif
1197
1198 handle_plus_or_question:
1199 /* error on iteration of raw assertion (not in subexpression) */
1200 if (result->type == LITERAL && result->submatch_id < 0 &&
1201 IS_ASSERTION((tre_literal_t *)result->obj))
1202 {
1203 if (!(ctx->cflags & REG_EXTENDED)) break;
1204 return REG_BADRPT;
1205 }
1206 if (*ctx->re == CHAR_PLUS)
1207 {
1208 rep_min = 1;
1209 #ifdef TRE_DEBUG
1210 tstr = "plus";
1211 #endif
1212 }
1213 if (*ctx->re == CHAR_QUESTIONMARK)
1214 {
1215 rep_max = 1;
1216 #ifdef TRE_DEBUG
1217 tstr = "questionmark";
1218 #endif
1219 }
1220
1221 if (ctx->cflags & REG_EXTENDED)
1222 {
1223 if (ctx->re + 1 < ctx->re_end)
1224 {
1225 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1226 {
1227 /* Process the question mark only in enhanced mode.
1228 Otherwise, the question mark is an error in ERE */
1229 if (ctx->cflags & REG_ENHANCED)
1230 {
1231 minimal = !(ctx->cflags & REG_UNGREEDY);
1232 ctx->re++;
1233 }
1234 else return REG_BADRPT;
1235 }
1236 else if (*(ctx->re + 1) == CHAR_STAR
1237 || *(ctx->re + 1) == CHAR_PLUS)
1238 {
1239 /* These are reserved for future extensions. */
1240 return REG_BADRPT;
1241 }
1242 }
1243 }
1244 else
1245 {
1246 if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1247 {
1248 /* This is reserved for future extensions. */
1249 return REG_BADRPT;
1250 }
1251 if (ctx->re + 2 < ctx->re_end)
1252 {
1253 if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_QUESTIONMARK)
1254 {
1255 /* Process the question mark only in enhanced mode.
1256 Otherwise, the question mark is a literal in BRE */
1257 if (ctx->cflags & REG_ENHANCED)
1258 {
1259 minimal = !(ctx->cflags & REG_UNGREEDY);
1260 ctx->re += 2;
1261 }
1262 }
1263 else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1264 {
1265 /* This is reserved for future extensions. */
1266 return REG_BADRPT;
1267 }
1268 }
1269 }
1270
1271 if (minimal)
1272 ctx->num_reorder_tags++;
1273
1274 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1275 minimal ? " minimal" : "greedy", tstr, REST(tmp_re)));
1276 if (result == NULL)
1277 {
1278 if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1279 else goto parse_literal;
1280 }
1281 ctx->re++;
1282 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1283 minimal);
1284 if (tmp_node == NULL)
1285 return REG_ESPACE;
1286 result = tmp_node;
1287
1288 /* Set the iterator with a submatch id in the invisible range
1289 * (which will be overridden if a real submatch is needed) */
1290 result->submatch_id = ctx->submatch_id_invisible++;
1291
1292 #if 0
1293 /* We don't allow multiple postfixes, but this might be needed
1294 to support approximate matching */
1295 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1296 #endif
1297 }
1298 break;
1299
1300 case CHAR_BACKSLASH:
1301 /* "\{" is special without REG_EXTENDED */
1302 /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1303 if (!(ctx->cflags & REG_EXTENDED)
1304 && ctx->re + 1 < ctx->re_end)
1305 {
1306 switch (*(ctx->re + 1))
1307 {
1308 case CHAR_LBRACE:
1309 ctx->re++;
1310 #ifdef TRE_DEBUG
1311 lbrace_off = 2;
1312 #endif
1313 goto parse_brace;
1314 case CHAR_PLUS:
1315 case CHAR_QUESTIONMARK:
1316 if (ctx->cflags & REG_ENHANCED)
1317 {
1318 #ifdef TRE_DEBUG
1319 tmp_re = ctx->re;
1320 #endif
1321 ctx->re++;
1322 goto handle_plus_or_question;
1323 }
1324 break;
1325 }
1326 break;
1327 }
1328 else
1329 break;
1330
1331 case CHAR_LBRACE:
1332 {
1333 int raw_assertion;
1334
1335 /* "{" is literal without REG_EXTENDED */
1336 if (!(ctx->cflags & REG_EXTENDED))
1337 break;
1338 #ifdef TRE_DEBUG
1339 lbrace_off = 1;
1340 #endif
1341
1342 parse_brace:
1343 /* error on iteration of raw assertion (not in subexpression),
1344 but wait until after parsing bounds */
1345 raw_assertion = (result->type == LITERAL
1346 && result->submatch_id < 0
1347 && IS_ASSERTION((tre_literal_t *)result->obj));
1348 ctx->re++;
1349
1350 status = tre_parse_bound(ctx, &result);
1351 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1352 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1353 is to be treated as a literal. */
1354 if (status == REG_NOMATCH)
1355 {
1356 ctx->re--;
1357 break;
1358 }
1359 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1360 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1361 REST(ctx->re - lbrace_off)));
1362 if (status != REG_OK)
1363 return status;
1364 if (raw_assertion) return REG_BADRPT;
1365
1366 /* Set the iterator with a submatch id in the invisible range
1367 * (which will be overridden if a real submatch is needed) */
1368 if (result->type == ITERATION)
1369 result->submatch_id = ctx->submatch_id_invisible++;
1370
1371 #if 0
1372 /* We don't allow multiple postfixes, but this might be needed
1373 to support approximate matching */
1374 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1375 #endif
1376 break;
1377 }
1378 }
1379 break;
1380
1381 case PARSE_ATOM:
1382 {
1383 /* Parse an atom. An atom is a regular expression enclosed in `()',
1384 an empty set of `()', a bracket expression, `.', `^', `$',
1385 a `\' followed by a character, or a single character. */
1386
1387 /* The stack contains a boolean value, whether PARSE_ATOM is
1388 being called just after the start of a group (left paren)
1389 in a BRE */
1390 bre_branch_begin = tre_stack_pop_int(stack);
1391
1392 /* End of regexp? (empty string). */
1393 if (ctx->re >= ctx->re_end)
1394 goto parse_literal;
1395
1396 if (ctx->cflags & REG_LITERAL)
1397 goto parse_literal;
1398
1399 switch (*ctx->re)
1400 {
1401 case CHAR_LPAREN: /* parenthesized subexpression */
1402
1403 /* Handle "(?...)" extensions. They work in a way similar
1404 to Perls corresponding extensions. */
1405 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1406 (REG_EXTENDED|REG_ENHANCED)
1407 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1408 {
1409 int new_cflags = ctx->cflags;
1410 int bit = 1;
1411 int invisible_submatch = 0;
1412 DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1413 REST(ctx->re)));
1414 ctx->re += 2;
1415 while (/*CONSTCOND*/1)
1416 {
1417 if (*ctx->re == L'i')
1418 {
1419 DPRINT(("tre_parse: icase: '%.*" STRF "'\n",
1420 REST(ctx->re)));
1421 if (bit)
1422 new_cflags |= REG_ICASE;
1423 else
1424 new_cflags &= ~REG_ICASE;
1425 ctx->re++;
1426 }
1427 else if (*ctx->re == L'n')
1428 {
1429 DPRINT(("tre_parse: newline: '%.*" STRF "'\n",
1430 REST(ctx->re)));
1431 if (bit)
1432 new_cflags |= REG_NEWLINE;
1433 else
1434 new_cflags &= ~REG_NEWLINE;
1435 ctx->re++;
1436 }
1437 #ifdef REG_LEFT_ASSOC
1438 else if (*ctx->re == L'l')
1439 {
1440 DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1441 REST(ctx->re)));
1442 if (bit)
1443 new_cflags |= REG_LEFT_ASSOC;
1444 else
1445 new_cflags &= ~REG_LEFT_ASSOC;
1446 ctx->re++;
1447 }
1448 #endif /* REG_LEFT_ASSOC */
1449 #ifdef REG_UNGREEDY
1450 else if (*ctx->re == L'U')
1451 {
1452 DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n",
1453 REST(ctx->re)));
1454 if (bit)
1455 new_cflags |= REG_UNGREEDY;
1456 else
1457 new_cflags &= ~REG_UNGREEDY;
1458 ctx->re++;
1459 }
1460 #endif /* REG_UNGREEDY */
1461 else if (*ctx->re == CHAR_MINUS)
1462 {
1463 DPRINT(("tre_parse: turn off: '%.*" STRF "'\n",
1464 REST(ctx->re)));
1465 ctx->re++;
1466 bit = 0;
1467 }
1468 else if (*ctx->re == CHAR_COLON)
1469 {
1470 DPRINT(("tre_parse: no group: '%.*" STRF
1471 "', (invisible submatch %d)\n",
1472 REST(ctx->re), ctx->submatch_id_invisible));
1473 ctx->re++;
1474 depth++;
1475 invisible_submatch = 1;
1476 break;
1477 }
1478 else if (*ctx->re == CHAR_HASH)
1479 {
1480 DPRINT(("tre_parse: comment: '%.*" STRF "'\n",
1481 REST(ctx->re)));
1482 /* A comment can contain any character except a
1483 right parenthesis */
1484 while (*ctx->re != CHAR_RPAREN
1485 && ctx->re < ctx->re_end)
1486 ctx->re++;
1487 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1488 {
1489 ctx->re++;
1490 break;
1491 }
1492 else
1493 return REG_BADPAT;
1494 }
1495 else if (*ctx->re == CHAR_RPAREN)
1496 {
1497 ctx->re++;
1498 break;
1499 }
1500 else
1501 return REG_BADRPT;
1502 }
1503
1504 /* Turn on the cflags changes for the rest of the
1505 enclosing group. */
1506 if (invisible_submatch)
1507 {
1508 STACK_PUSHX(stack, int, ctx->cflags);
1509 STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1510 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1511 ctx->submatch_id_invisible++;
1512 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1513 STACK_PUSHX(stack, int, PARSE_RE);
1514 }
1515 else {
1516 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1517 STACK_PUSHX(stack, int, PARSE_ATOM);
1518 }
1519 ctx->cflags = new_cflags;
1520 break;
1521 }
1522
1523 if (ctx->cflags & REG_EXTENDED)
1524 {
1525 parse_bre_lparen:
1526 DPRINT(("tre_parse: group begin: '%.*" STRF
1527 "', submatch %d\n", REST(ctx->re),
1528 ctx->submatch_id));
1529 ctx->re++;
1530 /* First parse a whole RE, then mark the resulting tree
1531 for submatching. */
1532 STACK_PUSHX(stack, int, ctx->cflags);
1533 STACK_PUSHX(stack, int, ctx->submatch_id);
1534 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1535 /* We need to pass a boolean (eventually) to PARSE_ATOM to
1536 indicate if this is the beginning of a BRE group. */
1537 STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1538 STACK_PUSHX(stack, int, PARSE_RE);
1539 ctx->submatch_id++;
1540 depth++;
1541 }
1542 else
1543 goto parse_literal;
1544 break;
1545
1546 case CHAR_RPAREN: /* end of current subexpression */
1547 if (ctx->cflags & REG_EXTENDED && depth > 0)
1548 {
1549 parse_bre_rparen_empty:
1550 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1551 return REG_EPAREN;
1552 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1553 REST(ctx->re)));
1554 /* We were expecting an atom, but instead the current
1555 subexpression was closed. POSIX leaves the meaning of
1556 this to be implementation-defined. We interpret this as
1557 an empty expression (which matches an empty string). */
1558 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1559 if (result == NULL)
1560 return REG_ESPACE;
1561 if (!(ctx->cflags & REG_EXTENDED))
1562 ctx->re--;
1563 }
1564 else
1565 goto parse_literal;
1566 break;
1567
1568 case CHAR_LBRACKET: /* bracket expression */
1569 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1570 REST(ctx->re)));
1571 ctx->re++;
1572 status = tre_parse_bracket(ctx, &result);
1573 if (status != REG_OK)
1574 return status;
1575 break;
1576
1577 case CHAR_BACKSLASH:
1578 /* Deal with "\(", "\)" or "\{" for BREs */
1579 if (!(ctx->cflags & REG_EXTENDED)
1580 && ctx->re + 1 < ctx->re_end)
1581 {
1582 if (*(ctx->re + 1) == CHAR_LPAREN)
1583 {
1584 ctx->re++;
1585 goto parse_bre_lparen;
1586 }
1587 else if (*(ctx->re + 1) == CHAR_RPAREN)
1588 {
1589 ctx->re++;
1590 goto parse_bre_rparen_empty;
1591 }
1592 if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1593 }
1594
1595 if (ctx->re + 1 >= ctx->re_end)
1596 /* Trailing backslash. */
1597 return REG_EESCAPE;
1598
1599 /* XXX illumos: want macro expansion enabled by default (same as TRE) */
1600 #if 0
1601 if (!(ctx->cflags & REG_ENHANCED))
1602 {
1603 DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1604 ctx->re++;
1605 goto unenhanced_backslash;
1606 }
1607 #endif
1608
1609 /* If a macro is used, parse the expanded macro recursively. */
1610 {
1611 tre_char_t buf[64];
1612 tre_expand_macro(ctx->re + 1, ctx->re_end,
1613 buf, elementsof(buf));
1614 if (buf[0] != 0)
1615 {
1616 tre_parse_ctx_t subctx;
1617 memcpy(&subctx, ctx, sizeof(subctx));
1618 subctx.re = buf;
1619 subctx.len = tre_strlen(buf);
1620 subctx.nofirstsub = 1;
1621 status = tre_parse(&subctx);
1622 if (status != REG_OK)
1623 return status;
1624 ctx->re += 2;
1625 ctx->position = subctx.position;
1626 result = subctx.result;
1627 break;
1628 }
1629 }
1630
1631 if (*(ctx->re + 1) == L'Q')
1632 {
1633 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1634 REST(ctx->re)));
1635 ctx->cflags |= REG_LITERAL;
1636 temporary_cflags |= REG_LITERAL;
1637 ctx->re += 2;
1638 STACK_PUSHX(stack, int, 0);
1639 STACK_PUSHX(stack, int, PARSE_ATOM);
1640 break;
1641 }
1642
1643 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1644 ctx->re++;
1645 switch (*ctx->re)
1646 {
1647 case L'b':
1648 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1649 ASSERT_AT_WB, -1);
1650 ctx->re++;
1651 break;
1652 case L'B':
1653 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1654 ASSERT_AT_WB_NEG, -1);
1655 ctx->re++;
1656 break;
1657 case L'<':
1658 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1659 ASSERT_AT_BOW, -1);
1660 ctx->re++;
1661 break;
1662 case L'>':
1663 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1664 ASSERT_AT_EOW, -1);
1665 ctx->re++;
1666 break;
1667 case L'x':
1668 ctx->re++;
1669 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1670 {
1671 /* 8 bit hex char. */
1672 char tmp[3] = {0, 0, 0};
1673 long val;
1674 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1675 REST(ctx->re - 2)));
1676
1677 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1678 ctx->re < ctx->re_end)
1679 {
1680 tmp[0] = (char)ctx->re[0];
1681 ctx->re++;
1682 }
1683 if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1684 ctx->re < ctx->re_end)
1685 {
1686 tmp[1] = (char)ctx->re[0];
1687 ctx->re++;
1688 }
1689 val = strtol(tmp, NULL, 16);
1690 result = tre_ast_new_literal(ctx->mem, (int)val,
1691 (int)val, ctx->position);
1692 ctx->position++;
1693 break;
1694 }
1695 else if (ctx->re < ctx->re_end)
1696 {
1697 /* Wide char. */
1698 char tmp[32];
1699 long val;
1700 int i = 0;
1701 ctx->re++;
1702 while (ctx->re_end - ctx->re >= 0)
1703 {
1704 if (ctx->re[0] == CHAR_RBRACE)
1705 break;
1706 if (tre_isxdigit_l(ctx->re[0], ctx->loc))
1707 {
1708 tmp[i] = (char)ctx->re[0];
1709 i++;
1710 ctx->re++;
1711 continue;
1712 }
1713 return REG_EBRACE;
1714 }
1715 ctx->re++;
1716 tmp[i] = 0;
1717 val = strtol(tmp, NULL, 16);
1718 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1719 ctx->position);
1720 ctx->position++;
1721 break;
1722 }
1723 /*FALLTHROUGH*/
1724
1725 default:
1726 unenhanced_backslash:
1727 if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
1728 REG_EXTENDED &&
1729 tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
1730 {
1731 /* Back reference (only in BRE or enhanced). */
1732 int val = *ctx->re - L'0';
1733 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
1734 REST(ctx->re - 1)));
1735 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1736 ctx->position);
1737 if (result == NULL)
1738 return REG_ESPACE;
1739
1740 /* Set the backref with a submatch id in the invisible
1741 * range (which will be overridden if a real submatch
1742 * is needed) */
1743 result->submatch_id = ctx->submatch_id_invisible++;
1744
1745 ctx->position++;
1746 ctx->num_reorder_tags++;
1747 ctx->max_backref = MAX(val, ctx->max_backref);
1748 ctx->re++;
1749 }
1750 else
1751 {
1752 /* Escaped character. */
1753 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
1754 REST(ctx->re - 1)));
1755 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1756 ctx->position);
1757 ctx->position++;
1758 ctx->re++;
1759 }
1760 break;
1761 }
1762 if (result == NULL)
1763 return REG_ESPACE;
1764 break;
1765
1766 case CHAR_PERIOD: /* the any-symbol */
1767 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
1768 REST(ctx->re)));
1769 if (ctx->cflags & REG_NEWLINE)
1770 {
1771 tre_ast_node_t *tmp1;
1772 tre_ast_node_t *tmp2;
1773 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1774 ctx->position);
1775 if (!tmp1)
1776 return REG_ESPACE;
1777 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1778 ctx->position + 1);
1779 if (!tmp2)
1780 return REG_ESPACE;
1781 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1782 if (!result)
1783 return REG_ESPACE;
1784 ctx->position += 2;
1785 }
1786 else
1787 {
1788 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1789 ctx->position);
1790 if (!result)
1791 return REG_ESPACE;
1792 ctx->position++;
1793 }
1794 ctx->re++;
1795 break;
1796
1797 case CHAR_CARET: /* beginning of line assertion */
1798 /* '^' has a special meaning everywhere in EREs, at the
1799 beginning of the RE and after \( is BREs. It is also
1800 special in enhanced BREs at the beginning of each branches
1801 of a union */
1802 if (ctx->cflags & REG_EXTENDED
1803 || bre_branch_begin
1804 || ctx->re == ctx->re_start)
1805 {
1806 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
1807 REST(ctx->re)));
1808 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1809 ASSERT_AT_BOL, -1);
1810 if (result == NULL)
1811 return REG_ESPACE;
1812 ctx->re++;
1813 }
1814 else
1815 goto parse_literal;
1816 break;
1817
1818 case CHAR_DOLLAR: /* end of line assertion. */
1819 /* '$' is special everywhere in EREs, and in the end of the
1820 string and before \) is BREs. */
1821 if (ctx->cflags & REG_EXTENDED
1822 || (ctx->re + 2 < ctx->re_end
1823 && *(ctx->re + 1) == CHAR_BACKSLASH
1824 && *(ctx->re + 2) == CHAR_RPAREN)
1825 || ctx->re + 1 == ctx->re_end)
1826 {
1827 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
1828 REST(ctx->re)));
1829 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1830 ASSERT_AT_EOL, -1);
1831 if (result == NULL)
1832 return REG_ESPACE;
1833 ctx->re++;
1834 }
1835 else
1836 goto parse_literal;
1837 break;
1838
1839 default:
1840 parse_literal:
1841
1842 if (temporary_cflags && ctx->re + 1 < ctx->re_end
1843 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1844 {
1845 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
1846 REST(ctx->re)));
1847 ctx->cflags &= ~temporary_cflags;
1848 temporary_cflags = 0;
1849 ctx->re += 2;
1850 if (ctx->re < ctx->re_end)
1851 {
1852 STACK_PUSHX(stack, int, 0);
1853 STACK_PUSHX(stack, int, PARSE_ATOM);
1854 }
1855 else
1856 {
1857 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1858 if (!result) return REG_ESPACE;
1859 }
1860 break;
1861 }
1862
1863 /* We are expecting an atom. If the subexpression (or the whole
1864 regexp ends here, we interpret it as an empty expression
1865 (which matches an empty string), which is an error.
1866 Iterations of an empty expression is also an error. */
1867 if (!(ctx->cflags & REG_LITERAL))
1868 {
1869 /* error on end of string */
1870 if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
1871 : REG_EMPTY;
1872 /* error on unions and iterations of empty expressions */
1873 if (ctx->cflags & REG_EXTENDED)
1874 {
1875 if (ctx->re < ctx->re_end)
1876 {
1877 if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
1878 if (*ctx->re == CHAR_LBRACE)
1879 {
1880 ctx->re++;
1881 empty_parse_bound:
1882 /* We need to parse the bound first and return
1883 any error, before returning REG_BADRPT */
1884 status = tre_parse_bound(ctx, NULL);
1885 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1886 /* For ERE, if REG_NOMATCH is returned, we
1887 treat the lbrace as a literal. */
1888 if (status == REG_NOMATCH)
1889 {
1890 ctx->re--;
1891 /* Drop down to literal-handling code */
1892 }
1893 else
1894 {
1895 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1896 if (status != REG_OK)
1897 return status;
1898 return REG_BADRPT;
1899 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1900 }
1901 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1902 }
1903 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1904 else
1905 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1906 if (*ctx->re == CHAR_STAR
1907 || *ctx->re == CHAR_PLUS
1908 || *ctx->re == CHAR_QUESTIONMARK)
1909 {
1910 return REG_BADRPT;
1911 }
1912 }
1913 }
1914 else if (ctx->re + 1 < ctx->re_end
1915 && *ctx->re == CHAR_BACKSLASH
1916 && *(ctx->re + 1) == CHAR_LBRACE)
1917 {
1918 ctx->re += 2;
1919 goto empty_parse_bound;
1920 }
1921 }
1922
1923 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
1924 REST(ctx->re)));
1925 /* Note that we can't use an tre_isalpha() test here, since there
1926 may be characters which are alphabetic but neither upper or
1927 lower case. */
1928 if (ctx->cflags & REG_ICASE
1929 && (tre_isupper_l(*ctx->re, ctx->loc) ||
1930 tre_islower_l(*ctx->re, ctx->loc)))
1931 {
1932 tre_ast_node_t *tmp1;
1933 tre_ast_node_t *tmp2;
1934
1935 /* XXX - Can there be more than one opposite-case
1936 counterpoints for some character in some locale? Or
1937 more than two characters which all should be regarded
1938 the same character if case is ignored? If yes, there
1939 does not seem to be a portable way to detect it. I guess
1940 that at least for multi-character collating elements there
1941 could be several opposite-case counterpoints, but they
1942 cannot be supported portably anyway. */
1943 tmp1 = tre_ast_new_literal(ctx->mem,
1944 tre_toupper_l(*ctx->re, ctx->loc),
1945 tre_toupper_l(*ctx->re, ctx->loc),
1946 ctx->position);
1947 if (!tmp1)
1948 return REG_ESPACE;
1949 tmp2 = tre_ast_new_literal(ctx->mem,
1950 tre_tolower_l(*ctx->re, ctx->loc),
1951 tre_tolower_l(*ctx->re, ctx->loc),
1952 ctx->position);
1953 if (!tmp2)
1954 return REG_ESPACE;
1955 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1956 if (!result)
1957 return REG_ESPACE;
1958 }
1959 else
1960 {
1961 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1962 ctx->position);
1963 if (!result)
1964 return REG_ESPACE;
1965 }
1966 ctx->position++;
1967 ctx->re++;
1968 break;
1969 }
1970 break;
1971 }
1972
1973 case PARSE_MARK_FOR_SUBMATCH:
1974 {
1975 int submatch_id = tre_stack_pop_int(stack);
1976
1977 ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
1978 if (result->submatch_id >= 0 &&
1979 result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
1980 {
1981 tre_ast_node_t *n, *tmp_node;
1982 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
1983 break;
1984 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1985 if (n == NULL)
1986 return REG_ESPACE;
1987 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1988 if (tmp_node == NULL)
1989 return REG_ESPACE;
1990 tmp_node->num_submatches = result->num_submatches;
1991 result = tmp_node;
1992 }
1993 result->submatch_id = submatch_id;
1994 if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
1995 result->num_submatches++;
1996 break;
1997 }
1998
1999 default:
2000 assert(0);
2001 break;
2002 }
2003 }
2004
2005 /* Check for missing closing parentheses. */
2006 if (depth > 0)
2007 return REG_EPAREN;
2008
2009 ctx->result = result;
2010
2011 return REG_OK;
2012 }