1 /*
   2  * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  20  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28 
  29 /*
  30   tre-parse.c - Regexp parser
  31 */
  32 
  33 /*
  34   This parser is just a simple recursive descent parser for POSIX.2
  35   regexps.  The parser supports both the obsolete default syntax and
  36   the "extended" syntax, and some nonstandard extensions.
  37 */
  38 
  39 #include <assert.h>
  40 #include <errno.h>
  41 #include <limits.h>
  42 #include <stddef.h>
  43 #include <string.h>
  44 
  45 #include "malloc.h"
  46 #include "tre-mem.h"
  47 #include "tre-ast.h"
  48 #include "tre-stack.h"
  49 #include "tre-parse.h"
  50 
  51 #include "../locale/collate.h"
  52 
  53 /* BSD compatibility:
  54      Before looking up a collating symbol, check if the name matches in
  55      the character names (cnames) array; if so, use the corresponding
  56      character.
  57 
  58      Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
  59      the implementation choice that for ERE, a non-numeric character following
  60      a left brace that would normally be a bound, causes the left brace to be
  61      literal. */
  62 #define BSD_COMPATIBILITY
  63 #ifdef BSD_COMPATIBILITY
  64 #include "cname.h"
  65 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
  66 #endif /* BSD_COMPATIBILITY */
  67 
  68 /* Characters with special meanings in regexp syntax. */
  69 #define CHAR_PIPE          L'|'
  70 #define CHAR_LPAREN        L'('
  71 #define CHAR_RPAREN        L')'
  72 #define CHAR_LBRACE        L'{'
  73 #define CHAR_RBRACE        L'}'
  74 #define CHAR_LBRACKET      L'['
  75 #define CHAR_RBRACKET      L']'
  76 #define CHAR_MINUS         L'-'
  77 #define CHAR_STAR          L'*'
  78 #define CHAR_QUESTIONMARK  L'?'
  79 #define CHAR_PLUS          L'+'
  80 #define CHAR_PERIOD        L'.'
  81 #define CHAR_COLON         L':'
  82 #define CHAR_EQUAL         L'='
  83 #define CHAR_COMMA         L','
  84 #define CHAR_CARET         L'^'
  85 #define CHAR_DOLLAR        L'$'
  86 #define CHAR_BACKSLASH     L'\\'
  87 #define CHAR_HASH          L'#'
  88 #define CHAR_TILDE         L'~'
  89 
  90 /* Some macros for expanding \w, \s, etc. */
  91 static const struct tre_macro_struct {
  92   const char c;
  93   const char *expansion;
  94 } tre_macros[] =
  95   { {'t', "\t"},           {'n', "\n"},            {'r', "\r"},
  96     {'f', "\f"},           {'a', "\a"},            {'e', "\033"},
  97     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
  98     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
  99     { 0, NULL }
 100   };
 101 
 102 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
 103    must have at least `len' items.  Sets buf[0] to zero if the there
 104    is no match in `tre_macros'. */
 105 static void
 106 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
 107                  tre_char_t *buf, size_t buf_len)
 108 {
 109   int i;
 110 
 111   buf[0] = 0;
 112   if (regex >= regex_end)
 113     return;
 114 
 115   for (i = 0; tre_macros[i].expansion; i++)
 116     {
 117       if (tre_macros[i].c == *regex)
 118         {
 119           unsigned int j;
 120           DPRINT(("Expanding macro '%c' => '%s'\n",
 121                   tre_macros[i].c, tre_macros[i].expansion));
 122           for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
 123             buf[j] = tre_macros[i].expansion[j];
 124           buf[j] = 0;
 125           break;
 126         }
 127     }
 128 }
 129 
 130 static reg_errcode_t
 131 tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
 132          tre_bracket_match_list_t **items)
 133 {
 134   reg_errcode_t status = REG_OK;
 135   tre_bracket_match_list_t *array = *items;
 136   int i = array->num_bracket_matches;
 137   /* Allocate more space if necessary. */
 138   if (i >= *max_i)
 139     {
 140       tre_bracket_match_list_t *new_items;
 141       DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
 142       /* If the array is already 1024 items large, give up -- there's
 143          probably an error in the regexp (e.g. not a '\0' terminated
 144          string and missing ']') */
 145       if (*max_i >= 1024)
 146         return REG_ESPACE;
 147       *max_i *= 2;
 148       new_items = realloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
 149       if (new_items == NULL)
 150         return REG_ESPACE;
 151       *items = array = new_items;
 152     }
 153   array->bracket_matches[i].type = type;
 154   array->bracket_matches[i].value = val;
 155   array->num_bracket_matches++;
 156   return status;
 157 }
 158 
 159 #define REST(re) (int)(ctx->re_end - (re)), (re)
 160 
 161 #define START_COLLATING_SYMBOLS         16
 162 #define MAX_COLLATING_SYMBOL_LEN        4
 163 
 164 typedef struct {
 165   const tre_char_t *start;
 166   int len;
 167 } tre_collating_symbol;
 168 
 169 #ifdef BSD_COMPATIBILITY
 170 static wchar_t
 171 tre_search_cnames(const wchar_t *name, size_t len)
 172 {
 173   size_t low = 0;
 174   size_t high = NCNAMES - 1;
 175   size_t cur;
 176   int cmp;
 177 
 178   while(low <= high)
 179     {
 180       cur = (low + high) / 2;
 181       cmp = wcsncmp(name, cnames[cur].name, len);
 182       if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
 183       if (cmp > 0) low = cur + 1;
 184       else high = cur - 1;
 185     }
 186   return (wchar_t)-1;
 187 }
 188 #endif /* BSD_COMPATIBILITY */
 189 
 190 /* Scan the contents of a bracket expression, and create a
 191  * tre_bracket_match_list_t encoding the bracket expression.  If during
 192  * the scan, multi-character collating symbols are detected, switch
 193  * into a mode to collect those MCCSs into a tre_collating_symbol
 194  * list and pass them back.  tre_parse_bracket will use that to
 195  * create a new string composed of a union of the bracket expression
 196  * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
 197  * call tre_parse (recursive) to parse that new string (which will
 198  * call tre_parse_bracket and tre_parse_bracket_items again. */
 199 static reg_errcode_t
 200 tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
 201                         int *items_size, tre_collating_symbol **result)
 202 {
 203   const tre_char_t *re = ctx->re;
 204   const tre_char_t *re_end = ctx->re_end;
 205   tre_collating_symbol *col_syms = NULL;
 206   tre_collating_symbol *cp = NULL;
 207   int n_col_syms = 0;
 208   reg_errcode_t status;
 209   int max_i = *items_size;
 210   int other = 0;  /* contains content other than multi-character collating
 211                    * symbols */
 212   int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
 213   tre_cint_t min, c;
 214   int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
 215   int collect_MCCS = 0;
 216   const tre_char_t *start;
 217 
 218   for ( ;re < re_end; re++)
 219     {
 220       switch (*re)
 221         {
 222         case CHAR_MINUS:
 223           /* A first hyphen */
 224           if (re == ctx->re)
 225             {
 226               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 227               min = CHAR_MINUS;
 228               other++;
 229               range = 0;
 230               break;
 231             }
 232           /* The hyphen is the end range */
 233           if (range > 0)
 234             {
 235               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 236               c = CHAR_MINUS;
 237               goto process_end_range;
 238             }
 239           if (re + 1 >= re_end)
 240             {
 241               status = REG_EBRACK;
 242               goto error;
 243             }
 244           /* The hyphen is at the end */
 245           if (re[1] == CHAR_RBRACKET)
 246             {
 247               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 248               c = CHAR_MINUS;
 249               goto process_begin_range;
 250             }
 251           /* Two ranges are not allowed to share an endpoint, or begin
 252            * range is illegal. */
 253           if (range < 0)
 254             {
 255               status = REG_ERANGE;
 256               goto error;
 257             }
 258           range = 1; /* Expect end range */
 259           DPRINT(("tre_parse_bracket:   range: '%.*" STRF "'\n", REST(re)));
 260           break;
 261 
 262         case CHAR_LBRACKET:
 263           if (re + 1 >= re_end)
 264             {
 265               status = REG_EBRACK;
 266               goto error;
 267             }
 268           switch (re[1])
 269             {
 270             case CHAR_PERIOD:
 271               {
 272                 re += 2;
 273                 start = re;
 274                 for (;; re++)
 275                   {
 276                     if (re >= re_end)
 277                       {
 278                         status = REG_ECOLLATE;
 279                         goto error;
 280                       }
 281                     if (*re == CHAR_PERIOD)
 282                       {
 283                         if (re + 1 >= re_end)
 284                           {
 285                             status = REG_ECOLLATE;
 286                             goto error;
 287                           }
 288                         /* Found end */
 289                         if (re[1] == CHAR_RBRACKET)
 290                           {
 291                             DPRINT(("tre_parse_bracket:   collating "
 292                                     "symbol: '%.*" STRF "'\n",
 293                                     REST(start - 2)));
 294                             /* Empty name */
 295                             if (re == start)
 296                               {
 297                                 status = REG_ECOLLATE;
 298                                 goto error;
 299                               }
 300 #ifdef BSD_COMPATIBILITY
 301                             /* Check if the name is in cnames; if so, use
 302                                the corresponding code */
 303                             c = tre_search_cnames(start, re - start);
 304                             if (c != (wchar_t)-1)
 305                               {
 306                                 re++;
 307                                 goto process_single_character;
 308                               }
 309 #endif /* BSD_COMPATIBILITY */
 310                             /* Verify this is a known sequence */
 311                             if (__collate_equiv_value(ctx->loc, start,
 312                                                           re - start) <= 0)
 313                               {
 314                                 status = REG_ECOLLATE;
 315                                 goto error;
 316                               }
 317                             /* Process single character collating symbols */
 318                             if (re - start == 1)
 319                               {
 320                                 c = *start;
 321                                 re++;
 322                                 goto process_single_character;
 323                               }
 324                             /* Inverted MCCSs are undefined */
 325                             if (invert)
 326                               {
 327                                 status = REG_ECOLLATE;
 328                                 goto error;
 329                               }
 330                             /* Can't have MCCSs as an endpoint to a range */
 331                             if (range > 0)
 332                               {
 333                                 status = REG_ERANGE;
 334                                 goto error;
 335                               }
 336                             range = -1;
 337                             /* Switch into MCCS collection mode (if not
 338                              * already there */
 339 #if TRE_DEBUG
 340                             if (!collect_MCCS)
 341                               {
 342                                 collect_MCCS = 1;
 343                                 DPRINT(("tre_parse_bracket: Detected MCCS\n"));
 344                               }
 345 #else /* !TRE_DEBUG */
 346                             collect_MCCS = 1;
 347 #endif /* !TRE_DEBUG */
 348                             /* Allocate a memory block the first time */
 349                             if (!cp)
 350                               {
 351                                 if ((col_syms = malloc(sizeof(*col_syms) *
 352                                             (START_COLLATING_SYMBOLS + 2)))
 353                                             == NULL)
 354                                   return REG_ESPACE;
 355                                 cp = col_syms + 1;
 356                                 n_col_syms = START_COLLATING_SYMBOLS;
 357                               }
 358                             /* Enlarge the memory block is more is needed */
 359                             if ((cp - col_syms) - 1 >= n_col_syms)
 360                               {
 361                                 int i = n_col_syms;
 362                                 tre_collating_symbol *tmp =
 363                                     realloc(col_syms, sizeof(*col_syms) *
 364                                              ((n_col_syms *= 2) + 2));
 365                                 if (tmp == NULL)
 366                                   {
 367                                     free(col_syms);
 368                                     return REG_ESPACE;
 369                                   }
 370                                 DPRINT(("tre_list_collating_symbols: "
 371                                         "Enlarging col_syms to %d\n",
 372                                         n_col_syms));
 373                                 col_syms = tmp;
 374                                 cp = col_syms + i + 1;
 375                               }
 376                             cp->start = start;
 377                             cp->len = re - start;
 378                             cp++;
 379                             re++;
 380                             break;
 381                           }
 382                       }
 383                   }
 384                 break;
 385               }
 386 
 387             case CHAR_EQUAL:
 388             case CHAR_COLON:
 389               {
 390                 /* Process equivalence and character classes */
 391                 tre_char_t kind = re[1];
 392 
 393                 /* Can't have a class as an endpoint to a range */
 394                 if (range > 0)
 395                   {
 396                     status = REG_ERANGE;
 397                     goto error;
 398                   }
 399                 if (!collect_MCCS && range == 0)
 400                   {
 401                     status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
 402                                           min, &max_i, items);
 403                     if (status != REG_OK)
 404                       goto error;
 405                   }
 406                 range = -1;
 407                 re += 2;
 408                 start = re;
 409                 for (;; re++)
 410                   {
 411                     if (re >= re_end)
 412                       {
 413                         status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
 414                         goto error;
 415                       }
 416                     if (*re == kind)
 417                       {
 418                         if (re + 1 >= re_end)
 419                           {
 420                             status = kind == CHAR_EQUAL ? REG_ECOLLATE :
 421                                                           REG_ECTYPE;
 422                             goto error;
 423                           }
 424                         /* Found end */
 425                         if (re[1] == CHAR_RBRACKET)
 426                           {
 427                             if (re == start)
 428                               {
 429                                 /* Empty class name */
 430                                 status = kind == CHAR_EQUAL ? REG_ECOLLATE :
 431                                                               REG_ECTYPE;
 432                                 goto error;
 433                               }
 434                             /* Process equivalence class */
 435                             if (kind == CHAR_EQUAL)
 436                               {
 437                                 int equiv;
 438 
 439                                 DPRINT(("tre_parse_bracket:   equivalence: '%.*"
 440                                         STRF "'\n", REST(start - 2)));
 441 
 442                                 /* While we find the collation value even for
 443                                    multi-character collating elements , we
 444                                    don't (yet) match any collation values
 445                                    against multi-character sequences.  We'd have
 446                                    to enumerate those multi-character sequences
 447                                    and like multi-character collating symbols,
 448                                    create a union of those sequences with the
 449                                    rest of the bracket expression.  While
 450                                    doable, a bracket expression matching
 451                                    multiple characters, that doesn't explicitly
 452                                    contain multi-character sequences, might
 453                                    be unexpected, so we punt for now. */
 454                                 if ((equiv = __collate_equiv_value(ctx->loc,
 455                                              start, re - start)) <= 0)
 456                                   {
 457                                     /* The standard says that if no collating
 458                                        element if found, we use the collating
 459                                        symbol itself.  But __collate_equiv_value
 460                                        doesn't make a distinction between
 461                                        an element that is in a equvalence
 462                                        class with others, or is the only member,
 463                                        so we already know there is no collating
 464                                        symbol.  (Note that in the case of a
 465                                        collating element whose collation value
 466                                        is unique, matching against the
 467                                        collating element itself, or against
 468                                        its collation value, is equivalent.) */
 469 #ifdef BSD_COMPATIBILITY
 470                                     /* Check if the name is in cnames; if so,
 471                                        use the corresponding code */
 472                                     c = tre_search_cnames(start, re - start);
 473                                     if (c != (wchar_t)-1)
 474                                       {
 475                                         re++;
 476                                         goto process_single_character;
 477                                       }
 478 #endif /* BSD_COMPATIBILITY */
 479                                     status = REG_ECOLLATE;
 480                                     goto error;
 481                                   }
 482                                 if (!collect_MCCS)
 483                                   {
 484                                     status = tre_new_item(ctx->mem,
 485                                              TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
 486                                              equiv, &max_i, items);
 487                                     if (status != REG_OK)
 488                                       goto error;
 489                                   }
 490                               }
 491                             else
 492                               {
 493                                 /* Process character class */
 494                                 DPRINT(("tre_parse_bracket:  class: '%.*" STRF
 495                                         "'\n", REST(start - 2)));
 496                                 if (!collect_MCCS)
 497                                   {
 498                                     char tmp_str[64];
 499                                     tre_ctype_t class;
 500                                     int len = MIN(re - start, 63);
 501                                     {
 502                                       tre_char_t tmp_wcs[64];
 503                                       wcsncpy(tmp_wcs, start, (size_t)len);
 504                                       tmp_wcs[len] = L'\0';
 505                                       {
 506                                         mbstate_t state;
 507                                         const tre_char_t *src = tmp_wcs;
 508                                         memset(&state, '\0', sizeof(state));
 509                                         len = wcsrtombs_l(tmp_str, &src,
 510                                                       sizeof(tmp_str), &state,
 511                                                       ctx->loc);
 512                                       }
 513                                     }
 514                                     tmp_str[len] = '\0';
 515                                     DPRINT(("  class name: %s\n", tmp_str));
 516                                     class = tre_ctype_l(tmp_str, ctx->loc);
 517                                     if (!class)
 518                                       {
 519                                         status = REG_ECTYPE;
 520                                         goto error;
 521                                       }
 522                                     status = tre_new_item(ctx->mem,
 523                                              TRE_BRACKET_MATCH_TYPE_CLASS,
 524                                              class, &max_i, items);
 525                                     if (status != REG_OK)
 526                                       goto error;
 527                                   }
 528                               }
 529                             re++;
 530                             break;
 531                           }
 532                       }
 533                   }
 534                 other++;
 535                 break;
 536               }
 537 
 538             default:
 539               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 540               c = CHAR_LBRACKET;
 541               goto process_single_character;
 542               break;
 543             }
 544           break;
 545 
 546         case CHAR_RBRACKET:
 547           /* A first right bracket */
 548           if (re == ctx->re)
 549             {
 550               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 551               min = CHAR_RBRACKET;
 552               range = 0;
 553               other++;
 554               break;
 555             }
 556           /* Done */
 557           if (collect_MCCS)
 558             {
 559               DPRINT(("tre_parse_bracket:       done: '%.*" STRF "'\n",
 560                       REST(re)));
 561               if (col_syms)
 562                 {
 563                   /* Mark the character following the right bracket.  Set len
 564                    * to whether there are other things besides the
 565                    * multi-character collating symbols */
 566                   col_syms->start = re + 1;
 567                   col_syms->len = other;
 568                   /* Mark the end of the list */
 569                   cp->start = NULL;
 570                 }
 571               *result = col_syms;
 572               return REG_OK;
 573             }
 574           /* range > 0 is not possible, since we did a lookahead after the
 575            * hyphen */
 576           if (range == 0)
 577             {
 578               status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
 579                                     min, &max_i, items);
 580               if (status != REG_OK)
 581                 goto error;
 582             }
 583           DPRINT(("tre_parse_bracket:   done: '%.*" STRF "'\n", REST(re)));
 584           *items_size = max_i;
 585           ctx->re = re + 1;
 586           return REG_OK;
 587 
 588         default:
 589           DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
 590           c = *re;
 591 process_single_character:
 592           /* Process single character */
 593           if (range > 0)
 594             {
 595               int mine, maxe;
 596 
 597 process_end_range:
 598               /* Get collation equivalence values */
 599               mine = __collate_equiv_value(ctx->loc, &min, 1);
 600               maxe = __collate_equiv_value(ctx->loc, &c, 1);
 601               if (maxe < mine)
 602                 {
 603                   status = REG_ERANGE;
 604                   goto error;
 605                 }
 606               if (!collect_MCCS)
 607                 {
 608                   status = tre_new_item(ctx->mem,
 609                                         TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
 610                                         mine, &max_i, items);
 611                   if (status != REG_OK)
 612                     goto error;
 613                   status = tre_new_item(ctx->mem,
 614                                         TRE_BRACKET_MATCH_TYPE_RANGE_END,
 615                                         maxe, &max_i, items);
 616                   if (status != REG_OK)
 617                     goto error;
 618                 }
 619               range = -1;
 620             }
 621           else
 622             {
 623 process_begin_range:
 624               if (!collect_MCCS)
 625                 {
 626                   if (range == 0)
 627                     {
 628                       status = tre_new_item(ctx->mem,
 629                                             TRE_BRACKET_MATCH_TYPE_CHAR,
 630                                             min, &max_i, items);
 631                       if (status != REG_OK)
 632                         goto error;
 633                     }
 634                   min = c;
 635                 }
 636               range = 0;
 637             }
 638           other++;
 639           break;
 640         }
 641     }
 642   status = REG_EBRACK;
 643 error:
 644   DPRINT(("tre_parse_bracket:   error: '%.*" STRF "', status=%d\n",
 645           REST(re), status));
 646   if (col_syms)
 647     free(col_syms);
 648   return status;
 649 }
 650 
 651 #ifdef TRE_DEBUG
 652 static const char *bracket_match_type_str[] = {
 653   "unused",
 654   "char",
 655   "range begin",
 656   "range end",
 657   "class",
 658   "equivalence value",
 659 };
 660 #endif /* TRE_DEBUG */
 661 
 662 static reg_errcode_t
 663 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 664 {
 665   tre_ast_node_t *node;
 666   reg_errcode_t status = REG_OK;
 667   tre_bracket_match_list_t *items;
 668   int max_i = 32;
 669   tre_collating_symbol *col_syms = NULL;
 670 
 671   /* Handle special cases [[:<:]] and [[:>:]] */
 672   if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
 673       && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
 674       && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
 675       && ctx->re[5] == CHAR_RBRACKET)
 676     {
 677       *result = tre_ast_new_literal(ctx->mem, ASSERTION,
 678                       (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
 679                       -1);
 680       DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
 681               "[[:<:]]" : "[[:>:]]"));
 682       ctx->re += 6;
 683       return *result ? REG_OK : REG_ESPACE;
 684     }
 685 
 686   /* Start off with an array of `max_i' elements. */
 687   items = calloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
 688   if (items == NULL)
 689     return REG_ESPACE;
 690 
 691   if (*ctx->re == CHAR_CARET)
 692     {
 693       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
 694       items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
 695       ctx->re++;
 696     }
 697 
 698   status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
 699 
 700   if (status != REG_OK)
 701     goto parse_bracket_done;
 702 
 703   /* If there are collating symbols, split off the multi-character ones
 704    * into a union of the bracket expression (without the collating symbols)
 705    * and the multiple-character sequences.  We create an equivalent input
 706    * string and run tre_parse() recursively */
 707   if (col_syms)
 708     {
 709       tre_char_t *str, *sp;
 710       tre_collating_symbol *cp;
 711       tre_parse_ctx_t subctx;
 712 
 713       /* Allocate a new string.  We start with the size of the original
 714        * bracket expression (minus 1) and add 2 (for a leading "[" and
 715        * a trailing nil; don't need a "^", since it is illegal to have
 716        * inverted MCCSs).  Since a multi-character collating symbols
 717        * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
 718        * need to worry about the new string getting too long. */
 719       free(items);
 720       str = malloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
 721       if (str == NULL)
 722         {
 723           free(col_syms);
 724           return REG_ESPACE;
 725         }
 726       sp = str;
 727       if (col_syms->len > 0)
 728         {
 729           /* There are other items in the bracket expression besides the
 730            * multi-character collating symbols, so create a new bracket
 731            * expression with only those other itmes. */
 732           const tre_char_t *re;
 733           ptrdiff_t i;
 734 
 735           *sp++ = '[';
 736           re = ctx->re;
 737           for (cp = col_syms + 1; cp->start; cp++)
 738             {
 739               /* The "- 2" is to account for the "[." */
 740               if ((i = ((cp->start - re) - 2)) > 0)
 741                 {
 742                   memcpy(sp, re, sizeof(*sp) * i);
 743                   sp += i;
 744                 }
 745               /* The "+ 2" is to account for the ".]" */
 746               re = cp->start + cp->len + 2;
 747             }
 748             i = col_syms->start - re; /* Includes the trailing right bracket */
 749             memcpy(sp, re, sizeof(*sp) * i);
 750             sp += i;
 751             *sp++ = '|';
 752         }
 753       for (cp = col_syms + 1; cp->start; cp++)
 754         {
 755           memcpy(sp, cp->start, sizeof(*sp) * cp->len);
 756           sp += cp->len;
 757           if (cp[1].start)
 758             *sp++ = '|';
 759         }
 760       *sp = 0;
 761       DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
 762               str));
 763 
 764       memcpy(&subctx, ctx, sizeof(subctx));
 765       subctx.re = str;
 766       subctx.len = sp - str;
 767       subctx.nofirstsub = 1;
 768       subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
 769       status = tre_parse(&subctx);
 770       free(str);
 771       if (status != REG_OK)
 772         {
 773           free(col_syms);
 774           return status;
 775         }
 776       ctx->re = col_syms->start;
 777       ctx->position = subctx.position;
 778       free(col_syms);
 779       *result = subctx.result;
 780       DPRINT(("tre_parse_bracket: Returning to original string\n"));
 781       return REG_OK;
 782     }
 783 
 784   DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
 785   node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
 786   if (node == NULL)
 787     {
 788       status = REG_ESPACE;
 789       goto parse_bracket_done;
 790     }
 791   else
 792     {
 793       tre_literal_t *l = node->obj;
 794       l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
 795                                          SIZEOF_BRACKET_MATCH_LIST(items));
 796       if (l->u.bracket_match_list == NULL)
 797         {
 798           status = REG_ESPACE;
 799           goto parse_bracket_done;
 800         }
 801       memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
 802     }
 803 
 804 #ifdef TRE_DEBUG
 805   {
 806     int i;
 807     tre_bracket_match_t *b;
 808     DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
 809             items->num_bracket_matches, items->flags));
 810     for (i = 0, b = items->bracket_matches;
 811          i < items->num_bracket_matches; i++, b++)
 812       {
 813         DPRINT(("   %d: %s %d\n", i, bracket_match_type_str[b->type],
 814                 b->value));
 815       }
 816   }
 817 #endif /* TRE_DEBUG */
 818 
 819  parse_bracket_done:
 820   free(items);
 821   ctx->position++;
 822   *result = node;
 823   return status;
 824 }
 825 
 826 /* Parses a positive decimal integer.  Returns -1 if the string does not
 827    contain a valid number. */
 828 static int
 829 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
 830 {
 831   int num;
 832   tre_char_t *endptr;
 833 
 834   errno = 0;
 835   num = (int)wcstoul(*regex, (wchar_t **)&endptr, 10);
 836   if (errno != 0)
 837     return -1;
 838   *regex = endptr;
 839   return num;
 840 }
 841 
 842 static reg_errcode_t
 843 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
 844 {
 845   int min, max;
 846   const tre_char_t *r = ctx->re;
 847   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
 848 
 849   /* Parse number (minimum repetition count). */
 850   min = -1;
 851   if (r >= ctx->re_end)
 852 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
 853     return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
 854 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 855     return REG_EBRACE;
 856 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 857   if (*r >= L'0' && *r <= L'9') {
 858     DPRINT(("tre_parse:   min count: '%.*" STRF "'\n", REST(r)));
 859     min = tre_parse_int(&r, ctx->re_end);
 860     if (min == -1)
 861       return REG_BADBR;
 862   }
 863   else
 864 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
 865       /* For ERE, return REG_NOMATCH to signal that the lbrace should
 866          be treated as a literal */
 867       return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
 868 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 869       return REG_BADBR;
 870 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
 871 
 872   /* Parse comma and second number (maximum repetition count). */
 873   max = min;
 874   if (r < ctx->re_end && *r == CHAR_COMMA)
 875     {
 876       r++;
 877       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
 878       max = tre_parse_int(&r, ctx->re_end);
 879     }
 880 
 881   /* Check that the repeat counts are sane. */
 882   if ((max > 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
 883     return REG_BADBR;
 884 
 885   /*{*//* Missing }. */
 886   if (r >= ctx->re_end)
 887     return REG_EBRACE;
 888 
 889   /* Empty contents of {}. */
 890   if (r == ctx->re)
 891     return REG_BADBR;
 892 
 893   /* Parse the ending '}' or '\}'.*/
 894   if (ctx->cflags & REG_EXTENDED)
 895     {
 896       if (r >= ctx->re_end || *r != CHAR_RBRACE)
 897         return REG_BADBR;
 898       r++;
 899       /* Parse trailing '?' marking minimal repetition. */
 900       if (r < ctx->re_end)
 901         {
 902           if (*r == CHAR_QUESTIONMARK)
 903             {
 904               /* Process the question mark only in enhanced mode.
 905                  Otherwise, the question mark is an error in ERE
 906                  or a literal in BRE */
 907               if (ctx->cflags & REG_ENHANCED)
 908                 {
 909                   minimal = !(ctx->cflags & REG_UNGREEDY);
 910                   r++;
 911                 }
 912               else return REG_BADRPT;
 913             }
 914           else if (*r == CHAR_STAR || *r == CHAR_PLUS)
 915             {
 916               /* These are reserved for future extensions. */
 917               return REG_BADRPT;
 918             }
 919         }
 920     }
 921   else
 922     {
 923       if (r + 1 >= ctx->re_end
 924           || *r != CHAR_BACKSLASH
 925           || *(r + 1) != CHAR_RBRACE)
 926         return REG_BADBR;
 927       r += 2;
 928       if (r < ctx->re_end && *r == CHAR_STAR)
 929         {
 930           /* This is reserved for future extensions. */
 931           return REG_BADRPT;
 932         }
 933     }
 934 
 935   if (minimal)
 936     ctx->num_reorder_tags++;
 937 
 938   if (!result) goto parse_bound_exit;
 939   /* Create the AST node(s). */
 940   /* Originally, if min == 0 && max == 0, we immediately replace the whole
 941      iteration with EMPTY.  This unfortunately drops any submatches, and
 942      messes up setting the pmatch values (we can get tags of -1, and
 943      tag values in the billions).  So we leave it and process this case as
 944      usual, and wait until tre_expand_ast() to replace with EMPTY */
 945 
 946   *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
 947   if (!*result)
 948     return REG_ESPACE;
 949 
 950 parse_bound_exit:
 951   DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
 952 
 953   ctx->re = r;
 954   return REG_OK;
 955 }
 956 
 957 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
 958    non-self-contained options, like (?i), this causes ((?i)fu)bar to be
 959    treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
 960    Because we now set up tags for even non-capturing parenthesized
 961    subexpressions, we always call PARSE_MARK_FOR_SUBMATCH.  So if we
 962    pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
 963    have it restore cflags after the subexpression, we don't need to have
 964    a separate PARSE_RESTORE_CFLAGS, and then after processing the
 965    non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
 966    This has the side-benefit of now matching the perl behavior: the RE
 967    foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
 968    of foo AND (?i) (bar OR zap). */
 969 typedef enum {
 970   PARSE_RE = 0,
 971   PARSE_ATOM,
 972   PARSE_MARK_FOR_SUBMATCH,
 973   PARSE_BRANCH,
 974   PARSE_PIECE,
 975   PARSE_CATENATION,
 976   PARSE_POST_CATENATION,
 977   PARSE_UNION,
 978   PARSE_POST_UNION,
 979   PARSE_POSTFIX,
 980 } tre_parse_re_stack_symbol_t;
 981 
 982 reg_errcode_t
 983 tre_parse(tre_parse_ctx_t *ctx)
 984 {
 985   tre_ast_node_t *result = NULL;
 986   tre_parse_re_stack_symbol_t symbol;
 987   reg_errcode_t status = REG_OK;
 988   tre_stack_t *stack = ctx->stack;
 989   int bottom = tre_stack_num_objects(stack);
 990   int depth = 0;
 991   int temporary_cflags = 0;
 992   int bre_branch_begin;
 993 #ifdef TRE_DEBUG
 994   const tre_char_t *tmp_re;
 995 #endif
 996 
 997   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
 998           ctx->len, ctx->re, ctx->len, ctx->cflags));
 999 
1000   if (ctx->len <= 0) return REG_EMPTY;
1001   if (!ctx->nofirstsub)
1002     {
1003       STACK_PUSH(stack, int, ctx->cflags);
1004       STACK_PUSH(stack, int, ctx->submatch_id);
1005       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1006       ctx->submatch_id++;
1007     }
1008   STACK_PUSH(stack, int, 0); // bre_branch_begin
1009   STACK_PUSH(stack, int, PARSE_RE);
1010   ctx->re_start = ctx->re;
1011   ctx->re_end = ctx->re + ctx->len;
1012 
1013   /* The following is basically just a recursive descent parser.  I use
1014      an explicit stack instead of recursive functions mostly because of
1015      two reasons: compatibility with systems which have an overflowable
1016      call stack, and efficiency (both in lines of code and speed).  */
1017   while (tre_stack_num_objects(stack) > bottom)
1018     {
1019       symbol = tre_stack_pop_int(stack);
1020       switch (symbol)
1021         {
1022         case PARSE_RE:
1023           /* Parse a full regexp.  A regexp is one or more branches,
1024              separated by the union operator `|'. */
1025           bre_branch_begin = tre_stack_pop_int(stack);
1026           if (!(ctx->cflags & REG_LITERAL) &&
1027               ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1028             STACK_PUSHX(stack, int, PARSE_UNION);
1029           STACK_PUSHX(stack, int, bre_branch_begin);
1030           STACK_PUSHX(stack, int, PARSE_BRANCH);
1031           break;
1032 
1033         case PARSE_BRANCH:
1034           /* Parse a branch.  A branch is one or more pieces, concatenated.
1035              A piece is an atom possibly followed by a postfix operator. */
1036           bre_branch_begin = tre_stack_pop_int(stack);
1037           STACK_PUSHX(stack, int, PARSE_CATENATION);
1038           STACK_PUSHX(stack, int, bre_branch_begin);
1039           STACK_PUSHX(stack, int, PARSE_PIECE);
1040           break;
1041 
1042         case PARSE_PIECE:
1043           /* Parse a piece.  A piece is an atom possibly followed by one
1044              or more postfix operators. */
1045           bre_branch_begin = tre_stack_pop_int(stack);
1046           STACK_PUSHX(stack, int, PARSE_POSTFIX);
1047           STACK_PUSHX(stack, int, bre_branch_begin);
1048           STACK_PUSHX(stack, int, PARSE_ATOM);
1049           break;
1050 
1051         case PARSE_CATENATION:
1052           /* If the expression has not ended, parse another piece. */
1053           {
1054             tre_char_t c;
1055             if (ctx->re >= ctx->re_end)
1056               break;
1057             c = *ctx->re;
1058             if (!(ctx->cflags & REG_LITERAL))
1059               {
1060                 if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1061                     ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1062                     && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1063                     *(ctx->re + 1) == CHAR_PIPE))
1064                   break;
1065                 if ((ctx->cflags & REG_EXTENDED
1066                      && c == CHAR_RPAREN && depth > 0)
1067                     || (!(ctx->cflags & REG_EXTENDED)
1068                         && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1069                             && *(ctx->re + 1) == CHAR_RPAREN))
1070                   {
1071                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1072                       return REG_EPAREN;
1073                     DPRINT(("tre_parse:   group end: '%.*" STRF "'\n",
1074                             REST(ctx->re)));
1075                     depth--;
1076                     if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1077                       ctx->re += 2;
1078                     break;
1079                   }
1080               }
1081 
1082 #ifdef REG_LEFT_ASSOC
1083             if (ctx->cflags & REG_LEFT_ASSOC)
1084               {
1085                 /* Left associative concatenation. */
1086                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1087                 STACK_PUSHX(stack, voidptr, result);
1088                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1089                 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1090                 STACK_PUSHX(stack, int, PARSE_PIECE);
1091               }
1092             else
1093 #endif /* REG_LEFT_ASSOC */
1094               {
1095                 /* Default case, right associative concatenation. */
1096                 STACK_PUSHX(stack, voidptr, result);
1097                 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1098                 STACK_PUSHX(stack, int, PARSE_CATENATION);
1099                 STACK_PUSHX(stack, int, 0); // bre_branch_begin
1100                 STACK_PUSHX(stack, int, PARSE_PIECE);
1101               }
1102             break;
1103           }
1104 
1105         case PARSE_POST_CATENATION:
1106           {
1107             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1108             tre_ast_node_t *tmp_node;
1109             tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1110             if (!tmp_node)
1111               return REG_ESPACE;
1112             result = tmp_node;
1113             break;
1114           }
1115 
1116         case PARSE_UNION:
1117           if (ctx->re >= ctx->re_end)
1118             break;
1119           if (ctx->cflags & REG_LITERAL)
1120             break;
1121           if (!(ctx->cflags & REG_EXTENDED))
1122             {
1123               if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1124                 break;
1125               ctx->re++;
1126             }
1127           switch (*ctx->re)
1128             {
1129             case CHAR_PIPE:
1130               DPRINT(("tre_parse:       union: '%.*" STRF "'\n",
1131                       REST(ctx->re)));
1132               STACK_PUSHX(stack, int, PARSE_UNION);
1133               STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1134               STACK_PUSHX(stack, voidptr, result);
1135               STACK_PUSHX(stack, int, PARSE_POST_UNION);
1136               /* We need to pass a boolean (eventually) to PARSE_ATOM to
1137                  indicate if this is the beginning of a BRE extended branch. */
1138               STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1139               STACK_PUSHX(stack, int, PARSE_BRANCH);
1140               ctx->re++;
1141               break;
1142 
1143             case CHAR_RPAREN:
1144               ctx->re++;
1145               break;
1146 
1147             default:
1148               if (!(ctx->cflags & REG_EXTENDED))
1149                 ctx->re--;
1150               break;
1151             }
1152           break;
1153 
1154         case PARSE_POST_UNION:
1155           {
1156             tre_ast_node_t *tmp_node;
1157             tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1158             const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1159             /* error on empty expression at end of union */
1160             if (pipechar == ctx->re - 1)
1161               {
1162                 return REG_EMPTY;
1163               }
1164             tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1165             if (!tmp_node)
1166               return REG_ESPACE;
1167             result = tmp_node;
1168             break;
1169           }
1170 
1171         case PARSE_POSTFIX:
1172           /* Parse postfix operators. */
1173           if (ctx->re >= ctx->re_end)
1174             break;
1175           if (ctx->cflags & REG_LITERAL)
1176             break;
1177           int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1178           int rep_min = 0;
1179           int rep_max = -1;
1180 #ifdef TRE_DEBUG
1181           int lbrace_off;
1182 #endif
1183           switch (*ctx->re)
1184             {
1185             case CHAR_PLUS:
1186             case CHAR_QUESTIONMARK:
1187               if (!(ctx->cflags & REG_EXTENDED))
1188                 break;
1189                 /*FALLTHROUGH*/
1190             case CHAR_STAR:
1191               {
1192                 tre_ast_node_t *tmp_node;
1193 #ifdef TRE_DEBUG
1194                 const char *tstr = "star";
1195                 tmp_re = ctx->re;
1196 #endif
1197 
1198         handle_plus_or_question:
1199                 /* error on iteration of raw assertion (not in subexpression) */
1200                 if (result->type == LITERAL && result->submatch_id < 0 &&
1201                     IS_ASSERTION((tre_literal_t *)result->obj))
1202                   {
1203                     if (!(ctx->cflags & REG_EXTENDED)) break;
1204                     return REG_BADRPT;
1205                   }
1206                 if (*ctx->re == CHAR_PLUS)
1207                   {
1208                     rep_min = 1;
1209 #ifdef TRE_DEBUG
1210                     tstr = "plus";
1211 #endif
1212                   }
1213                 if (*ctx->re == CHAR_QUESTIONMARK)
1214                   {
1215                     rep_max = 1;
1216 #ifdef TRE_DEBUG
1217                     tstr = "questionmark";
1218 #endif
1219                   }
1220 
1221                 if (ctx->cflags & REG_EXTENDED)
1222                   {
1223                     if (ctx->re + 1 < ctx->re_end)
1224                       {
1225                         if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1226                           {
1227                             /* Process the question mark only in enhanced mode.
1228                                Otherwise, the question mark is an error in ERE */
1229                             if (ctx->cflags & REG_ENHANCED)
1230                               {
1231                                 minimal = !(ctx->cflags & REG_UNGREEDY);
1232                                 ctx->re++;
1233                               }
1234                             else return REG_BADRPT;
1235                           }
1236                         else if (*(ctx->re + 1) == CHAR_STAR
1237                                  || *(ctx->re + 1) == CHAR_PLUS)
1238                           {
1239                             /* These are reserved for future extensions. */
1240                             return REG_BADRPT;
1241                           }
1242                       }
1243                   }
1244                 else
1245                   {
1246                     if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1247                       {
1248                         /* This is reserved for future extensions. */
1249                         return REG_BADRPT;
1250                       }
1251                     if (ctx->re + 2 < ctx->re_end)
1252                       {
1253                         if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_QUESTIONMARK)
1254                           {
1255                             /* Process the question mark only in enhanced mode.
1256                                Otherwise, the question mark is a literal in BRE */
1257                             if (ctx->cflags & REG_ENHANCED)
1258                               {
1259                                 minimal = !(ctx->cflags & REG_UNGREEDY);
1260                                 ctx->re += 2;
1261                               }
1262                           }
1263                         else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1264                           {
1265                             /* This is reserved for future extensions. */
1266                             return REG_BADRPT;
1267                           }
1268                       }
1269                   }
1270 
1271                 if (minimal)
1272                   ctx->num_reorder_tags++;
1273 
1274                 DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1275                         minimal ? "  minimal" : "greedy", tstr, REST(tmp_re)));
1276                 if (result == NULL)
1277                   {
1278                     if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1279                     else goto parse_literal;
1280                   }
1281                 ctx->re++;
1282                 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1283                                             minimal);
1284                 if (tmp_node == NULL)
1285                   return REG_ESPACE;
1286                 result = tmp_node;
1287 
1288                 /* Set the iterator with a submatch id in the invisible range
1289                  * (which will be overridden if a real submatch is needed) */
1290                 result->submatch_id = ctx->submatch_id_invisible++;
1291 
1292 #if 0
1293                 /* We don't allow multiple postfixes, but this might be needed
1294                    to support approximate matching */
1295                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1296 #endif
1297               }
1298               break;
1299 
1300             case CHAR_BACKSLASH:
1301               /* "\{" is special without REG_EXTENDED */
1302               /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1303               if (!(ctx->cflags & REG_EXTENDED)
1304                   && ctx->re + 1 < ctx->re_end)
1305                 {
1306                   switch (*(ctx->re + 1))
1307                     {
1308                     case CHAR_LBRACE:
1309                       ctx->re++;
1310 #ifdef TRE_DEBUG
1311                       lbrace_off = 2;
1312 #endif
1313                       goto parse_brace;
1314                     case CHAR_PLUS:
1315                     case CHAR_QUESTIONMARK:
1316                       if (ctx->cflags & REG_ENHANCED)
1317                         {
1318 #ifdef TRE_DEBUG
1319                           tmp_re = ctx->re;
1320 #endif
1321                           ctx->re++;
1322                           goto handle_plus_or_question;
1323                         }
1324                       break;
1325                     }
1326                   break;
1327                 }
1328               else
1329                 break;
1330 
1331             case CHAR_LBRACE:
1332               {
1333                 int raw_assertion;
1334 
1335                 /* "{" is literal without REG_EXTENDED */
1336                 if (!(ctx->cflags & REG_EXTENDED))
1337                   break;
1338 #ifdef TRE_DEBUG
1339                 lbrace_off = 1;
1340 #endif
1341 
1342             parse_brace:
1343                 /* error on iteration of raw assertion (not in subexpression),
1344                    but wait until after parsing bounds */
1345                 raw_assertion = (result->type == LITERAL
1346                                  && result->submatch_id < 0
1347                                  && IS_ASSERTION((tre_literal_t *)result->obj));
1348                 ctx->re++;
1349 
1350                 status = tre_parse_bound(ctx, &result);
1351 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1352                 /* For ERE, if status is REG_NOMATCH, this mean the lbrace
1353                    is to be treated as a literal. */
1354                 if (status == REG_NOMATCH)
1355                   {
1356                     ctx->re--;
1357                     break;
1358                   }
1359 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1360                 DPRINT(("tre_parse:     bound: '%.*" STRF "'\n",
1361                         REST(ctx->re - lbrace_off)));
1362                 if (status != REG_OK)
1363                   return status;
1364                 if (raw_assertion) return REG_BADRPT;
1365 
1366                 /* Set the iterator with a submatch id in the invisible range
1367                  * (which will be overridden if a real submatch is needed) */
1368                 if (result->type == ITERATION)
1369                   result->submatch_id = ctx->submatch_id_invisible++;
1370 
1371 #if 0
1372                 /* We don't allow multiple postfixes, but this might be needed
1373                    to support approximate matching */
1374                 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1375 #endif
1376                 break;
1377               }
1378             }
1379           break;
1380 
1381         case PARSE_ATOM:
1382           {
1383             /* Parse an atom.  An atom is a regular expression enclosed in `()',
1384                an empty set of `()', a bracket expression, `.', `^', `$',
1385                a `\' followed by a character, or a single character. */
1386 
1387             /* The stack contains a boolean value, whether PARSE_ATOM is
1388                being called just after the start of a group (left paren)
1389                in a BRE */
1390             bre_branch_begin = tre_stack_pop_int(stack);
1391 
1392             /* End of regexp? (empty string). */
1393             if (ctx->re >= ctx->re_end)
1394               goto parse_literal;
1395 
1396             if (ctx->cflags & REG_LITERAL)
1397               goto parse_literal;
1398 
1399             switch (*ctx->re)
1400               {
1401               case CHAR_LPAREN:  /* parenthesized subexpression */
1402 
1403                 /* Handle "(?...)" extensions.  They work in a way similar
1404                    to Perls corresponding extensions. */
1405                 if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1406                     (REG_EXTENDED|REG_ENHANCED)
1407                     && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1408                   {
1409                     int new_cflags = ctx->cflags;
1410                     int bit = 1;
1411                     int invisible_submatch = 0;
1412                     DPRINT(("tre_parse: extension: '%.*" STRF "'\n",
1413                             REST(ctx->re)));
1414                     ctx->re += 2;
1415                     while (/*CONSTCOND*/1)
1416                       {
1417                         if (*ctx->re == L'i')
1418                           {
1419                             DPRINT(("tre_parse:     icase: '%.*" STRF "'\n",
1420                                     REST(ctx->re)));
1421                             if (bit)
1422                               new_cflags |= REG_ICASE;
1423                             else
1424                               new_cflags &= ~REG_ICASE;
1425                             ctx->re++;
1426                           }
1427                         else if (*ctx->re == L'n')
1428                           {
1429                             DPRINT(("tre_parse:   newline: '%.*" STRF "'\n",
1430                                     REST(ctx->re)));
1431                             if (bit)
1432                               new_cflags |= REG_NEWLINE;
1433                             else
1434                               new_cflags &= ~REG_NEWLINE;
1435                             ctx->re++;
1436                           }
1437 #ifdef REG_LEFT_ASSOC
1438                         else if (*ctx->re == L'l')
1439                           {
1440                             DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1441                                     REST(ctx->re)));
1442                             if (bit)
1443                               new_cflags |= REG_LEFT_ASSOC;
1444                             else
1445                               new_cflags &= ~REG_LEFT_ASSOC;
1446                             ctx->re++;
1447                           }
1448 #endif /* REG_LEFT_ASSOC */
1449 #ifdef REG_UNGREEDY
1450                         else if (*ctx->re == L'U')
1451                           {
1452                             DPRINT(("tre_parse:    ungreedy: '%.*" STRF "'\n",
1453                                     REST(ctx->re)));
1454                             if (bit)
1455                               new_cflags |= REG_UNGREEDY;
1456                             else
1457                               new_cflags &= ~REG_UNGREEDY;
1458                             ctx->re++;
1459                           }
1460 #endif /* REG_UNGREEDY */
1461                         else if (*ctx->re == CHAR_MINUS)
1462                           {
1463                             DPRINT(("tre_parse:  turn off: '%.*" STRF "'\n",
1464                                     REST(ctx->re)));
1465                             ctx->re++;
1466                             bit = 0;
1467                           }
1468                         else if (*ctx->re == CHAR_COLON)
1469                           {
1470                             DPRINT(("tre_parse:  no group: '%.*" STRF
1471                                     "', (invisible submatch %d)\n",
1472                                     REST(ctx->re), ctx->submatch_id_invisible));
1473                             ctx->re++;
1474                             depth++;
1475                             invisible_submatch = 1;
1476                             break;
1477                           }
1478                         else if (*ctx->re == CHAR_HASH)
1479                           {
1480                             DPRINT(("tre_parse:    comment: '%.*" STRF "'\n",
1481                                     REST(ctx->re)));
1482                             /* A comment can contain any character except a
1483                                right parenthesis */
1484                             while (*ctx->re != CHAR_RPAREN
1485                                    && ctx->re < ctx->re_end)
1486                               ctx->re++;
1487                             if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1488                               {
1489                                 ctx->re++;
1490                                 break;
1491                               }
1492                             else
1493                               return REG_BADPAT;
1494                           }
1495                         else if (*ctx->re == CHAR_RPAREN)
1496                           {
1497                             ctx->re++;
1498                             break;
1499                           }
1500                         else
1501                           return REG_BADRPT;
1502                       }
1503 
1504                     /* Turn on the cflags changes for the rest of the
1505                        enclosing group. */
1506                     if (invisible_submatch)
1507                       {
1508                         STACK_PUSHX(stack, int, ctx->cflags);
1509                         STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1510                         STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1511                         ctx->submatch_id_invisible++;
1512                         STACK_PUSHX(stack, int, 0); // bre_branch_begin
1513                         STACK_PUSHX(stack, int, PARSE_RE);
1514                       }
1515                     else {
1516                         STACK_PUSHX(stack, int, 0); // bre_branch_begin
1517                         STACK_PUSHX(stack, int, PARSE_ATOM);
1518                     }
1519                     ctx->cflags = new_cflags;
1520                     break;
1521                   }
1522 
1523                 if (ctx->cflags & REG_EXTENDED)
1524                   {
1525                 parse_bre_lparen:
1526                     DPRINT(("tre_parse: group begin: '%.*" STRF
1527                             "', submatch %d\n", REST(ctx->re),
1528                             ctx->submatch_id));
1529                     ctx->re++;
1530                     /* First parse a whole RE, then mark the resulting tree
1531                        for submatching. */
1532                     STACK_PUSHX(stack, int, ctx->cflags);
1533                     STACK_PUSHX(stack, int, ctx->submatch_id);
1534                     STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1535                     /* We need to pass a boolean (eventually) to PARSE_ATOM to
1536                        indicate if this is the beginning of a BRE group. */
1537                     STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1538                     STACK_PUSHX(stack, int, PARSE_RE);
1539                     ctx->submatch_id++;
1540                     depth++;
1541                   }
1542                 else
1543                   goto parse_literal;
1544                 break;
1545 
1546               case CHAR_RPAREN:  /* end of current subexpression */
1547                 if (ctx->cflags & REG_EXTENDED && depth > 0)
1548                   {
1549               parse_bre_rparen_empty:
1550                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1551                       return REG_EPAREN;
1552                     DPRINT(("tre_parse:     empty: '%.*" STRF "'\n",
1553                             REST(ctx->re)));
1554                     /* We were expecting an atom, but instead the current
1555                        subexpression was closed.  POSIX leaves the meaning of
1556                        this to be implementation-defined.  We interpret this as
1557                        an empty expression (which matches an empty string).  */
1558                     result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1559                     if (result == NULL)
1560                       return REG_ESPACE;
1561                     if (!(ctx->cflags & REG_EXTENDED))
1562                       ctx->re--;
1563                   }
1564                 else
1565                   goto parse_literal;
1566                 break;
1567 
1568               case CHAR_LBRACKET: /* bracket expression */
1569                 DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1570                         REST(ctx->re)));
1571                 ctx->re++;
1572                 status = tre_parse_bracket(ctx, &result);
1573                 if (status != REG_OK)
1574                   return status;
1575                 break;
1576 
1577               case CHAR_BACKSLASH:
1578                 /* Deal with "\(", "\)" or "\{" for BREs */
1579                 if (!(ctx->cflags & REG_EXTENDED)
1580                     && ctx->re + 1 < ctx->re_end)
1581                   {
1582                     if (*(ctx->re + 1) == CHAR_LPAREN)
1583                       {
1584                         ctx->re++;
1585                         goto parse_bre_lparen;
1586                       }
1587                     else if (*(ctx->re + 1) == CHAR_RPAREN)
1588                       {
1589                         ctx->re++;
1590                         goto parse_bre_rparen_empty;
1591                       }
1592                     if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1593                   }
1594 
1595                 if (ctx->re + 1 >= ctx->re_end)
1596                   /* Trailing backslash. */
1597                   return REG_EESCAPE;
1598 
1599                 /* XXX illumos: want macro expansion enabled by default (same as TRE) */
1600 #if 0
1601                 if (!(ctx->cflags & REG_ENHANCED))
1602                   {
1603                     DPRINT(("tre_parse:  unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1604                     ctx->re++;
1605                     goto unenhanced_backslash;
1606                   }
1607 #endif
1608 
1609                 /* If a macro is used, parse the expanded macro recursively. */
1610                 {
1611                   tre_char_t buf[64];
1612                   tre_expand_macro(ctx->re + 1, ctx->re_end,
1613                                    buf, elementsof(buf));
1614                   if (buf[0] != 0)
1615                     {
1616                       tre_parse_ctx_t subctx;
1617                       memcpy(&subctx, ctx, sizeof(subctx));
1618                       subctx.re = buf;
1619                       subctx.len = tre_strlen(buf);
1620                       subctx.nofirstsub = 1;
1621                       status = tre_parse(&subctx);
1622                       if (status != REG_OK)
1623                         return status;
1624                       ctx->re += 2;
1625                       ctx->position = subctx.position;
1626                       result = subctx.result;
1627                       break;
1628                     }
1629                 }
1630 
1631                 if (*(ctx->re + 1) == L'Q')
1632                   {
1633                     DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1634                             REST(ctx->re)));
1635                     ctx->cflags |= REG_LITERAL;
1636                     temporary_cflags |= REG_LITERAL;
1637                     ctx->re += 2;
1638                     STACK_PUSHX(stack, int, 0);
1639                     STACK_PUSHX(stack, int, PARSE_ATOM);
1640                     break;
1641                   }
1642 
1643                 DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1644                 ctx->re++;
1645                 switch (*ctx->re)
1646                   {
1647                   case L'b':
1648                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1649                                                  ASSERT_AT_WB, -1);
1650                     ctx->re++;
1651                     break;
1652                   case L'B':
1653                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1654                                                  ASSERT_AT_WB_NEG, -1);
1655                     ctx->re++;
1656                     break;
1657                   case L'<':
1658                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1659                                                  ASSERT_AT_BOW, -1);
1660                     ctx->re++;
1661                     break;
1662                   case L'>':
1663                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1664                                                  ASSERT_AT_EOW, -1);
1665                     ctx->re++;
1666                     break;
1667                   case L'x':
1668                     ctx->re++;
1669                     if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1670                       {
1671                         /* 8 bit hex char. */
1672                         char tmp[3] = {0, 0, 0};
1673                         long val;
1674                         DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1675                                 REST(ctx->re - 2)));
1676 
1677                         if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1678                             ctx->re < ctx->re_end)
1679                           {
1680                             tmp[0] = (char)ctx->re[0];
1681                             ctx->re++;
1682                           }
1683                         if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1684                             ctx->re < ctx->re_end)
1685                           {
1686                             tmp[1] = (char)ctx->re[0];
1687                             ctx->re++;
1688                           }
1689                         val = strtol(tmp, NULL, 16);
1690                         result = tre_ast_new_literal(ctx->mem, (int)val,
1691                                                      (int)val, ctx->position);
1692                         ctx->position++;
1693                         break;
1694                       }
1695                     else if (ctx->re < ctx->re_end)
1696                       {
1697                         /* Wide char. */
1698                         char tmp[32];
1699                         long val;
1700                         int i = 0;
1701                         ctx->re++;
1702                         while (ctx->re_end - ctx->re >= 0)
1703                           {
1704                             if (ctx->re[0] == CHAR_RBRACE)
1705                               break;
1706                             if (tre_isxdigit_l(ctx->re[0], ctx->loc))
1707                               {
1708                                 tmp[i] = (char)ctx->re[0];
1709                                 i++;
1710                                 ctx->re++;
1711                                 continue;
1712                               }
1713                             return REG_EBRACE;
1714                           }
1715                         ctx->re++;
1716                         tmp[i] = 0;
1717                         val = strtol(tmp, NULL, 16);
1718                         result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1719                                                      ctx->position);
1720                         ctx->position++;
1721                         break;
1722                       }
1723                     /*FALLTHROUGH*/
1724 
1725                   default:
1726                   unenhanced_backslash:
1727                     if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
1728                         REG_EXTENDED &&
1729                         tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
1730                       {
1731                         /* Back reference (only in BRE or enhanced). */
1732                         int val = *ctx->re - L'0';
1733                         DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
1734                                 REST(ctx->re - 1)));
1735                         result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1736                                                      ctx->position);
1737                         if (result == NULL)
1738                           return REG_ESPACE;
1739 
1740                         /* Set the backref with a submatch id in the invisible
1741                          * range (which will be overridden if a real submatch
1742                          * is needed) */
1743                         result->submatch_id = ctx->submatch_id_invisible++;
1744 
1745                         ctx->position++;
1746                         ctx->num_reorder_tags++;
1747                         ctx->max_backref = MAX(val, ctx->max_backref);
1748                         ctx->re++;
1749                       }
1750                     else
1751                       {
1752                         /* Escaped character. */
1753                         DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
1754                                 REST(ctx->re - 1)));
1755                         result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1756                                                      ctx->position);
1757                         ctx->position++;
1758                         ctx->re++;
1759                       }
1760                     break;
1761                   }
1762                 if (result == NULL)
1763                   return REG_ESPACE;
1764                 break;
1765 
1766               case CHAR_PERIOD:  /* the any-symbol */
1767                 DPRINT(("tre_parse:       any: '%.*" STRF "'\n",
1768                         REST(ctx->re)));
1769                 if (ctx->cflags & REG_NEWLINE)
1770                   {
1771                     tre_ast_node_t *tmp1;
1772                     tre_ast_node_t *tmp2;
1773                     tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1774                                                ctx->position);
1775                     if (!tmp1)
1776                       return REG_ESPACE;
1777                     tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1778                                                ctx->position + 1);
1779                     if (!tmp2)
1780                       return REG_ESPACE;
1781                     result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1782                     if (!result)
1783                       return REG_ESPACE;
1784                     ctx->position += 2;
1785                   }
1786                 else
1787                   {
1788                     result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1789                                                  ctx->position);
1790                     if (!result)
1791                       return REG_ESPACE;
1792                     ctx->position++;
1793                   }
1794                 ctx->re++;
1795                 break;
1796 
1797               case CHAR_CARET:   /* beginning of line assertion */
1798                 /* '^' has a special meaning everywhere in EREs, at the
1799                    beginning of the RE and after \( is BREs.  It is also
1800                    special in enhanced BREs at the beginning of each branches
1801                    of a union */
1802                 if (ctx->cflags & REG_EXTENDED
1803                     || bre_branch_begin
1804                     || ctx->re == ctx->re_start)
1805                   {
1806                     DPRINT(("tre_parse:       BOL: '%.*" STRF "'\n",
1807                             REST(ctx->re)));
1808                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1809                                                  ASSERT_AT_BOL, -1);
1810                     if (result == NULL)
1811                       return REG_ESPACE;
1812                     ctx->re++;
1813                   }
1814                 else
1815                   goto parse_literal;
1816                 break;
1817 
1818               case CHAR_DOLLAR:  /* end of line assertion. */
1819                 /* '$' is special everywhere in EREs, and in the end of the
1820                    string and before \) is BREs. */
1821                 if (ctx->cflags & REG_EXTENDED
1822                     || (ctx->re + 2 < ctx->re_end
1823                         && *(ctx->re + 1) == CHAR_BACKSLASH
1824                         && *(ctx->re + 2) == CHAR_RPAREN)
1825                     || ctx->re + 1 == ctx->re_end)
1826                   {
1827                     DPRINT(("tre_parse:       EOL: '%.*" STRF "'\n",
1828                             REST(ctx->re)));
1829                     result = tre_ast_new_literal(ctx->mem, ASSERTION,
1830                                                  ASSERT_AT_EOL, -1);
1831                     if (result == NULL)
1832                       return REG_ESPACE;
1833                     ctx->re++;
1834                   }
1835                 else
1836                   goto parse_literal;
1837                 break;
1838 
1839               default:
1840               parse_literal:
1841 
1842                 if (temporary_cflags && ctx->re + 1 < ctx->re_end
1843                     && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1844                   {
1845                     DPRINT(("tre_parse:  end tmps: '%.*" STRF "'\n",
1846                             REST(ctx->re)));
1847                     ctx->cflags &= ~temporary_cflags;
1848                     temporary_cflags = 0;
1849                     ctx->re += 2;
1850                     if (ctx->re < ctx->re_end)
1851                       {
1852                         STACK_PUSHX(stack, int, 0);
1853                         STACK_PUSHX(stack, int, PARSE_ATOM);
1854                       }
1855                     else
1856                       {
1857                         result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1858                         if (!result) return REG_ESPACE;
1859                       }
1860                     break;
1861                   }
1862 
1863                 /* We are expecting an atom.  If the subexpression (or the whole
1864                    regexp ends here, we interpret it as an empty expression
1865                    (which matches an empty string), which is an error.
1866                    Iterations of an empty expression is also an error. */
1867                 if (!(ctx->cflags & REG_LITERAL))
1868                   {
1869                     /* error on end of string */
1870                     if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
1871                                                        : REG_EMPTY;
1872                     /* error on unions and iterations of empty expressions */
1873                     if (ctx->cflags & REG_EXTENDED)
1874                       {
1875                         if (ctx->re < ctx->re_end)
1876                           {
1877                             if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
1878                             if (*ctx->re == CHAR_LBRACE)
1879                               {
1880                                 ctx->re++;
1881                   empty_parse_bound:
1882                                 /* We need to parse the bound first and return
1883                                    any error, before returning REG_BADRPT */
1884                                 status = tre_parse_bound(ctx, NULL);
1885 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1886                                 /* For ERE, if REG_NOMATCH is returned, we
1887                                    treat the lbrace as a literal. */
1888                                 if (status == REG_NOMATCH)
1889                                   {
1890                                     ctx->re--;
1891                                     /* Drop down to literal-handling code */
1892                                   }
1893                                 else
1894                                   {
1895 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1896                                     if (status != REG_OK)
1897                                       return status;
1898                                     return REG_BADRPT;
1899 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1900                                   }
1901 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1902                               }
1903 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1904                             else
1905 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1906                             if (*ctx->re == CHAR_STAR
1907                                 || *ctx->re == CHAR_PLUS
1908                                 || *ctx->re == CHAR_QUESTIONMARK)
1909                               {
1910                                 return REG_BADRPT;
1911                               }
1912                           }
1913                       }
1914                     else if (ctx->re + 1 < ctx->re_end
1915                              && *ctx->re == CHAR_BACKSLASH
1916                              && *(ctx->re + 1) == CHAR_LBRACE)
1917                       {
1918                         ctx->re += 2;
1919                         goto empty_parse_bound;
1920                       }
1921                   }
1922 
1923                 DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
1924                         REST(ctx->re)));
1925                 /* Note that we can't use an tre_isalpha() test here, since there
1926                    may be characters which are alphabetic but neither upper or
1927                    lower case. */
1928                 if (ctx->cflags & REG_ICASE
1929                     && (tre_isupper_l(*ctx->re, ctx->loc) ||
1930                     tre_islower_l(*ctx->re, ctx->loc)))
1931                   {
1932                     tre_ast_node_t *tmp1;
1933                     tre_ast_node_t *tmp2;
1934 
1935                     /* XXX - Can there be more than one opposite-case
1936                        counterpoints for some character in some locale?  Or
1937                        more than two characters which all should be regarded
1938                        the same character if case is ignored?  If yes, there
1939                        does not seem to be a portable way to detect it.  I guess
1940                        that at least for multi-character collating elements there
1941                        could be several opposite-case counterpoints, but they
1942                        cannot be supported portably anyway. */
1943                     tmp1 = tre_ast_new_literal(ctx->mem,
1944                                                tre_toupper_l(*ctx->re, ctx->loc),
1945                                                tre_toupper_l(*ctx->re, ctx->loc),
1946                                                ctx->position);
1947                     if (!tmp1)
1948                       return REG_ESPACE;
1949                     tmp2 = tre_ast_new_literal(ctx->mem,
1950                                                tre_tolower_l(*ctx->re, ctx->loc),
1951                                                tre_tolower_l(*ctx->re, ctx->loc),
1952                                                ctx->position);
1953                     if (!tmp2)
1954                       return REG_ESPACE;
1955                     result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1956                     if (!result)
1957                       return REG_ESPACE;
1958                   }
1959                 else
1960                   {
1961                     result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1962                                                  ctx->position);
1963                     if (!result)
1964                       return REG_ESPACE;
1965                   }
1966                 ctx->position++;
1967                 ctx->re++;
1968                 break;
1969               }
1970             break;
1971           }
1972 
1973         case PARSE_MARK_FOR_SUBMATCH:
1974           {
1975             int submatch_id = tre_stack_pop_int(stack);
1976 
1977             ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
1978             if (result->submatch_id >= 0 &&
1979                 result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
1980               {
1981                 tre_ast_node_t *n, *tmp_node;
1982                 if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
1983                   break;
1984                 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1985                 if (n == NULL)
1986                   return REG_ESPACE;
1987                 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1988                 if (tmp_node == NULL)
1989                   return REG_ESPACE;
1990                 tmp_node->num_submatches = result->num_submatches;
1991                 result = tmp_node;
1992               }
1993             result->submatch_id = submatch_id;
1994             if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
1995               result->num_submatches++;
1996             break;
1997           }
1998 
1999         default:
2000           assert(0);
2001           break;
2002         }
2003     }
2004 
2005   /* Check for missing closing parentheses. */
2006   if (depth > 0)
2007     return REG_EPAREN;
2008 
2009   ctx->result = result;
2010 
2011   return REG_OK;
2012 }