1 /*
   2  * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  20  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28 
  29 /*
  30   tre-match-utils.h - TRE matcher helper definitions
  31 */
  32 
  33 #include "tre-internal.h"
  34 
  35 #define str_source ((const tre_str_source*)string)
  36 
  37 /*
  38  * Because all multibyte encodings are exclusively single-shift encoding,
  39  * with the shift codes having the high bit set, we can make an optimization
  40  * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
  41  * is detected, and just do a direct copy for ASCII characters.
  42  */
  43 #define GET_NEXT_WCHAR()                                                      \
  44   do {                                                                        \
  45     prev_c = next_c;                                                          \
  46     switch (type)                                                             \
  47       {                                                                       \
  48       case STR_BYTE:                                                          \
  49         pos++;                                                                \
  50         if (len >= 0 && pos >= len)                                             \
  51           next_c = '\0';                                                      \
  52         else                                                                  \
  53           next_c = (unsigned char)(*str_byte++);                              \
  54         break;                                                                \
  55       case STR_WIDE:                                                          \
  56         pos++;                                                                \
  57         if (len >= 0 && pos >= len)                                             \
  58           next_c = L'\0';                                                     \
  59         else                                                                  \
  60           next_c = *str_wide++;                                               \
  61         break;                                                                \
  62       case STR_MBS:                                                           \
  63         pos += pos_add_next;                                                  \
  64         if (__builtin_expect(len >= 0 && pos >= len, 0))                \
  65           {                                                                   \
  66             next_c = L'\0';                                                   \
  67             pos_add_next = 1;                                                 \
  68           }                                                                   \
  69         else if (__builtin_expect(!(*str_byte & 0x80), 1))                \
  70           {                                                                   \
  71             next_c = (unsigned char)(*str_byte++);                            \
  72             pos_add_next = 1;                                                 \
  73           }                                                                   \
  74         else                                                                  \
  75           {                                                                   \
  76             size_t w;                                                         \
  77             int max;                                                          \
  78             if (len >= 0)                                                  \
  79               max = len - pos;                                                \
  80             else                                                              \
  81               max = 32;                                                       \
  82             w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,       \
  83                               tnfa->loc);                                  \
  84             if (w == (size_t)-1 || w == (size_t)-2)                           \
  85               return REG_NOMATCH;                                             \
  86             if (w == 0 && len >= 0)                                        \
  87               {                                                               \
  88                 pos_add_next = 1;                                             \
  89                 next_c = 0;                                                   \
  90                 str_byte++;                                                   \
  91               }                                                               \
  92             else                                                              \
  93               {                                                               \
  94                 pos_add_next = w;                                             \
  95                 str_byte += w;                                                \
  96               }                                                               \
  97           }                                                                   \
  98         break;                                                                \
  99       }                                                                       \
 100   } while(/*CONSTCOND*/0)
 101 
 102 /* Assumes tre_tnfa_t *tnfa in scope */
 103 #define IS_WORD_CHAR(c)  ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
 104 
 105 #define CHECK_ASSERTIONS(assertions)                                          \
 106   (((assertions & ASSERT_AT_BOL)                                          \
 107     && (pos > 0 || reg_notbol)                                                     \
 108     && (prev_c != L'\n' || !reg_newline))                                     \
 109    || ((assertions & ASSERT_AT_EOL)                                       \
 110        && (next_c != L'\0' || reg_noteol)                                     \
 111        && (next_c != L'\n' || !reg_newline))                                  \
 112    || ((assertions & ASSERT_AT_BOW)                                       \
 113        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))                    \
 114    || ((assertions & ASSERT_AT_EOW)                                       \
 115        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))                    \
 116    || ((assertions & ASSERT_AT_WB)                                        \
 117        && (pos != 0 && next_c != L'\0'                                        \
 118            && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))                  \
 119    || ((assertions & ASSERT_AT_WB_NEG)                                            \
 120        && (pos == 0 || next_c == L'\0'                                        \
 121            || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
 122 
 123 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
 124   ((trans_i->assertions & ASSERT_BRACKET_MATCH)                               \
 125     && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c,    \
 126                                       tnfa))
 127 
 128 inline static int
 129 tre_tag_get(const tre_tag_t *tags, int i)
 130 {
 131   tags += i;
 132   return tags->count > 0 ? tags->value : -1;
 133 }
 134 
 135 inline static void
 136 tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
 137 {
 138   tags += i;
 139   if (tags->count++ == 0)
 140     tags->first = val;
 141   tags->value = val;
 142   tags->touch = touch;
 143 }
 144 
 145 inline static void
 146 tre_tag_reset(tre_tag_t *tags, int i)
 147 {
 148   tags[i].count = 0;
 149 }
 150 
 151 inline static int
 152 tre_tag_touch_get(const tre_tag_t *tags, int i)
 153 {
 154   return tags[i].touch;
 155 }
 156 
 157 #ifdef TRE_DEBUG
 158 inline static void
 159 tre_print_tags(const tre_tag_t *tags, int num_tags)
 160 {
 161   int i;
 162   for (i = 0; i < num_tags; i++, tags++)
 163     {
 164       switch(tags->count)
 165         {
 166         case 0:
 167           DPRINT(("%d:(0,-1)", i));
 168           break;
 169         case 1:
 170           DPRINT(("%d:(1,%d)", i, tags->first));
 171           break;
 172         default:
 173           DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
 174                   tags->value));
 175           break;
 176         }
 177       if (i < (num_tags - 1))
 178         DPRINT((" "));
 179     }
 180 }
 181 
 182 inline static void
 183 tre_print_tags_all(const tre_tag_t *tags, int num_tags)
 184 {
 185   int i;
 186   for (i = 0; i < num_tags; i++, tags++)
 187     {
 188       switch(tags->count)
 189         {
 190         case 0:
 191           DPRINT(("%d:(0,-1)/%d", i, tags->touch));
 192           break;
 193         case 1:
 194           DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
 195           break;
 196         default:
 197           DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
 198                   tags->value, tags->touch));
 199           break;
 200         }
 201       if (i < (num_tags - 1))
 202         DPRINT((" "));
 203     }
 204 }
 205 #endif /* TRE_DEBUG */
 206 
 207 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
 208  * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
 209 inline static int
 210 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
 211                       const tre_tag_t *tags2)
 212 {
 213   const tre_tag_t *t1, *t2;
 214 
 215   t1 = tags1 + start;
 216   t2 = tags2 + start;
 217   /* We need both start tags to be set */
 218   if (t1->count == 0 || t2->count == 0)
 219     return 0;
 220 
 221   /* The start tags must be equal */
 222   if (t1->value != t2->value)
 223     return 0;
 224 
 225   t1 = tags1 + end;
 226   t2 = tags2 + end;
 227   /* For the end tags, we prefer set over unset, because unset means that
 228    * the end tag is still growing */
 229   if (t1->count == 0)
 230     {
 231       /* if t2 is set, t1 loses since it is unset */
 232       if (t2->count != 0)
 233         return -1;
 234     }
 235   /* if t2 not set, t1 wins since it is set */
 236   else if (t2->count == 0)
 237     return 1;
 238 
 239   /* least current value wins */
 240   return t2->value - t1->value;
 241 }
 242 
 243 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
 244  * (t1 loses if < 0, t1 wins if > 0) */
 245 inline static int
 246 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
 247                 const tre_tag_t *t2)
 248 {
 249   int diff;
 250 
 251   t1 += i;
 252   t2 += i;
 253   switch (dir)
 254     {
 255     case TRE_TAG_MINIMIZE:
 256       /* least current value wins (because tags are initialized to all zeros,
 257        * unset wins over set; also, tre_minimal_tag_order() will have already
 258        * been run, which checks for being unset) */
 259       return t2->value - t1->value;
 260 
 261     case TRE_TAG_MAXIMIZE:
 262       /* prefer set */
 263       if (t1->count == 0)
 264         {
 265           /* if neither t1 and t2 are set, try next tag */
 266           if (t2->count == 0)
 267             return 0;
 268           /* t2 is set, t1 loses since it is unset */
 269           return -1;
 270         }
 271       /* if t2 not set, t1 wins since it is set */
 272       else if (t2->count == 0)
 273         return 1;
 274       /* greatest initial value wins */
 275       if ((diff = t1->first - t2->first) != 0)
 276         return diff;
 277       /* least number of times the tag was set, wins */
 278       if ((diff = t2->count - t1->count) != 0)
 279         return diff;
 280       /* if the tags were only set once, they only have initial values */
 281       if (t1->count == 1)
 282         return 0;
 283       /* greatest current value wins */
 284       return t1->value - t2->value;
 285 
 286     case TRE_TAG_LEFT_MAXIMIZE:
 287       /* prefer set */
 288       if (t1->count == 0)
 289         {
 290           /* if neither t1 and t2 are set, try next tag */
 291           if (t2->count == 0)
 292             return 0;
 293           /* t2 is set, t1 loses since it is unset */
 294           return -1;
 295         }
 296       /* if t2 not set, t1 wins since it is set */
 297       else if (t2->count == 0)
 298         return 1;
 299       /* least initial value wins */
 300       if ((diff = t2->first - t1->first) != 0)
 301         return diff;
 302       /* least number of times the tag was set, wins */
 303       if ((diff = t2->count - t1->count) != 0)
 304         return diff;
 305       /* if the tags were only set once, they only have initial values */
 306       if (t1->count == 1)
 307         return 0;
 308       /* greatest current value wins */
 309       return t1->value - t2->value;
 310 
 311     default:
 312       /* Shouldn't happen: only assert if TRE_DEBUG defined */
 313       assert(0);
 314       break;
 315     }
 316   return 0;
 317 }
 318 
 319 #ifdef TRE_DEBUG
 320 #define _MORE_DEBUGGING
 321 #endif /* TRE_DEBUG */
 322 
 323 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
 324 inline static int
 325 #ifdef _MORE_DEBUGGING
 326 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
 327               const tre_tag_t *t1, const tre_tag_t *t2)
 328 #else /* !_MORE_DEBUGGING */
 329 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
 330               const tre_tag_t *t1, const tre_tag_t *t2)
 331 #endif /* !_MORE_DEBUGGING */
 332 {
 333   int i, ret;
 334 
 335   for (i = 0; i < num_tags; i++)
 336     {
 337       if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
 338         return (ret > 0);
 339     }
 340   /*  assert(0);*/
 341   return 0;
 342 }
 343 
 344 #ifdef _MORE_DEBUGGING
 345 inline static int
 346 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
 347               const tre_tag_t *t1, const tre_tag_t *t2)
 348 {
 349   int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
 350   DPRINT(("tre_tag_order: "));
 351   tre_print_tags(t1, num_tags);
 352   DPRINT((" %s ", ret ? "wins" : "doesn't win"));
 353   tre_print_tags(t2, num_tags);
 354   DPRINT(("\n"));
 355   return ret;
 356 }
 357 #endif /* _MORE_DEBUGGING */
 358 
 359 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
 360 
 361 inline static int
 362 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
 363                   const tre_tnfa_t * __restrict tnfa)
 364 {
 365   int match = 0;
 366   int i;
 367   tre_bracket_match_t *b;
 368   tre_cint_t uc, lc;
 369   int we, ue, le, got_equiv = 0;
 370   int icase = ((tnfa->cflags & REG_ICASE) != 0);
 371 
 372   DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
 373   if (icase)
 374     {
 375       if (tre_islower_l(wc, tnfa->loc))
 376         {
 377           lc = wc;
 378           uc = tre_toupper_l(wc, tnfa->loc);
 379         }
 380       else if (tre_isupper_l(wc, tnfa->loc))
 381         {
 382           uc = wc;
 383           lc = tre_tolower_l(wc, tnfa->loc);
 384         }
 385       else
 386         {
 387           icase = 0;
 388         }
 389     }
 390   for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
 391        i++, b++)
 392     {
 393       switch (b->type)
 394         {
 395         case TRE_BRACKET_MATCH_TYPE_CHAR:
 396           if (icase)
 397             match = (b->value == uc || b->value == lc);
 398           else
 399             match = (b->value == wc);
 400           break;
 401         case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
 402           {
 403             tre_cint_t start = b->value, end;
 404             if (++i >= list->num_bracket_matches ||
 405                 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
 406               {
 407                 DPRINT(("tre_bracket_match: no following range end\n"));
 408                 assert(0);
 409                 goto error;
 410               }
 411             end = b->value;
 412             if (!got_equiv)
 413               {
 414                 if (icase)
 415                   {
 416                     ue = __collate_equiv_value(tnfa->loc, &uc, 1);
 417                     le = __collate_equiv_value(tnfa->loc, &lc, 1);
 418                   }
 419                 else
 420                     we = __collate_equiv_value(tnfa->loc, &wc, 1);
 421                 got_equiv = 1;
 422               }
 423             if (icase)
 424               match = ((start <= ue && ue <= end) ||
 425                       (start <= le && le <= end));
 426             else
 427               match = (start <= we && we <= end);
 428             break;
 429           }
 430         case TRE_BRACKET_MATCH_TYPE_RANGE_END:
 431           DPRINT(("tre_bracket_match: range end without preceeding start\n"));
 432           assert(0);
 433           break;
 434         case TRE_BRACKET_MATCH_TYPE_CLASS:
 435           if (icase)
 436             match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
 437                      tre_isctype_l(lc, b->value, tnfa->loc));
 438           else
 439             match = (tre_isctype_l(wc, b->value, tnfa->loc));
 440           break;
 441         case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
 442           if (!got_equiv)
 443             {
 444               if (icase)
 445                 {
 446                   ue = __collate_equiv_value(tnfa->loc, &uc, 1);
 447                   le = __collate_equiv_value(tnfa->loc, &lc, 1);
 448                 }
 449               else
 450                   we = __collate_equiv_value(tnfa->loc, &wc, 1);
 451               got_equiv = 1;
 452             }
 453           if (icase)
 454             match = (b->value == ue || b->value == le);
 455           else
 456             match = (b->value == we);
 457           break;
 458         default:
 459           DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
 460           assert(0);
 461           break;
 462         }
 463         if (match)
 464           break;
 465     }
 466 error:
 467   if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
 468     if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
 469     match = !match;
 470   }
 471   return match;
 472 }