1 /*
   2  * Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
  17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  20  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28 
  29 /*
  30   tre-match-parallel.c - TRE parallel regex matching engine
  31 */
  32 
  33 /*
  34   This algorithm searches for matches basically by reading characters
  35   in the searched string one by one, starting at the beginning.  All
  36   matching paths in the TNFA are traversed in parallel.  When two or
  37   more paths reach the same state, exactly one is chosen according to
  38   tag ordering rules; if returning submatches is not required it does
  39   not matter which path is chosen.
  40 
  41   The worst case time required for finding the leftmost and longest
  42   match, or determining that there is no match, is always linearly
  43   dependent on the length of the text being searched.
  44 
  45   This algorithm cannot handle TNFAs with back referencing nodes.
  46   See `tre-match-backtrack.c'.
  47 */
  48 
  49 #include <assert.h>
  50 #include <stdlib.h>
  51 #include <string.h>
  52 #include <wchar.h>
  53 #include <wctype.h>
  54 
  55 #include "tre-internal.h"
  56 #include "tre-match-utils.h"
  57 #include "tre.h"
  58 
  59 typedef struct {
  60   tre_tnfa_transition_t *state;
  61   tre_tag_t *tags;
  62 } tre_tnfa_reach_t;
  63 
  64 typedef struct {
  65   int pos;
  66   tre_tag_t **tags;
  67 } tre_reach_pos_t;
  68 
  69 #ifdef TRE_DEBUG
  70 static void
  71 tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags)
  72 {
  73   DPRINT((" %p", (void *)state));
  74   if (num_tags > 0)
  75     {
  76       DPRINT(("/"));
  77       tre_print_tags(tags, num_tags);
  78     }
  79 }
  80 
  81 static void
  82 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
  83 {
  84   while (reach->state != NULL)
  85     {
  86       tre_print_reach1(reach->state, reach->tags, num_tags);
  87       reach++;
  88     }
  89   DPRINT(("\n"));
  90 
  91 }
  92 #endif /* TRE_DEBUG */
  93 
  94 reg_errcode_t
  95 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
  96                       tre_str_type_t type, tre_tag_t *match_tags, int eflags,
  97                       int *match_end_ofs)
  98 {
  99   /* State variables required by GET_NEXT_WCHAR. */
 100   tre_char_t prev_c = 0, next_c = 0;
 101   const char *str_byte = string;
 102   int pos = -1;
 103   unsigned int pos_add_next = 1;
 104   const wchar_t *str_wide = string;
 105   mbstate_t mbstate;
 106   int reg_notbol = eflags & REG_NOTBOL;
 107   int reg_noteol = eflags & REG_NOTEOL;
 108   int reg_newline = tnfa->cflags & REG_NEWLINE;
 109 
 110   char *buf;
 111   tre_tnfa_transition_t *trans_i;
 112   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
 113   tre_reach_pos_t *reach_pos;
 114   int *tag_i;
 115   int num_tags, i;
 116 
 117   int match_eo = -1;       /* end offset of match (-1 if no match found yet) */
 118 #ifdef TRE_DEBUG
 119   int once;
 120 #endif /* TRE_DEBUG */
 121   tre_tag_t *tmp_tags = NULL;
 122   tre_tag_t *tmp_iptr;
 123   int tbytes;
 124   int touch = 1;
 125 
 126   memset(&mbstate, '\0', sizeof(mbstate));
 127 
 128   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
 129 
 130   if (!match_tags)
 131     num_tags = 0;
 132   else
 133     num_tags = tnfa->num_tags;
 134 
 135   /* Allocate memory for temporary data required for matching.  This needs to
 136      be done for every matching operation to be thread safe.  This allocates
 137      everything in a single large block from the stack frame using alloca()
 138      or with malloc() if alloca is unavailable. */
 139   {
 140     int rbytes, pbytes, total_bytes;
 141     char *tmp_buf;
 142     /* Compute the length of the block we need. */
 143     tbytes = sizeof(*tmp_tags) * num_tags;
 144     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
 145     pbytes = sizeof(*reach_pos) * tnfa->num_states;
 146     total_bytes =
 147       (sizeof(long) - 1) * 4 /* for alignment paddings */
 148       + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes;
 149 
 150     DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes));
 151     /* Allocate the memory. */
 152     buf = malloc((unsigned)total_bytes);
 153     if (buf == NULL)
 154       return REG_ESPACE;
 155     memset(buf, 0, (size_t)total_bytes);
 156 
 157     /* Get the various pointers within tmp_buf (properly aligned). */
 158     tmp_tags = (void *)buf;
 159     tmp_buf = buf + tbytes;
 160     tmp_buf += ALIGN(tmp_buf, long);
 161     reach_next = (void *)tmp_buf;
 162     tmp_buf += rbytes;
 163     tmp_buf += ALIGN(tmp_buf, long);
 164     reach = (void *)tmp_buf;
 165     tmp_buf += rbytes;
 166     tmp_buf += ALIGN(tmp_buf, long);
 167     reach_pos = (void *)tmp_buf;
 168     tmp_buf += pbytes;
 169     tmp_buf += ALIGN(tmp_buf, long);
 170     for (i = 0; i < tnfa->num_states; i++)
 171       {
 172         reach[i].tags = (void *)tmp_buf;
 173         tmp_buf += tbytes;
 174         reach_next[i].tags = (void *)tmp_buf;
 175         tmp_buf += tbytes;
 176       }
 177   }
 178 
 179   for (i = 0; i < tnfa->num_states; i++)
 180     reach_pos[i].pos = -1;
 181 
 182   /* If only one character can start a match, find it first. */
 183   if (tnfa->first_char >= 0 && str_byte)
 184     {
 185       const char *orig_str = str_byte;
 186       int first = tnfa->first_char;
 187       int found_high_bit = 0;
 188 
 189       if (type == STR_BYTE)
 190         {
 191           if (len >= 0)
 192             str_byte = memchr(orig_str, first, (size_t)len);
 193           else
 194             str_byte = strchr(orig_str, first);
 195         }
 196       else if (type == STR_MBS)
 197         {
 198           /*
 199            * If the match character is ASCII, try to match the character
 200            * directly, but if a high bit character is found, we stop there.
 201            */
 202           if (first < 0x80)
 203             {
 204               if (len >= 0)
 205                 {
 206                   int i;
 207                   for (i = 0; ; str_byte++, i++)
 208                     {
 209                       if (i >= len)
 210                         {
 211                           str_byte = NULL;
 212                           break;
 213                         }
 214                       if (*str_byte == first)
 215                         break;
 216                       if (*str_byte & 0x80)
 217                         {
 218                           found_high_bit = 1;
 219                           break;
 220                         }
 221                     }
 222                 }
 223               else
 224                 {
 225                   for (; ; str_byte++)
 226                     {
 227                       if (!*str_byte)
 228                         {
 229                           str_byte = NULL;
 230                           break;
 231                         }
 232                       if (*str_byte == first)
 233                         break;
 234                       if (*str_byte & 0x80)
 235                         {
 236                           found_high_bit = 1;
 237                           break;
 238                         }
 239                     }
 240                 }
 241             }
 242           else
 243             {
 244               if (len >= 0)
 245                 {
 246                   int i;
 247                   for (i = 0; ; str_byte++, i++)
 248                     {
 249                       if (i >= len)
 250                         {
 251                           str_byte = NULL;
 252                           break;
 253                         }
 254                       if (*str_byte & 0x80)
 255                         {
 256                           found_high_bit = 1;
 257                           break;
 258                         }
 259                     }
 260                 }
 261               else
 262                 {
 263                   for (; ; str_byte++)
 264                     {
 265                       if (!*str_byte)
 266                         {
 267                           str_byte = NULL;
 268                           break;
 269                         }
 270                       if (*str_byte & 0x80)
 271                         {
 272                           found_high_bit = 1;
 273                           break;
 274                         }
 275                     }
 276                 }
 277             }
 278         }
 279       if (str_byte == NULL)
 280         {
 281           if (buf)
 282             free(buf);
 283           return REG_NOMATCH;
 284         }
 285       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
 286       if (!found_high_bit)
 287         {
 288           if (str_byte >= orig_str + 1)
 289             prev_c = (unsigned char)*(str_byte - 1);
 290           next_c = (unsigned char)*str_byte;
 291           pos = str_byte - orig_str;
 292           if (len < 0 || pos < len)
 293             str_byte++;
 294         }
 295       else
 296         {
 297           if (str_byte == orig_str)
 298             goto no_first_optimization;
 299           /*
 300            * Back up one character, fix up the position, then call
 301            * GET_NEXT_WCHAR() to process the multibyte character.
 302            */
 303           /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */
 304           next_c = (unsigned char)*(str_byte - 1);
 305           pos = (str_byte - 1) - orig_str;
 306           GET_NEXT_WCHAR();
 307         }
 308     }
 309   else
 310     {
 311 no_first_optimization:
 312       GET_NEXT_WCHAR();
 313       pos = 0;
 314     }
 315 
 316   DPRINT(("length: %d\n", len));
 317   DPRINT(("pos:chr/code | states and tags\n"));
 318   DPRINT(("-------------+------------------------------------------------\n"));
 319 
 320   reach_next_i = reach_next;
 321   while (/*CONSTCOND*/1)
 322     {
 323       /* If no match found yet, add the initial states to `reach_next'. */
 324       if (match_eo < 0)
 325         {
 326           DPRINT((" init >"));
 327           trans_i = tnfa->initial;
 328           while (trans_i->state != NULL)
 329             {
 330               if (reach_pos[trans_i->state_id].pos < pos)
 331                 {
 332                   if (trans_i->assertions
 333                       && CHECK_ASSERTIONS(trans_i->assertions))
 334                     {
 335                       DPRINT(("assertion failed\n"));
 336                       trans_i++;
 337                       continue;
 338                     }
 339 
 340                   DPRINT((" %p", (void *)trans_i->state));
 341                   reach_next_i->state = trans_i->state;
 342                   memset(reach_next_i->tags, 0, tbytes);
 343                   tag_i = trans_i->tags;
 344                   if (tag_i)
 345                     {
 346                       while (*tag_i >= 0)
 347                         {
 348                           if (*tag_i < num_tags)
 349                             tre_tag_set(reach_next_i->tags, *tag_i, pos, touch);
 350                           tag_i++;
 351                         }
 352                         touch++;
 353                     }
 354                   if (reach_next_i->state == tnfa->final)
 355                     {
 356                       DPRINT(("  found empty match\n"));
 357                       match_eo = pos;
 358                       memcpy(match_tags, reach_next_i->tags, tbytes);
 359                     }
 360                   reach_pos[trans_i->state_id].pos = pos;
 361                   reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
 362                   reach_next_i++;
 363                 }
 364               trans_i++;
 365             }
 366           DPRINT(("\n"));
 367           reach_next_i->state = NULL;
 368         }
 369       else
 370         {
 371           if (num_tags == 0 || reach_next_i == reach_next)
 372             /* We have found a match. */
 373             break;
 374         }
 375 
 376       /* Check for end of string. */
 377       if (len < 0)
 378         {
 379           if (next_c == L'\0')
 380             break;
 381         }
 382       else
 383         {
 384           if (pos >= len)
 385             break;
 386         }
 387 
 388       GET_NEXT_WCHAR();
 389 
 390 #ifdef TRE_DEBUG
 391       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
 392       tre_print_reach(tnfa, reach_next, num_tags);
 393 #endif /* TRE_DEBUG */
 394 
 395       /* Swap `reach' and `reach_next'. */
 396       reach_i = reach;
 397       reach = reach_next;
 398       reach_next = reach_i;
 399 
 400 #ifdef TRE_DEBUG
 401       once = 0;
 402 #endif /* TRE_DEBUG */
 403 
 404       /* For each state in `reach' see if there is a transition leaving with
 405          the current input symbol to a state not yet in `reach_next', and
 406          add the destination states to `reach_next'. */
 407       reach_next_i = reach_next;
 408       for (reach_i = reach; reach_i->state; reach_i++)
 409         {
 410           for (trans_i = reach_i->state; trans_i->state; trans_i++)
 411             {
 412               /* Does this transition match the input symbol? */
 413               if (trans_i->code_min <= (tre_cint_t)prev_c &&
 414                   trans_i->code_max >= (tre_cint_t)prev_c)
 415                 {
 416                   if (trans_i->assertions
 417                       && (CHECK_ASSERTIONS(trans_i->assertions)
 418                           || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
 419                     {
 420                       DPRINT(("assertion failed\n"));
 421                       continue;
 422                     }
 423 
 424                   /* Compute the tags after this transition. */
 425                   memcpy(tmp_tags, reach_i->tags, tbytes);
 426                   tag_i = trans_i->tags;
 427                   if (tag_i != NULL)
 428                     {
 429                       while (*tag_i >= 0)
 430                         {
 431                           if (*tag_i < num_tags)
 432                             tre_tag_set(tmp_tags, *tag_i, pos, touch);
 433                           tag_i++;
 434                         }
 435                         touch++;
 436                     }
 437 
 438                   /* For each new transition, weed out those that don't
 439                      fulfill the minimal matching conditions. */
 440                   if (tnfa->num_minimals && match_eo >= 0)
 441                     {
 442                       int skip = 0;
 443 #ifdef TRE_DEBUG
 444                       if (!once)
 445                         {
 446                           DPRINT(("Checking minimal conditions: match_eo=%d "
 447                                   "match_tags=", match_eo));
 448                           tre_print_tags(match_tags, num_tags);
 449                           DPRINT(("\n"));
 450                           once++;
 451                         }
 452 #endif /* TRE_DEBUG */
 453                       for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
 454                         {
 455                            int end = tnfa->minimal_tags[i];
 456                            int start = tnfa->minimal_tags[i + 1];
 457                            DPRINT(("  Minimal start %d, end %d\n", start, end));
 458                            if (tre_minimal_tag_order(start, end, match_tags,
 459                                                      tmp_tags) > 0)
 460                              {
 461                                 skip = 1;
 462                                 break;
 463                              }
 464                         }
 465                       if (skip)
 466                         {
 467 #ifdef TRE_DEBUG
 468                            DPRINT(("     Throwing out"));
 469                            tre_print_reach1(reach_i->state, tmp_tags,
 470                                              num_tags);
 471                            DPRINT(("\n"));
 472 #endif /* TRE_DEBUG */
 473                            continue;
 474                         }
 475                     }
 476 
 477                   if (reach_pos[trans_i->state_id].pos < pos)
 478                     {
 479                       /* Found an unvisited node. */
 480                       reach_next_i->state = trans_i->state;
 481                       tmp_iptr = reach_next_i->tags;
 482                       reach_next_i->tags = tmp_tags;
 483                       tmp_tags = tmp_iptr;
 484                       reach_pos[trans_i->state_id].pos = pos;
 485                       reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
 486 
 487                       if (reach_next_i->state == tnfa->final
 488                           && (match_eo == -1
 489                               || (num_tags > 0
 490                                   && tre_tag_get(reach_next_i->tags, 0) <=
 491                                   tre_tag_get(match_tags, 0))))
 492                         {
 493 #ifdef TRE_DEBUG
 494                           DPRINT(("  found match"));
 495                           tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags);
 496                           DPRINT(("\n"));
 497 #endif /* TRE_DEBUG */
 498                           match_eo = pos;
 499                           memcpy(match_tags, reach_next_i->tags, tbytes);
 500                         }
 501                       reach_next_i++;
 502 
 503                     }
 504                   else
 505                     {
 506                       assert(reach_pos[trans_i->state_id].pos == pos);
 507                       /* Another path has also reached this state.  We choose
 508                          the winner by examining the tag values for both
 509                          paths. */
 510                       if (tre_tag_order(num_tags, tnfa->tag_directions,
 511                                         tmp_tags,
 512                                         *reach_pos[trans_i->state_id].tags))
 513                         {
 514                           /* The new path wins. */
 515                           tmp_iptr = *reach_pos[trans_i->state_id].tags;
 516                           *reach_pos[trans_i->state_id].tags = tmp_tags;
 517                           if (trans_i->state == tnfa->final)
 518                             {
 519 #ifdef TRE_DEBUG
 520                               DPRINT(("  found better match"));
 521                               tre_print_reach1(trans_i->state, tmp_tags, num_tags);
 522                               DPRINT(("\n"));
 523 #endif /* TRE_DEBUG */
 524                               match_eo = pos;
 525                               memcpy(match_tags, tmp_tags, tbytes);
 526                             }
 527                           tmp_tags = tmp_iptr;
 528                         }
 529                     }
 530                 }
 531             }
 532         }
 533       reach_next_i->state = NULL;
 534     }
 535 
 536   DPRINT(("match end offset = %d\n", match_eo));
 537 
 538   *match_end_ofs = match_eo;
 539 #ifdef TRE_DEBUG
 540   if (match_tags)
 541     {
 542       DPRINT(("Winning tags="));
 543       tre_print_tags_all(match_tags, num_tags);
 544       DPRINT((" touch=%d\n", touch));
 545     }
 546 #endif /* TRE_DEBUG */
 547 
 548   if (buf)
 549     free(buf);
 550 
 551   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
 552 }