1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 Free Software 3 Foundation, Inc. 4 This file is part of the GNU C Library. 5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program; if not, write to the Free Software Foundation, 19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 20 21 static void re_string_construct_common (const char *str, Idx len, 22 re_string_t *pstr, 23 RE_TRANSLATE_TYPE trans, bool icase, 24 const re_dfa_t *dfa) internal_function; 25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 26 const re_node_set *nodes, 27 re_hashval_t hash) internal_function; 28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 29 const re_node_set *nodes, 30 unsigned int context, 31 re_hashval_t hash) internal_function; 32 33 /* Functions for string operation. */ 34 35 /* This function allocate the buffers. It is necessary to call 36 re_string_reconstruct before using the object. */ 37 38 static reg_errcode_t 39 internal_function 40 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 41 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 42 { 43 reg_errcode_t ret; 44 Idx init_buf_len; 45 46 /* Ensure at least one character fits into the buffers. */ 47 if (init_len < dfa->mb_cur_max) 48 init_len = dfa->mb_cur_max; 49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 50 re_string_construct_common (str, len, pstr, trans, icase, dfa); 51 52 ret = re_string_realloc_buffers (pstr, init_buf_len); 53 if (BE (ret != REG_NOERROR, 0)) 54 return ret; 55 56 pstr->word_char = dfa->word_char; 57 pstr->word_ops_used = dfa->word_ops_used; 58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 60 pstr->valid_raw_len = pstr->valid_len; 61 return REG_NOERROR; 62 } 63 64 /* This function allocate the buffers, and initialize them. */ 65 66 static reg_errcode_t 67 internal_function 68 re_string_construct (re_string_t *pstr, const char *str, Idx len, 69 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 70 { 71 reg_errcode_t ret; 72 memset (pstr, '\0', sizeof (re_string_t)); 73 re_string_construct_common (str, len, pstr, trans, icase, dfa); 74 75 if (len > 0) 76 { 77 ret = re_string_realloc_buffers (pstr, len + 1); 78 if (BE (ret != REG_NOERROR, 0)) 79 return ret; 80 } 81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 82 83 if (icase) 84 { 85 #ifdef RE_ENABLE_I18N 86 if (dfa->mb_cur_max > 1) 87 { 88 while (1) 89 { 90 ret = build_wcs_upper_buffer (pstr); 91 if (BE (ret != REG_NOERROR, 0)) 92 return ret; 93 if (pstr->valid_raw_len >= len) 94 break; 95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 96 break; 97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 98 if (BE (ret != REG_NOERROR, 0)) 99 return ret; 100 } 101 } 102 else 103 #endif /* RE_ENABLE_I18N */ 104 build_upper_buffer (pstr); 105 } 106 else 107 { 108 #ifdef RE_ENABLE_I18N 109 if (dfa->mb_cur_max > 1) 110 build_wcs_buffer (pstr); 111 else 112 #endif /* RE_ENABLE_I18N */ 113 { 114 if (trans != NULL) 115 re_string_translate_buffer (pstr); 116 else 117 { 118 pstr->valid_len = pstr->bufs_len; 119 pstr->valid_raw_len = pstr->bufs_len; 120 } 121 } 122 } 123 124 return REG_NOERROR; 125 } 126 127 /* Helper functions for re_string_allocate, and re_string_construct. */ 128 129 static reg_errcode_t 130 internal_function 131 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 132 { 133 #ifdef RE_ENABLE_I18N 134 if (pstr->mb_cur_max > 1) 135 { 136 wint_t *new_wcs; 137 138 /* Avoid overflow. */ 139 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 140 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 141 return REG_ESPACE; 142 143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 144 if (BE (new_wcs == NULL, 0)) 145 return REG_ESPACE; 146 pstr->wcs = new_wcs; 147 if (pstr->offsets != NULL) 148 { 149 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 150 if (BE (new_offsets == NULL, 0)) 151 return REG_ESPACE; 152 pstr->offsets = new_offsets; 153 } 154 } 155 #endif /* RE_ENABLE_I18N */ 156 if (pstr->mbs_allocated) 157 { 158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 159 new_buf_len); 160 if (BE (new_mbs == NULL, 0)) 161 return REG_ESPACE; 162 pstr->mbs = new_mbs; 163 } 164 pstr->bufs_len = new_buf_len; 165 return REG_NOERROR; 166 } 167 168 169 static void 170 internal_function 171 re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 172 RE_TRANSLATE_TYPE trans, bool icase, 173 const re_dfa_t *dfa) 174 { 175 pstr->raw_mbs = (const unsigned char *) str; 176 pstr->len = len; 177 pstr->raw_len = len; 178 pstr->trans = trans; 179 pstr->icase = icase; 180 pstr->mbs_allocated = (trans != NULL || icase); 181 pstr->mb_cur_max = dfa->mb_cur_max; 182 pstr->is_utf8 = dfa->is_utf8; 183 pstr->map_notascii = dfa->map_notascii; 184 pstr->stop = pstr->len; 185 pstr->raw_stop = pstr->stop; 186 } 187 188 #ifdef RE_ENABLE_I18N 189 190 /* Build wide character buffer PSTR->WCS. 191 If the byte sequence of the string are: 192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 193 Then wide character buffer will be: 194 <wc1> , WEOF , <wc2> , WEOF , <wc3> 195 We use WEOF for padding, they indicate that the position isn't 196 a first byte of a multibyte character. 197 198 Note that this function assumes PSTR->VALID_LEN elements are already 199 built and starts from PSTR->VALID_LEN. */ 200 201 static void 202 internal_function 203 build_wcs_buffer (re_string_t *pstr) 204 { 205 #ifdef _LIBC 206 unsigned char buf[MB_LEN_MAX]; 207 assert (MB_LEN_MAX >= pstr->mb_cur_max); 208 #else 209 unsigned char buf[64]; 210 #endif 211 mbstate_t prev_st; 212 Idx byte_idx, end_idx, remain_len; 213 size_t mbclen; 214 215 /* Build the buffers from pstr->valid_len to either pstr->len or 216 pstr->bufs_len. */ 217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 219 { 220 wchar_t wc; 221 const char *p; 222 223 remain_len = end_idx - byte_idx; 224 prev_st = pstr->cur_state; 225 /* Apply the translation if we need. */ 226 if (BE (pstr->trans != NULL, 0)) 227 { 228 int i, ch; 229 230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 231 { 232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 234 } 235 p = (const char *) buf; 236 } 237 else 238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 239 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); 240 if (BE (mbclen == (size_t) -2, 0)) 241 { 242 /* The buffer doesn't have enough space, finish to build. */ 243 pstr->cur_state = prev_st; 244 break; 245 } 246 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 247 { 248 /* We treat these cases as a singlebyte character. */ 249 mbclen = 1; 250 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 251 if (BE (pstr->trans != NULL, 0)) 252 wc = pstr->trans[wc]; 253 pstr->cur_state = prev_st; 254 } 255 256 /* Write wide character and padding. */ 257 pstr->wcs[byte_idx++] = wc; 258 /* Write paddings. */ 259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 260 pstr->wcs[byte_idx++] = WEOF; 261 } 262 pstr->valid_len = byte_idx; 263 pstr->valid_raw_len = byte_idx; 264 } 265 266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 267 but for REG_ICASE. */ 268 269 static reg_errcode_t 270 internal_function 271 build_wcs_upper_buffer (re_string_t *pstr) 272 { 273 mbstate_t prev_st; 274 Idx src_idx, byte_idx, end_idx, remain_len; 275 size_t mbclen; 276 #ifdef _LIBC 277 char buf[MB_LEN_MAX]; 278 assert (MB_LEN_MAX >= pstr->mb_cur_max); 279 #else 280 char buf[64]; 281 #endif 282 283 byte_idx = pstr->valid_len; 284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 285 286 /* The following optimization assumes that ASCII characters can be 287 mapped to wide characters with a simple cast. */ 288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 289 { 290 while (byte_idx < end_idx) 291 { 292 wchar_t wc; 293 294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 295 && mbsinit (&pstr->cur_state)) 296 { 297 /* In case of a singlebyte character. */ 298 pstr->mbs[byte_idx] 299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 300 /* The next step uses the assumption that wchar_t is encoded 301 ASCII-safe: all ASCII values can be converted like this. */ 302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 303 ++byte_idx; 304 continue; 305 } 306 307 remain_len = end_idx - byte_idx; 308 prev_st = pstr->cur_state; 309 mbclen = mbrtowc (&wc, 310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 311 + byte_idx), remain_len, &pstr->cur_state); 312 if (BE (mbclen < (size_t) -2, 1)) 313 { 314 wchar_t wcu = wc; 315 if (iswlower (wc)) 316 { 317 size_t mbcdlen; 318 319 wcu = towupper (wc); 320 mbcdlen = wcrtomb (buf, wcu, &prev_st); 321 if (BE (mbclen == mbcdlen, 1)) 322 memcpy (pstr->mbs + byte_idx, buf, mbclen); 323 else 324 { 325 src_idx = byte_idx; 326 goto offsets_needed; 327 } 328 } 329 else 330 memcpy (pstr->mbs + byte_idx, 331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 332 pstr->wcs[byte_idx++] = wcu; 333 /* Write paddings. */ 334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 335 pstr->wcs[byte_idx++] = WEOF; 336 } 337 else if (mbclen == (size_t) -1 || mbclen == 0) 338 { 339 /* It is an invalid character or '\0'. Just use the byte. */ 340 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 341 pstr->mbs[byte_idx] = ch; 342 /* And also cast it to wide char. */ 343 pstr->wcs[byte_idx++] = (wchar_t) ch; 344 if (BE (mbclen == (size_t) -1, 0)) 345 pstr->cur_state = prev_st; 346 } 347 else 348 { 349 /* The buffer doesn't have enough space, finish to build. */ 350 pstr->cur_state = prev_st; 351 break; 352 } 353 } 354 pstr->valid_len = byte_idx; 355 pstr->valid_raw_len = byte_idx; 356 return REG_NOERROR; 357 } 358 else 359 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 360 { 361 wchar_t wc; 362 const char *p; 363 offsets_needed: 364 remain_len = end_idx - byte_idx; 365 prev_st = pstr->cur_state; 366 if (BE (pstr->trans != NULL, 0)) 367 { 368 int i, ch; 369 370 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 371 { 372 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 373 buf[i] = pstr->trans[ch]; 374 } 375 p = (const char *) buf; 376 } 377 else 378 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 379 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); 380 if (BE (mbclen < (size_t) -2, 1)) 381 { 382 wchar_t wcu = wc; 383 if (iswlower (wc)) 384 { 385 size_t mbcdlen; 386 387 wcu = towupper (wc); 388 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 389 if (BE (mbclen == mbcdlen, 1)) 390 memcpy (pstr->mbs + byte_idx, buf, mbclen); 391 else if (mbcdlen != (size_t) -1) 392 { 393 size_t i; 394 395 if (byte_idx + mbcdlen > pstr->bufs_len) 396 { 397 pstr->cur_state = prev_st; 398 break; 399 } 400 401 if (pstr->offsets == NULL) 402 { 403 pstr->offsets = re_malloc (Idx, pstr->bufs_len); 404 405 if (pstr->offsets == NULL) 406 return REG_ESPACE; 407 } 408 if (!pstr->offsets_needed) 409 { 410 for (i = 0; i < (size_t) byte_idx; ++i) 411 pstr->offsets[i] = i; 412 pstr->offsets_needed = 1; 413 } 414 415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 416 pstr->wcs[byte_idx] = wcu; 417 pstr->offsets[byte_idx] = src_idx; 418 for (i = 1; i < mbcdlen; ++i) 419 { 420 pstr->offsets[byte_idx + i] 421 = src_idx + (i < mbclen ? i : mbclen - 1); 422 pstr->wcs[byte_idx + i] = WEOF; 423 } 424 pstr->len += mbcdlen - mbclen; 425 if (pstr->raw_stop > src_idx) 426 pstr->stop += mbcdlen - mbclen; 427 end_idx = (pstr->bufs_len > pstr->len) 428 ? pstr->len : pstr->bufs_len; 429 byte_idx += mbcdlen; 430 src_idx += mbclen; 431 continue; 432 } 433 else 434 memcpy (pstr->mbs + byte_idx, p, mbclen); 435 } 436 else 437 memcpy (pstr->mbs + byte_idx, p, mbclen); 438 439 if (BE (pstr->offsets_needed != 0, 0)) 440 { 441 size_t i; 442 for (i = 0; i < mbclen; ++i) 443 pstr->offsets[byte_idx + i] = src_idx + i; 444 } 445 src_idx += mbclen; 446 447 pstr->wcs[byte_idx++] = wcu; 448 /* Write paddings. */ 449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 450 pstr->wcs[byte_idx++] = WEOF; 451 } 452 else if (mbclen == (size_t) -1 || mbclen == 0) 453 { 454 /* It is an invalid character or '\0'. Just use the byte. */ 455 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 456 457 if (BE (pstr->trans != NULL, 0)) 458 ch = pstr->trans [ch]; 459 pstr->mbs[byte_idx] = ch; 460 461 if (BE (pstr->offsets_needed != 0, 0)) 462 pstr->offsets[byte_idx] = src_idx; 463 ++src_idx; 464 465 /* And also cast it to wide char. */ 466 pstr->wcs[byte_idx++] = (wchar_t) ch; 467 if (BE (mbclen == (size_t) -1, 0)) 468 pstr->cur_state = prev_st; 469 } 470 else 471 { 472 /* The buffer doesn't have enough space, finish to build. */ 473 pstr->cur_state = prev_st; 474 break; 475 } 476 } 477 pstr->valid_len = byte_idx; 478 pstr->valid_raw_len = src_idx; 479 return REG_NOERROR; 480 } 481 482 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 483 Return the index. */ 484 485 static Idx 486 internal_function 487 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 488 { 489 mbstate_t prev_st; 490 Idx rawbuf_idx; 491 size_t mbclen; 492 wint_t wc = WEOF; 493 494 /* Skip the characters which are not necessary to check. */ 495 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 496 rawbuf_idx < new_raw_idx;) 497 { 498 wchar_t wc2; 499 Idx remain_len; 500 remain_len = pstr->len - rawbuf_idx; 501 prev_st = pstr->cur_state; 502 mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 503 remain_len, &pstr->cur_state); 504 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 505 { 506 /* We treat these cases as a single byte character. */ 507 if (mbclen == 0 || remain_len == 0) 508 wc = L'\0'; 509 else 510 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 511 mbclen = 1; 512 pstr->cur_state = prev_st; 513 } 514 else 515 wc = wc2; 516 /* Then proceed the next character. */ 517 rawbuf_idx += mbclen; 518 } 519 *last_wc = wc; 520 return rawbuf_idx; 521 } 522 #endif /* RE_ENABLE_I18N */ 523 524 /* Build the buffer PSTR->MBS, and apply the translation if we need. 525 This function is used in case of REG_ICASE. */ 526 527 static void 528 internal_function 529 build_upper_buffer (re_string_t *pstr) 530 { 531 Idx char_idx, end_idx; 532 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 533 534 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 535 { 536 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 537 if (BE (pstr->trans != NULL, 0)) 538 ch = pstr->trans[ch]; 539 if (islower (ch)) 540 pstr->mbs[char_idx] = toupper (ch); 541 else 542 pstr->mbs[char_idx] = ch; 543 } 544 pstr->valid_len = char_idx; 545 pstr->valid_raw_len = char_idx; 546 } 547 548 /* Apply TRANS to the buffer in PSTR. */ 549 550 static void 551 internal_function 552 re_string_translate_buffer (re_string_t *pstr) 553 { 554 Idx buf_idx, end_idx; 555 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 556 557 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 558 { 559 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 560 pstr->mbs[buf_idx] = pstr->trans[ch]; 561 } 562 563 pstr->valid_len = buf_idx; 564 pstr->valid_raw_len = buf_idx; 565 } 566 567 /* This function re-construct the buffers. 568 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 569 convert to upper case in case of REG_ICASE, apply translation. */ 570 571 static reg_errcode_t 572 internal_function 573 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 574 { 575 Idx offset; 576 577 if (BE (pstr->raw_mbs_idx <= idx, 0)) 578 offset = idx - pstr->raw_mbs_idx; 579 else 580 { 581 /* Reset buffer. */ 582 #ifdef RE_ENABLE_I18N 583 if (pstr->mb_cur_max > 1) 584 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 585 #endif /* RE_ENABLE_I18N */ 586 pstr->len = pstr->raw_len; 587 pstr->stop = pstr->raw_stop; 588 pstr->valid_len = 0; 589 pstr->raw_mbs_idx = 0; 590 pstr->valid_raw_len = 0; 591 pstr->offsets_needed = 0; 592 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 593 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 594 if (!pstr->mbs_allocated) 595 pstr->mbs = (unsigned char *) pstr->raw_mbs; 596 offset = idx; 597 } 598 599 if (BE (offset != 0, 1)) 600 { 601 /* Should the already checked characters be kept? */ 602 if (BE (offset < pstr->valid_raw_len, 1)) 603 { 604 /* Yes, move them to the front of the buffer. */ 605 #ifdef RE_ENABLE_I18N 606 if (BE (pstr->offsets_needed, 0)) 607 { 608 Idx low = 0, high = pstr->valid_len, mid; 609 do 610 { 611 mid = (high + low) / 2; 612 if (pstr->offsets[mid] > offset) 613 high = mid; 614 else if (pstr->offsets[mid] < offset) 615 low = mid + 1; 616 else 617 break; 618 } 619 while (low < high); 620 if (pstr->offsets[mid] < offset) 621 ++mid; 622 pstr->tip_context = re_string_context_at (pstr, mid - 1, 623 eflags); 624 /* This can be quite complicated, so handle specially 625 only the common and easy case where the character with 626 different length representation of lower and upper 627 case is present at or after offset. */ 628 if (pstr->valid_len > offset 629 && mid == offset && pstr->offsets[mid] == offset) 630 { 631 memmove (pstr->wcs, pstr->wcs + offset, 632 (pstr->valid_len - offset) * sizeof (wint_t)); 633 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 634 pstr->valid_len -= offset; 635 pstr->valid_raw_len -= offset; 636 for (low = 0; low < pstr->valid_len; low++) 637 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 638 } 639 else 640 { 641 /* Otherwise, just find out how long the partial multibyte 642 character at offset is and fill it with WEOF/255. */ 643 pstr->len = pstr->raw_len - idx + offset; 644 pstr->stop = pstr->raw_stop - idx + offset; 645 pstr->offsets_needed = 0; 646 while (mid > 0 && pstr->offsets[mid - 1] == offset) 647 --mid; 648 while (mid < pstr->valid_len) 649 if (pstr->wcs[mid] != WEOF) 650 break; 651 else 652 ++mid; 653 if (mid == pstr->valid_len) 654 pstr->valid_len = 0; 655 else 656 { 657 pstr->valid_len = pstr->offsets[mid] - offset; 658 if (pstr->valid_len) 659 { 660 for (low = 0; low < pstr->valid_len; ++low) 661 pstr->wcs[low] = WEOF; 662 memset (pstr->mbs, 255, pstr->valid_len); 663 } 664 } 665 pstr->valid_raw_len = pstr->valid_len; 666 } 667 } 668 else 669 #endif 670 { 671 pstr->tip_context = re_string_context_at (pstr, offset - 1, 672 eflags); 673 #ifdef RE_ENABLE_I18N 674 if (pstr->mb_cur_max > 1) 675 memmove (pstr->wcs, pstr->wcs + offset, 676 (pstr->valid_len - offset) * sizeof (wint_t)); 677 #endif /* RE_ENABLE_I18N */ 678 if (BE (pstr->mbs_allocated, 0)) 679 memmove (pstr->mbs, pstr->mbs + offset, 680 pstr->valid_len - offset); 681 pstr->valid_len -= offset; 682 pstr->valid_raw_len -= offset; 683 #if DEBUG 684 assert (pstr->valid_len > 0); 685 #endif 686 } 687 } 688 else 689 { 690 /* No, skip all characters until IDX. */ 691 Idx prev_valid_len = pstr->valid_len; 692 693 #ifdef RE_ENABLE_I18N 694 if (BE (pstr->offsets_needed, 0)) 695 { 696 pstr->len = pstr->raw_len - idx + offset; 697 pstr->stop = pstr->raw_stop - idx + offset; 698 pstr->offsets_needed = 0; 699 } 700 #endif 701 pstr->valid_len = 0; 702 #ifdef RE_ENABLE_I18N 703 if (pstr->mb_cur_max > 1) 704 { 705 Idx wcs_idx; 706 wint_t wc = WEOF; 707 708 if (pstr->is_utf8) 709 { 710 const unsigned char *raw, *p, *end; 711 712 /* Special case UTF-8. Multi-byte chars start with any 713 byte other than 0x80 - 0xbf. */ 714 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 715 end = raw + (offset - pstr->mb_cur_max); 716 if (end < pstr->raw_mbs) 717 end = pstr->raw_mbs; 718 p = raw + offset - 1; 719 #ifdef _LIBC 720 /* We know the wchar_t encoding is UCS4, so for the simple 721 case, ASCII characters, skip the conversion step. */ 722 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 723 { 724 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 725 /* pstr->valid_len = 0; */ 726 wc = (wchar_t) *p; 727 } 728 else 729 #endif 730 for (; p >= end; --p) 731 if ((*p & 0xc0) != 0x80) 732 { 733 mbstate_t cur_state; 734 wchar_t wc2; 735 Idx mlen = raw + pstr->len - p; 736 unsigned char buf[6]; 737 size_t mbclen; 738 739 if (BE (pstr->trans != NULL, 0)) 740 { 741 int i = mlen < 6 ? mlen : 6; 742 while (--i >= 0) 743 buf[i] = pstr->trans[p[i]]; 744 } 745 /* XXX Don't use mbrtowc, we know which conversion 746 to use (UTF-8 -> UCS4). */ 747 memset (&cur_state, 0, sizeof (cur_state)); 748 mbclen = mbrtowc (&wc2, (const char *) p, mlen, 749 &cur_state); 750 if (raw + offset - p <= mbclen 751 && mbclen < (size_t) -2) 752 { 753 memset (&pstr->cur_state, '\0', 754 sizeof (mbstate_t)); 755 pstr->valid_len = mbclen - (raw + offset - p); 756 wc = wc2; 757 } 758 break; 759 } 760 } 761 762 if (wc == WEOF) 763 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 764 if (wc == WEOF) 765 pstr->tip_context 766 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 767 else 768 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 769 && IS_WIDE_WORD_CHAR (wc)) 770 ? CONTEXT_WORD 771 : ((IS_WIDE_NEWLINE (wc) 772 && pstr->newline_anchor) 773 ? CONTEXT_NEWLINE : 0)); 774 if (BE (pstr->valid_len, 0)) 775 { 776 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 777 pstr->wcs[wcs_idx] = WEOF; 778 if (pstr->mbs_allocated) 779 memset (pstr->mbs, 255, pstr->valid_len); 780 } 781 pstr->valid_raw_len = pstr->valid_len; 782 } 783 else 784 #endif /* RE_ENABLE_I18N */ 785 { 786 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 787 pstr->valid_raw_len = 0; 788 if (pstr->trans) 789 c = pstr->trans[c]; 790 pstr->tip_context = (bitset_contain (pstr->word_char, c) 791 ? CONTEXT_WORD 792 : ((IS_NEWLINE (c) && pstr->newline_anchor) 793 ? CONTEXT_NEWLINE : 0)); 794 } 795 } 796 if (!BE (pstr->mbs_allocated, 0)) 797 pstr->mbs += offset; 798 } 799 pstr->raw_mbs_idx = idx; 800 pstr->len -= offset; 801 pstr->stop -= offset; 802 803 /* Then build the buffers. */ 804 #ifdef RE_ENABLE_I18N 805 if (pstr->mb_cur_max > 1) 806 { 807 if (pstr->icase) 808 { 809 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 810 if (BE (ret != REG_NOERROR, 0)) 811 return ret; 812 } 813 else 814 build_wcs_buffer (pstr); 815 } 816 else 817 #endif /* RE_ENABLE_I18N */ 818 if (BE (pstr->mbs_allocated, 0)) 819 { 820 if (pstr->icase) 821 build_upper_buffer (pstr); 822 else if (pstr->trans != NULL) 823 re_string_translate_buffer (pstr); 824 } 825 else 826 pstr->valid_len = pstr->len; 827 828 pstr->cur_idx = 0; 829 return REG_NOERROR; 830 } 831 832 static unsigned char 833 internal_function __attribute ((pure)) 834 re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 835 { 836 int ch; 837 Idx off; 838 839 /* Handle the common (easiest) cases first. */ 840 if (BE (!pstr->mbs_allocated, 1)) 841 return re_string_peek_byte (pstr, idx); 842 843 #ifdef RE_ENABLE_I18N 844 if (pstr->mb_cur_max > 1 845 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 846 return re_string_peek_byte (pstr, idx); 847 #endif 848 849 off = pstr->cur_idx + idx; 850 #ifdef RE_ENABLE_I18N 851 if (pstr->offsets_needed) 852 off = pstr->offsets[off]; 853 #endif 854 855 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 856 857 #ifdef RE_ENABLE_I18N 858 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 859 this function returns CAPITAL LETTER I instead of first byte of 860 DOTLESS SMALL LETTER I. The latter would confuse the parser, 861 since peek_byte_case doesn't advance cur_idx in any way. */ 862 if (pstr->offsets_needed && !isascii (ch)) 863 return re_string_peek_byte (pstr, idx); 864 #endif 865 866 return ch; 867 } 868 869 static unsigned char 870 internal_function __attribute ((pure)) 871 re_string_fetch_byte_case (re_string_t *pstr) 872 { 873 if (BE (!pstr->mbs_allocated, 1)) 874 return re_string_fetch_byte (pstr); 875 876 #ifdef RE_ENABLE_I18N 877 if (pstr->offsets_needed) 878 { 879 Idx off; 880 int ch; 881 882 /* For tr_TR.UTF-8 [[:islower:]] there is 883 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 884 in that case the whole multi-byte character and return 885 the original letter. On the other side, with 886 [[: DOTLESS SMALL LETTER I return [[:I, as doing 887 anything else would complicate things too much. */ 888 889 if (!re_string_first_byte (pstr, pstr->cur_idx)) 890 return re_string_fetch_byte (pstr); 891 892 off = pstr->offsets[pstr->cur_idx]; 893 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 894 895 if (! isascii (ch)) 896 return re_string_fetch_byte (pstr); 897 898 re_string_skip_bytes (pstr, 899 re_string_char_size_at (pstr, pstr->cur_idx)); 900 return ch; 901 } 902 #endif 903 904 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 905 } 906 907 static void 908 internal_function 909 re_string_destruct (re_string_t *pstr) 910 { 911 #ifdef RE_ENABLE_I18N 912 re_free (pstr->wcs); 913 re_free (pstr->offsets); 914 #endif /* RE_ENABLE_I18N */ 915 if (pstr->mbs_allocated) 916 re_free (pstr->mbs); 917 } 918 919 /* Return the context at IDX in INPUT. */ 920 921 static unsigned int 922 internal_function 923 re_string_context_at (const re_string_t *input, Idx idx, int eflags) 924 { 925 int c; 926 if (BE (! REG_VALID_INDEX (idx), 0)) 927 /* In this case, we use the value stored in input->tip_context, 928 since we can't know the character in input->mbs[-1] here. */ 929 return input->tip_context; 930 if (BE (idx == input->len, 0)) 931 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 932 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 933 #ifdef RE_ENABLE_I18N 934 if (input->mb_cur_max > 1) 935 { 936 wint_t wc; 937 Idx wc_idx = idx; 938 while(input->wcs[wc_idx] == WEOF) 939 { 940 #ifdef DEBUG 941 /* It must not happen. */ 942 assert (REG_VALID_INDEX (wc_idx)); 943 #endif 944 --wc_idx; 945 if (! REG_VALID_INDEX (wc_idx)) 946 return input->tip_context; 947 } 948 wc = input->wcs[wc_idx]; 949 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 950 return CONTEXT_WORD; 951 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 952 ? CONTEXT_NEWLINE : 0); 953 } 954 else 955 #endif 956 { 957 c = re_string_byte_at (input, idx); 958 if (bitset_contain (input->word_char, c)) 959 return CONTEXT_WORD; 960 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 961 } 962 } 963 964 /* Functions for set operation. */ 965 966 static reg_errcode_t 967 internal_function 968 re_node_set_alloc (re_node_set *set, Idx size) 969 { 970 set->alloc = size; 971 set->nelem = 0; 972 set->elems = re_malloc (Idx, size); 973 if (BE (set->elems == NULL, 0)) 974 return REG_ESPACE; 975 return REG_NOERROR; 976 } 977 978 static reg_errcode_t 979 internal_function 980 re_node_set_init_1 (re_node_set *set, Idx elem) 981 { 982 set->alloc = 1; 983 set->nelem = 1; 984 set->elems = re_malloc (Idx, 1); 985 if (BE (set->elems == NULL, 0)) 986 { 987 set->alloc = set->nelem = 0; 988 return REG_ESPACE; 989 } 990 set->elems[0] = elem; 991 return REG_NOERROR; 992 } 993 994 static reg_errcode_t 995 internal_function 996 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 997 { 998 set->alloc = 2; 999 set->elems = re_malloc (Idx, 2); 1000 if (BE (set->elems == NULL, 0)) 1001 return REG_ESPACE; 1002 if (elem1 == elem2) 1003 { 1004 set->nelem = 1; 1005 set->elems[0] = elem1; 1006 } 1007 else 1008 { 1009 set->nelem = 2; 1010 if (elem1 < elem2) 1011 { 1012 set->elems[0] = elem1; 1013 set->elems[1] = elem2; 1014 } 1015 else 1016 { 1017 set->elems[0] = elem2; 1018 set->elems[1] = elem1; 1019 } 1020 } 1021 return REG_NOERROR; 1022 } 1023 1024 static reg_errcode_t 1025 internal_function 1026 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1027 { 1028 dest->nelem = src->nelem; 1029 if (src->nelem > 0) 1030 { 1031 dest->alloc = dest->nelem; 1032 dest->elems = re_malloc (Idx, dest->alloc); 1033 if (BE (dest->elems == NULL, 0)) 1034 { 1035 dest->alloc = dest->nelem = 0; 1036 return REG_ESPACE; 1037 } 1038 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1039 } 1040 else 1041 re_node_set_init_empty (dest); 1042 return REG_NOERROR; 1043 } 1044 1045 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1046 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1047 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1048 1049 static reg_errcode_t 1050 internal_function 1051 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1052 const re_node_set *src2) 1053 { 1054 Idx i1, i2, is, id, delta, sbase; 1055 if (src1->nelem == 0 || src2->nelem == 0) 1056 return REG_NOERROR; 1057 1058 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1059 conservative estimate. */ 1060 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1061 { 1062 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1063 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1064 if (BE (new_elems == NULL, 0)) 1065 return REG_ESPACE; 1066 dest->elems = new_elems; 1067 dest->alloc = new_alloc; 1068 } 1069 1070 /* Find the items in the intersection of SRC1 and SRC2, and copy 1071 into the top of DEST those that are not already in DEST itself. */ 1072 sbase = dest->nelem + src1->nelem + src2->nelem; 1073 i1 = src1->nelem - 1; 1074 i2 = src2->nelem - 1; 1075 id = dest->nelem - 1; 1076 for (;;) 1077 { 1078 if (src1->elems[i1] == src2->elems[i2]) 1079 { 1080 /* Try to find the item in DEST. Maybe we could binary search? */ 1081 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1082 --id; 1083 1084 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1085 dest->elems[--sbase] = src1->elems[i1]; 1086 1087 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1088 break; 1089 } 1090 1091 /* Lower the highest of the two items. */ 1092 else if (src1->elems[i1] < src2->elems[i2]) 1093 { 1094 if (! REG_VALID_INDEX (--i2)) 1095 break; 1096 } 1097 else 1098 { 1099 if (! REG_VALID_INDEX (--i1)) 1100 break; 1101 } 1102 } 1103 1104 id = dest->nelem - 1; 1105 is = dest->nelem + src1->nelem + src2->nelem - 1; 1106 delta = is - sbase + 1; 1107 1108 /* Now copy. When DELTA becomes zero, the remaining 1109 DEST elements are already in place; this is more or 1110 less the same loop that is in re_node_set_merge. */ 1111 dest->nelem += delta; 1112 if (delta > 0 && REG_VALID_INDEX (id)) 1113 for (;;) 1114 { 1115 if (dest->elems[is] > dest->elems[id]) 1116 { 1117 /* Copy from the top. */ 1118 dest->elems[id + delta--] = dest->elems[is--]; 1119 if (delta == 0) 1120 break; 1121 } 1122 else 1123 { 1124 /* Slide from the bottom. */ 1125 dest->elems[id + delta] = dest->elems[id]; 1126 if (! REG_VALID_INDEX (--id)) 1127 break; 1128 } 1129 } 1130 1131 /* Copy remaining SRC elements. */ 1132 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1133 1134 return REG_NOERROR; 1135 } 1136 1137 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1138 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1139 1140 static reg_errcode_t 1141 internal_function 1142 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1143 const re_node_set *src2) 1144 { 1145 Idx i1, i2, id; 1146 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1147 { 1148 dest->alloc = src1->nelem + src2->nelem; 1149 dest->elems = re_malloc (Idx, dest->alloc); 1150 if (BE (dest->elems == NULL, 0)) 1151 return REG_ESPACE; 1152 } 1153 else 1154 { 1155 if (src1 != NULL && src1->nelem > 0) 1156 return re_node_set_init_copy (dest, src1); 1157 else if (src2 != NULL && src2->nelem > 0) 1158 return re_node_set_init_copy (dest, src2); 1159 else 1160 re_node_set_init_empty (dest); 1161 return REG_NOERROR; 1162 } 1163 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1164 { 1165 if (src1->elems[i1] > src2->elems[i2]) 1166 { 1167 dest->elems[id++] = src2->elems[i2++]; 1168 continue; 1169 } 1170 if (src1->elems[i1] == src2->elems[i2]) 1171 ++i2; 1172 dest->elems[id++] = src1->elems[i1++]; 1173 } 1174 if (i1 < src1->nelem) 1175 { 1176 memcpy (dest->elems + id, src1->elems + i1, 1177 (src1->nelem - i1) * sizeof (Idx)); 1178 id += src1->nelem - i1; 1179 } 1180 else if (i2 < src2->nelem) 1181 { 1182 memcpy (dest->elems + id, src2->elems + i2, 1183 (src2->nelem - i2) * sizeof (Idx)); 1184 id += src2->nelem - i2; 1185 } 1186 dest->nelem = id; 1187 return REG_NOERROR; 1188 } 1189 1190 /* Calculate the union set of the sets DEST and SRC. And store it to 1191 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1192 1193 static reg_errcode_t 1194 internal_function 1195 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1196 { 1197 Idx is, id, sbase, delta; 1198 if (src == NULL || src->nelem == 0) 1199 return REG_NOERROR; 1200 if (dest->alloc < 2 * src->nelem + dest->nelem) 1201 { 1202 Idx new_alloc = 2 * (src->nelem + dest->alloc); 1203 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1204 if (BE (new_buffer == NULL, 0)) 1205 return REG_ESPACE; 1206 dest->elems = new_buffer; 1207 dest->alloc = new_alloc; 1208 } 1209 1210 if (BE (dest->nelem == 0, 0)) 1211 { 1212 dest->nelem = src->nelem; 1213 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1214 return REG_NOERROR; 1215 } 1216 1217 /* Copy into the top of DEST the items of SRC that are not 1218 found in DEST. Maybe we could binary search in DEST? */ 1219 for (sbase = dest->nelem + 2 * src->nelem, 1220 is = src->nelem - 1, id = dest->nelem - 1; 1221 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1222 { 1223 if (dest->elems[id] == src->elems[is]) 1224 is--, id--; 1225 else if (dest->elems[id] < src->elems[is]) 1226 dest->elems[--sbase] = src->elems[is--]; 1227 else /* if (dest->elems[id] > src->elems[is]) */ 1228 --id; 1229 } 1230 1231 if (REG_VALID_INDEX (is)) 1232 { 1233 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1234 sbase -= is + 1; 1235 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1236 } 1237 1238 id = dest->nelem - 1; 1239 is = dest->nelem + 2 * src->nelem - 1; 1240 delta = is - sbase + 1; 1241 if (delta == 0) 1242 return REG_NOERROR; 1243 1244 /* Now copy. When DELTA becomes zero, the remaining 1245 DEST elements are already in place. */ 1246 dest->nelem += delta; 1247 for (;;) 1248 { 1249 if (dest->elems[is] > dest->elems[id]) 1250 { 1251 /* Copy from the top. */ 1252 dest->elems[id + delta--] = dest->elems[is--]; 1253 if (delta == 0) 1254 break; 1255 } 1256 else 1257 { 1258 /* Slide from the bottom. */ 1259 dest->elems[id + delta] = dest->elems[id]; 1260 if (! REG_VALID_INDEX (--id)) 1261 { 1262 /* Copy remaining SRC elements. */ 1263 memcpy (dest->elems, dest->elems + sbase, 1264 delta * sizeof (Idx)); 1265 break; 1266 } 1267 } 1268 } 1269 1270 return REG_NOERROR; 1271 } 1272 1273 /* Insert the new element ELEM to the re_node_set* SET. 1274 SET should not already have ELEM. 1275 Return true if successful. */ 1276 1277 static bool 1278 internal_function 1279 re_node_set_insert (re_node_set *set, Idx elem) 1280 { 1281 Idx idx; 1282 /* In case the set is empty. */ 1283 if (set->alloc == 0) 1284 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1285 1286 if (BE (set->nelem, 0) == 0) 1287 { 1288 /* We already guaranteed above that set->alloc != 0. */ 1289 set->elems[0] = elem; 1290 ++set->nelem; 1291 return true; 1292 } 1293 1294 /* Realloc if we need. */ 1295 if (set->alloc == set->nelem) 1296 { 1297 Idx *new_elems; 1298 set->alloc = set->alloc * 2; 1299 new_elems = re_realloc (set->elems, Idx, set->alloc); 1300 if (BE (new_elems == NULL, 0)) 1301 return false; 1302 set->elems = new_elems; 1303 } 1304 1305 /* Move the elements which follows the new element. Test the 1306 first element separately to skip a check in the inner loop. */ 1307 if (elem < set->elems[0]) 1308 { 1309 idx = 0; 1310 for (idx = set->nelem; idx > 0; idx--) 1311 set->elems[idx] = set->elems[idx - 1]; 1312 } 1313 else 1314 { 1315 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1316 set->elems[idx] = set->elems[idx - 1]; 1317 } 1318 1319 /* Insert the new element. */ 1320 set->elems[idx] = elem; 1321 ++set->nelem; 1322 return true; 1323 } 1324 1325 /* Insert the new element ELEM to the re_node_set* SET. 1326 SET should not already have any element greater than or equal to ELEM. 1327 Return true if successful. */ 1328 1329 static bool 1330 internal_function 1331 re_node_set_insert_last (re_node_set *set, Idx elem) 1332 { 1333 /* Realloc if we need. */ 1334 if (set->alloc == set->nelem) 1335 { 1336 Idx *new_elems; 1337 set->alloc = (set->alloc + 1) * 2; 1338 new_elems = re_realloc (set->elems, Idx, set->alloc); 1339 if (BE (new_elems == NULL, 0)) 1340 return false; 1341 set->elems = new_elems; 1342 } 1343 1344 /* Insert the new element. */ 1345 set->elems[set->nelem++] = elem; 1346 return true; 1347 } 1348 1349 /* Compare two node sets SET1 and SET2. 1350 Return true if SET1 and SET2 are equivalent. */ 1351 1352 static bool 1353 internal_function __attribute ((pure)) 1354 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1355 { 1356 Idx i; 1357 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1358 return false; 1359 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1360 if (set1->elems[i] != set2->elems[i]) 1361 return false; 1362 return true; 1363 } 1364 1365 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1366 1367 static Idx 1368 internal_function __attribute ((pure)) 1369 re_node_set_contains (const re_node_set *set, Idx elem) 1370 { 1371 __re_size_t idx, right, mid; 1372 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1373 return 0; 1374 1375 /* Binary search the element. */ 1376 idx = 0; 1377 right = set->nelem - 1; 1378 while (idx < right) 1379 { 1380 mid = (idx + right) / 2; 1381 if (set->elems[mid] < elem) 1382 idx = mid + 1; 1383 else 1384 right = mid; 1385 } 1386 return set->elems[idx] == elem ? idx + 1 : 0; 1387 } 1388 1389 static void 1390 internal_function 1391 re_node_set_remove_at (re_node_set *set, Idx idx) 1392 { 1393 if (idx < 0 || idx >= set->nelem) 1394 return; 1395 --set->nelem; 1396 for (; idx < set->nelem; idx++) 1397 set->elems[idx] = set->elems[idx + 1]; 1398 } 1399 1400 1401 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1402 Or return REG_MISSING if an error occurred. */ 1403 1404 static Idx 1405 internal_function 1406 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1407 { 1408 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1409 { 1410 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1411 Idx *new_nexts, *new_indices; 1412 re_node_set *new_edests, *new_eclosures; 1413 re_token_t *new_nodes; 1414 size_t max_object_size = 1415 MAX (sizeof (re_token_t), 1416 MAX (sizeof (re_node_set), 1417 sizeof (Idx))); 1418 1419 /* Avoid overflows. */ 1420 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) 1421 return REG_MISSING; 1422 1423 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1424 if (BE (new_nodes == NULL, 0)) 1425 return REG_MISSING; 1426 dfa->nodes = new_nodes; 1427 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1428 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1429 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1430 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1431 if (BE (new_nexts == NULL || new_indices == NULL 1432 || new_edests == NULL || new_eclosures == NULL, 0)) 1433 return REG_MISSING; 1434 dfa->nexts = new_nexts; 1435 dfa->org_indices = new_indices; 1436 dfa->edests = new_edests; 1437 dfa->eclosures = new_eclosures; 1438 dfa->nodes_alloc = new_nodes_alloc; 1439 } 1440 dfa->nodes[dfa->nodes_len] = token; 1441 dfa->nodes[dfa->nodes_len].constraint = 0; 1442 #ifdef RE_ENABLE_I18N 1443 { 1444 int type = token.type; 1445 dfa->nodes[dfa->nodes_len].accept_mb = 1446 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1447 } 1448 #endif 1449 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1450 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1451 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1452 return dfa->nodes_len++; 1453 } 1454 1455 static inline re_hashval_t 1456 internal_function 1457 calc_state_hash (const re_node_set *nodes, unsigned int context) 1458 { 1459 re_hashval_t hash = nodes->nelem + context; 1460 Idx i; 1461 for (i = 0 ; i < nodes->nelem ; i++) 1462 hash += nodes->elems[i]; 1463 return hash; 1464 } 1465 1466 /* Search for the state whose node_set is equivalent to NODES. 1467 Return the pointer to the state, if we found it in the DFA. 1468 Otherwise create the new one and return it. In case of an error 1469 return NULL and set the error code in ERR. 1470 Note: - We assume NULL as the invalid state, then it is possible that 1471 return value is NULL and ERR is REG_NOERROR. 1472 - We never return non-NULL value in case of any errors, it is for 1473 optimization. */ 1474 1475 static re_dfastate_t * 1476 internal_function 1477 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1478 const re_node_set *nodes) 1479 { 1480 re_hashval_t hash; 1481 re_dfastate_t *new_state; 1482 struct re_state_table_entry *spot; 1483 Idx i; 1484 #ifdef lint 1485 /* Suppress bogus uninitialized-variable warnings. */ 1486 *err = REG_NOERROR; 1487 #endif 1488 if (BE (nodes->nelem == 0, 0)) 1489 { 1490 *err = REG_NOERROR; 1491 return NULL; 1492 } 1493 hash = calc_state_hash (nodes, 0); 1494 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1495 1496 for (i = 0 ; i < spot->num ; i++) 1497 { 1498 re_dfastate_t *state = spot->array[i]; 1499 if (hash != state->hash) 1500 continue; 1501 if (re_node_set_compare (&state->nodes, nodes)) 1502 return state; 1503 } 1504 1505 /* There are no appropriate state in the dfa, create the new one. */ 1506 new_state = create_ci_newstate (dfa, nodes, hash); 1507 if (BE (new_state == NULL, 0)) 1508 *err = REG_ESPACE; 1509 1510 return new_state; 1511 } 1512 1513 /* Search for the state whose node_set is equivalent to NODES and 1514 whose context is equivalent to CONTEXT. 1515 Return the pointer to the state, if we found it in the DFA. 1516 Otherwise create the new one and return it. In case of an error 1517 return NULL and set the error code in ERR. 1518 Note: - We assume NULL as the invalid state, then it is possible that 1519 return value is NULL and ERR is REG_NOERROR. 1520 - We never return non-NULL value in case of any errors, it is for 1521 optimization. */ 1522 1523 static re_dfastate_t * 1524 internal_function 1525 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1526 const re_node_set *nodes, unsigned int context) 1527 { 1528 re_hashval_t hash; 1529 re_dfastate_t *new_state; 1530 struct re_state_table_entry *spot; 1531 Idx i; 1532 #ifdef lint 1533 /* Suppress bogus uninitialized-variable warnings. */ 1534 *err = REG_NOERROR; 1535 #endif 1536 if (nodes->nelem == 0) 1537 { 1538 *err = REG_NOERROR; 1539 return NULL; 1540 } 1541 hash = calc_state_hash (nodes, context); 1542 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1543 1544 for (i = 0 ; i < spot->num ; i++) 1545 { 1546 re_dfastate_t *state = spot->array[i]; 1547 if (state->hash == hash 1548 && state->context == context 1549 && re_node_set_compare (state->entrance_nodes, nodes)) 1550 return state; 1551 } 1552 /* There are no appropriate state in `dfa', create the new one. */ 1553 new_state = create_cd_newstate (dfa, nodes, context, hash); 1554 if (BE (new_state == NULL, 0)) 1555 *err = REG_ESPACE; 1556 1557 return new_state; 1558 } 1559 1560 /* Finish initialization of the new state NEWSTATE, and using its hash value 1561 HASH put in the appropriate bucket of DFA's state table. Return value 1562 indicates the error code if failed. */ 1563 1564 static reg_errcode_t 1565 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1566 re_hashval_t hash) 1567 { 1568 struct re_state_table_entry *spot; 1569 reg_errcode_t err; 1570 Idx i; 1571 1572 newstate->hash = hash; 1573 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1574 if (BE (err != REG_NOERROR, 0)) 1575 return REG_ESPACE; 1576 for (i = 0; i < newstate->nodes.nelem; i++) 1577 { 1578 Idx elem = newstate->nodes.elems[i]; 1579 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1580 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) 1581 return REG_ESPACE; 1582 } 1583 1584 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1585 if (BE (spot->alloc <= spot->num, 0)) 1586 { 1587 Idx new_alloc = 2 * spot->num + 2; 1588 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1589 new_alloc); 1590 if (BE (new_array == NULL, 0)) 1591 return REG_ESPACE; 1592 spot->array = new_array; 1593 spot->alloc = new_alloc; 1594 } 1595 spot->array[spot->num++] = newstate; 1596 return REG_NOERROR; 1597 } 1598 1599 static void 1600 free_state (re_dfastate_t *state) 1601 { 1602 re_node_set_free (&state->non_eps_nodes); 1603 re_node_set_free (&state->inveclosure); 1604 if (state->entrance_nodes != &state->nodes) 1605 { 1606 re_node_set_free (state->entrance_nodes); 1607 re_free (state->entrance_nodes); 1608 } 1609 re_node_set_free (&state->nodes); 1610 re_free (state->word_trtable); 1611 re_free (state->trtable); 1612 re_free (state); 1613 } 1614 1615 /* Create the new state which is independ of contexts. 1616 Return the new state if succeeded, otherwise return NULL. */ 1617 1618 static re_dfastate_t * 1619 internal_function 1620 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1621 re_hashval_t hash) 1622 { 1623 Idx i; 1624 reg_errcode_t err; 1625 re_dfastate_t *newstate; 1626 1627 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1628 if (BE (newstate == NULL, 0)) 1629 return NULL; 1630 err = re_node_set_init_copy (&newstate->nodes, nodes); 1631 if (BE (err != REG_NOERROR, 0)) 1632 { 1633 re_free (newstate); 1634 return NULL; 1635 } 1636 1637 newstate->entrance_nodes = &newstate->nodes; 1638 for (i = 0 ; i < nodes->nelem ; i++) 1639 { 1640 re_token_t *node = dfa->nodes + nodes->elems[i]; 1641 re_token_type_t type = node->type; 1642 if (type == CHARACTER && !node->constraint) 1643 continue; 1644 #ifdef RE_ENABLE_I18N 1645 newstate->accept_mb |= node->accept_mb; 1646 #endif /* RE_ENABLE_I18N */ 1647 1648 /* If the state has the halt node, the state is a halt state. */ 1649 if (type == END_OF_RE) 1650 newstate->halt = 1; 1651 else if (type == OP_BACK_REF) 1652 newstate->has_backref = 1; 1653 else if (type == ANCHOR || node->constraint) 1654 newstate->has_constraint = 1; 1655 } 1656 err = register_state (dfa, newstate, hash); 1657 if (BE (err != REG_NOERROR, 0)) 1658 { 1659 free_state (newstate); 1660 newstate = NULL; 1661 } 1662 return newstate; 1663 } 1664 1665 /* Create the new state which is depend on the context CONTEXT. 1666 Return the new state if succeeded, otherwise return NULL. */ 1667 1668 static re_dfastate_t * 1669 internal_function 1670 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1671 unsigned int context, re_hashval_t hash) 1672 { 1673 Idx i, nctx_nodes = 0; 1674 reg_errcode_t err; 1675 re_dfastate_t *newstate; 1676 1677 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1678 if (BE (newstate == NULL, 0)) 1679 return NULL; 1680 err = re_node_set_init_copy (&newstate->nodes, nodes); 1681 if (BE (err != REG_NOERROR, 0)) 1682 { 1683 re_free (newstate); 1684 return NULL; 1685 } 1686 1687 newstate->context = context; 1688 newstate->entrance_nodes = &newstate->nodes; 1689 1690 for (i = 0 ; i < nodes->nelem ; i++) 1691 { 1692 unsigned int constraint = 0; 1693 re_token_t *node = dfa->nodes + nodes->elems[i]; 1694 re_token_type_t type = node->type; 1695 if (node->constraint) 1696 constraint = node->constraint; 1697 1698 if (type == CHARACTER && !constraint) 1699 continue; 1700 #ifdef RE_ENABLE_I18N 1701 newstate->accept_mb |= node->accept_mb; 1702 #endif /* RE_ENABLE_I18N */ 1703 1704 /* If the state has the halt node, the state is a halt state. */ 1705 if (type == END_OF_RE) 1706 newstate->halt = 1; 1707 else if (type == OP_BACK_REF) 1708 newstate->has_backref = 1; 1709 else if (type == ANCHOR) 1710 constraint = node->opr.ctx_type; 1711 1712 if (constraint) 1713 { 1714 if (newstate->entrance_nodes == &newstate->nodes) 1715 { 1716 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1717 if (BE (newstate->entrance_nodes == NULL, 0)) 1718 { 1719 free_state (newstate); 1720 return NULL; 1721 } 1722 re_node_set_init_copy (newstate->entrance_nodes, nodes); 1723 nctx_nodes = 0; 1724 newstate->has_constraint = 1; 1725 } 1726 1727 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1728 { 1729 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1730 ++nctx_nodes; 1731 } 1732 } 1733 } 1734 err = register_state (dfa, newstate, hash); 1735 if (BE (err != REG_NOERROR, 0)) 1736 { 1737 free_state (newstate); 1738 newstate = NULL; 1739 } 1740 return newstate; 1741 }