1 /*
2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2012 Milan Jurik. All rights reserved.
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
708
709 if (cs->invert && p->g->cflags&REG_NEWLINE)
710 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
711
712 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
713 ordinary(p, ch);
714 freeset(p, cs);
715 } else
716 EMIT(OANYOF, (int)(cs - p->g->sets));
717 }
718
719 /*
720 * p_b_term - parse one term of a bracketed character list
 *
 * p is the parse state the scanning macros (MORE, PEEK, NEXT, ...) work
 * on; cs is the set handed to CHadd()/CHaddrange()/p_b_eclass(), which
 * presumably add the members described by the term to it.
 *
 * A term that begins with '-' is rejected with REG_ERANGE and nothing
 * is consumed.  A term that begins with '[' is classified by the
 * character following it.  Anything else is read as a single symbol,
 * optionally followed by '-' and a second symbol to form a range.
 *
 * Nothing is returned; failures are reported through SETERROR/REQUIRE.
721 */
722 static void
723 p_b_term(struct parse *p, cset *cs)
724 {
725 char c;
726 wint_t start, finish;
727 wint_t i;
728
729 /* classify what we've got */
730 switch ((MORE()) ? PEEK() : '\0') {
731 case '[': /* the second character, if any, selects the term kind below */
732 c = (MORE2()) ? PEEK2() : '\0';
733 break;
734 case '-': /* a term may not start with '-' */
735 SETERROR(REG_ERANGE);
736 return; /* NOTE RETURN */
737 default: /* '\0' routes to the default case of the next switch */
738 c = '\0';
739 break;
740 }
741
742 switch (c) {
743 case ':': /* character class */
744 NEXT2();
745 (void) REQUIRE(MORE(), REG_EBRACK);
746 c = PEEK();
747 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
/*
 * NOTE(review): the embedded numbering jumps from 747 to 755 here, so the
 * rest of this case and the label of the following one appear to be
 * missing from this view.  The statements below require a closing "=]"
 * and report REG_ECOLLATE on failure.
 */
755 c = PEEK();
756 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
757 p_b_eclass(p, cs);
758 (void) REQUIRE(MORE(), REG_EBRACK);
759 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
760 break;
761 default: /* symbol, ordinary character, or range */
762 start = p_b_symbol(p);
/* a '-' whose following character is ']' is not taken as a range operator */
763 if (SEE('-') && MORE2() && PEEK2() != ']') {
764 /* range */
765 NEXT();
/* a further '-' at this point is used as the literal end point '-' */
766 if (EAT('-'))
767 finish = '-';
768 else
769 finish = p_b_symbol(p);
770 } else
771 finish = start;
/* equal end points (including the no-range case) add a single member */
772 if (start == finish)
773 CHadd(p, cs, start);
774 else {
/*
 * When _collate_load_error is set the end points are compared by their
 * (uch) values and the span is added with CHaddrange().  Otherwise the
 * order is decided by _collate_range_cmp().
 */
775 if (_collate_load_error) {
776 (void) REQUIRE((uch)start <= (uch)finish,
777 REG_ERANGE);
778 CHaddrange(p, cs, start, finish);
779 } else {
780 (void) REQUIRE(_collate_range_cmp(start,
781 finish) <= 0, REG_ERANGE);
/*
 * Only the values 0..UCHAR_MAX are tested; each one that compares
 * between start and finish (inclusive) is added individually.
 */
782 for (i = 0; i <= UCHAR_MAX; i++) {
783 if (_collate_range_cmp(start, i) <= 0 &&
784 _collate_range_cmp(i, finish) <= 0)
785 CHadd(p, cs, i);
786 }
787 }
788 }
789 break;
790 }
791 }
792
793 /*
794 * p_b_cclass - parse a character-class name and deal with it
795 */
796 static void
797 p_b_cclass(struct parse *p, cset *cs)
798 {
799 char *sp = p->next;
800 size_t len;
801 wctype_t wct;
802 char clname[16];
803
804 while (MORE() && isalpha((uch)PEEK()))
1350 *
1351 * This algorithm could do fancy things like analyzing the operands of |
1352 * for common subsequences. Someday. This code is simple and finds most
1353 * of the interesting cases.
1354 *
1355 * Note that must and mlen got initialized during setup.
1356 */
1357 static void
1358 findmust(struct parse *p, struct re_guts *g)
1359 {
1360 sop *scan;
1361 sop *start;
1362 sop *newstart;
1363 sopno newlen;
1364 sop s;
1365 char *cp;
1366 int offset;
1367 char buf[MB_LEN_MAX];
1368 size_t clen;
1369 mbstate_t mbs;
1370
1371 /* avoid making error situations worse */
1372 if (p->error != 0)
1373 return;
1374
1375 /*
1376 * It's not generally safe to do a ``char'' substring search on
1377 * multibyte character strings, but it's safe for at least
1378 * UTF-8 (see RFC 3629).
1379 */
1380 if (MB_CUR_MAX > 1 &&
1381 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1382 return;
1383
1384 /* find the longest OCHAR sequence in strip */
1385 newlen = 0;
1386 offset = 0;
1387 g->moffset = 0;
1388 scan = g->strip + 1;
1389 do {
1390 s = *scan++;
1391 switch (OP(s)) {
1392 case OCHAR: /* sequence member */
1393 if (newlen == 0) { /* new sequence */
1394 (void) memset(&mbs, 0, sizeof (mbs));
1395 newstart = scan - 1;
1396 }
1397 clen = wcrtomb(buf, OPND(s), &mbs);
1398 if (clen == (size_t)-1)
1399 goto toohard;
1400 newlen += clen;
1401 break;
|
1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright 2012 Milan Jurik. All rights reserved.
5 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
6 * Copyright (c) 1992, 1993, 1994
7 * The Regents of the University of California. All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * Henry Spencer.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
709
710 if (cs->invert && p->g->cflags&REG_NEWLINE)
711 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
712
713 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
714 ordinary(p, ch);
715 freeset(p, cs);
716 } else
717 EMIT(OANYOF, (int)(cs - p->g->sets));
718 }
719
720 /*
721 * p_b_term - parse one term of a bracketed character list
 *
 * p is the parse state the scanning macros (MORE, PEEK, NEXT, ...) work
 * on; cs is the set handed to CHadd()/CHaddrange()/p_b_eclass(), which
 * presumably add the members described by the term to it.
 *
 * A term that begins with '-' is rejected with REG_ERANGE and nothing
 * is consumed.  A term that begins with '[' is classified by the
 * character following it.  Anything else is read as a single symbol,
 * optionally followed by '-' and a second symbol to form a range.
 *
 * Range ordering uses the locale handle obtained from uselocale(NULL)
 * at entry.  Nothing is returned; failures are reported through
 * SETERROR/REQUIRE.
722 */
723 static void
724 p_b_term(struct parse *p, cset *cs)
725 {
726 char c;
727 wint_t start, finish;
728 wint_t i;
729 locale_t loc = uselocale(NULL); /* query only: a NULL argument does not change the locale */
730
731 /* classify what we've got */
732 switch ((MORE()) ? PEEK() : '\0') {
733 case '[': /* the second character, if any, selects the term kind below */
734 c = (MORE2()) ? PEEK2() : '\0';
735 break;
736 case '-': /* a term may not start with '-' */
737 SETERROR(REG_ERANGE);
738 return; /* NOTE RETURN */
739 default: /* '\0' routes to the default case of the next switch */
740 c = '\0';
741 break;
742 }
743
744 switch (c) {
745 case ':': /* character class */
746 NEXT2();
747 (void) REQUIRE(MORE(), REG_EBRACK);
748 c = PEEK();
749 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
/*
 * NOTE(review): the embedded numbering jumps from 749 to 757 here, so the
 * rest of this case and the label of the following one appear to be
 * missing from this view.  The statements below require a closing "=]"
 * and report REG_ECOLLATE on failure.
 */
757 c = PEEK();
758 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
759 p_b_eclass(p, cs);
760 (void) REQUIRE(MORE(), REG_EBRACK);
761 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
762 break;
763 default: /* symbol, ordinary character, or range */
764 start = p_b_symbol(p);
/* a '-' whose following character is ']' is not taken as a range operator */
765 if (SEE('-') && MORE2() && PEEK2() != ']') {
766 /* range */
767 NEXT();
/* a further '-' at this point is used as the literal end point '-' */
768 if (EAT('-'))
769 finish = '-';
770 else
771 finish = p_b_symbol(p);
772 } else
773 finish = start;
/* equal end points (including the no-range case) add a single member */
774 if (start == finish)
775 CHadd(p, cs, start);
776 else {
/*
 * When the locale's collation is flagged lc_is_posix the end points are
 * compared by their (uch) values and the span is added with CHaddrange().
 * Otherwise the order is decided by _collate_range_cmp() for this locale.
 */
777 if (loc->collate->lc_is_posix) {
778 (void) REQUIRE((uch)start <= (uch)finish,
779 REG_ERANGE);
780 CHaddrange(p, cs, start, finish);
781 } else {
782 (void) REQUIRE(_collate_range_cmp(start,
783 finish, loc) <= 0, REG_ERANGE);
/*
 * Only the values 0..UCHAR_MAX are tested; each one that compares
 * between start and finish (inclusive) is added individually.
 */
784 for (i = 0; i <= UCHAR_MAX; i++) {
785 if (_collate_range_cmp(start, i, loc)
786 <= 0 &&
787 _collate_range_cmp(i, finish, loc)
788 <= 0)
789 CHadd(p, cs, i);
790 }
791 }
792 }
793 break;
794 }
795 }
796
797 /*
798 * p_b_cclass - parse a character-class name and deal with it
799 */
800 static void
801 p_b_cclass(struct parse *p, cset *cs)
802 {
803 char *sp = p->next;
804 size_t len;
805 wctype_t wct;
806 char clname[16];
807
808 while (MORE() && isalpha((uch)PEEK()))
1354 *
1355 * This algorithm could do fancy things like analyzing the operands of |
1356 * for common subsequences. Someday. This code is simple and finds most
1357 * of the interesting cases.
1358 *
1359 * Note that must and mlen got initialized during setup.
1360 */
1361 static void
1362 findmust(struct parse *p, struct re_guts *g)
1363 {
1364 sop *scan;
1365 sop *start;
1366 sop *newstart;
1367 sopno newlen;
1368 sop s;
1369 char *cp;
1370 int offset;
1371 char buf[MB_LEN_MAX];
1372 size_t clen;
1373 mbstate_t mbs;
1374 locale_t loc = uselocale(NULL);
1375
1376 /* avoid making error situations worse */
1377 if (p->error != 0)
1378 return;
1379
1380 /*
1381 * It's not generally safe to do a ``char'' substring search on
1382 * multibyte character strings, but it's safe for at least
1383 * UTF-8 (see RFC 3629).
1384 */
1385 if (MB_CUR_MAX > 1 &&
1386 strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1387 return;
1388
1389 /* find the longest OCHAR sequence in strip */
1390 newlen = 0;
1391 offset = 0;
1392 g->moffset = 0;
1393 scan = g->strip + 1;
1394 do {
1395 s = *scan++;
1396 switch (OP(s)) {
1397 case OCHAR: /* sequence member */
1398 if (newlen == 0) { /* new sequence */
1399 (void) memset(&mbs, 0, sizeof (mbs));
1400 newstart = scan - 1;
1401 }
1402 clen = wcrtomb(buf, OPND(s), &mbs);
1403 if (clen == (size_t)-1)
1404 goto toohard;
1405 newlen += clen;
1406 break;
|