Print this page
2964 need POSIX 2008 locale object support
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Approved by: TBD
   1 /*

   2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   3  * Copyright 2012 Milan Jurik. All rights reserved.
   4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   5  * Copyright (c) 1992, 1993, 1994
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * Henry Spencer.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.


 708 
 709         if (cs->invert && p->g->cflags&REG_NEWLINE)
 710                 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
 711 
 712         if ((ch = singleton(cs)) != OUT) {      /* optimize singleton sets */
 713                 ordinary(p, ch);
 714                 freeset(p, cs);
 715         } else
 716                 EMIT(OANYOF, (int)(cs - p->g->sets));
 717 }
 718 
 719 /*
 720  * p_b_term - parse one term of a bracketed character list
 721  */
 722 static void
 723 p_b_term(struct parse *p, cset *cs)
 724 {
 725         char c;
 726         wint_t start, finish;
 727         wint_t i;

 728 
 729         /* classify what we've got */
 730         switch ((MORE()) ? PEEK() : '\0') {
 731         case '[':
 732                 c = (MORE2()) ? PEEK2() : '\0';
 733                 break;
 734         case '-':
 735                 SETERROR(REG_ERANGE);
 736                 return;                 /* NOTE RETURN */
 737         default:
 738                 c = '\0';
 739                 break;
 740         }
 741 
 742         switch (c) {
 743         case ':':               /* character class */
 744                 NEXT2();
 745                 (void) REQUIRE(MORE(), REG_EBRACK);
 746                 c = PEEK();
 747                 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);


 755                 c = PEEK();
 756                 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 757                 p_b_eclass(p, cs);
 758                 (void) REQUIRE(MORE(), REG_EBRACK);
 759                 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 760                 break;
 761         default:                /* symbol, ordinary character, or range */
 762                 start = p_b_symbol(p);
 763                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 764                         /* range */
 765                         NEXT();
 766                         if (EAT('-'))
 767                                 finish = '-';
 768                         else
 769                                 finish = p_b_symbol(p);
 770                 } else
 771                         finish = start;
 772                 if (start == finish)
 773                         CHadd(p, cs, start);
 774                 else {
 775                         if (_collate_load_error) {
 776                                 (void) REQUIRE((uch)start <= (uch)finish,
 777                                     REG_ERANGE);
 778                                 CHaddrange(p, cs, start, finish);
 779                         } else {
 780                                 (void) REQUIRE(_collate_range_cmp(start,
 781                                     finish) <= 0, REG_ERANGE);
 782                                 for (i = 0; i <= UCHAR_MAX; i++) {
 783                                         if (_collate_range_cmp(start, i) <= 0 &&
 784                                             _collate_range_cmp(i, finish) <= 0)


 785                                                 CHadd(p, cs, i);
 786                                 }
 787                         }
 788                 }
 789                 break;
 790         }
 791 }
 792 
 793 /*
 794  * p_b_cclass - parse a character-class name and deal with it
 795  */
 796 static void
 797 p_b_cclass(struct parse *p, cset *cs)
 798 {
 799         char *sp = p->next;
 800         size_t len;
 801         wctype_t wct;
 802         char clname[16];
 803 
 804         while (MORE() && isalpha((uch)PEEK()))


1350  *
1351  * This algorithm could do fancy things like analyzing the operands of |
1352  * for common subsequences.  Someday.  This code is simple and finds most
1353  * of the interesting cases.
1354  *
1355  * Note that must and mlen got initialized during setup.
1356  */
1357 static void
1358 findmust(struct parse *p, struct re_guts *g)
1359 {
1360         sop *scan;
1361         sop *start;
1362         sop *newstart;
1363         sopno newlen;
1364         sop s;
1365         char *cp;
1366         int offset;
1367         char buf[MB_LEN_MAX];
1368         size_t clen;
1369         mbstate_t mbs;

1370 
1371         /* avoid making error situations worse */
1372         if (p->error != 0)
1373                 return;
1374 
1375         /*
1376          * It's not generally safe to do a ``char'' substring search on
1377          * multibyte character strings, but it's safe for at least
1378          * UTF-8 (see RFC 3629).
1379          */
1380         if (MB_CUR_MAX > 1 &&
1381             strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1382                 return;
1383 
1384         /* find the longest OCHAR sequence in strip */
1385         newlen = 0;
1386         offset = 0;
1387         g->moffset = 0;
1388         scan = g->strip + 1;
1389         do {
1390                 s = *scan++;
1391                 switch (OP(s)) {
1392                 case OCHAR:             /* sequence member */
1393                         if (newlen == 0) {              /* new sequence */
1394                                 (void) memset(&mbs, 0, sizeof (mbs));
1395                                 newstart = scan - 1;
1396                         }
1397                         clen = wcrtomb(buf, OPND(s), &mbs);
1398                         if (clen == (size_t)-1)
1399                                 goto toohard;
1400                         newlen += clen;
1401                         break;


   1 /*
   2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
   3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   4  * Copyright 2012 Milan Jurik. All rights reserved.
   5  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
   6  * Copyright (c) 1992, 1993, 1994
   7  *      The Regents of the University of California.  All rights reserved.
   8  *
   9  * This code is derived from software contributed to Berkeley by
  10  * Henry Spencer.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 4. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.


 709 
 710         if (cs->invert && p->g->cflags&REG_NEWLINE)
 711                 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
 712 
 713         if ((ch = singleton(cs)) != OUT) {      /* optimize singleton sets */
 714                 ordinary(p, ch);
 715                 freeset(p, cs);
 716         } else
 717                 EMIT(OANYOF, (int)(cs - p->g->sets));
 718 }
 719 
 720 /*
 721  * p_b_term - parse one term of a bracketed character list
 722  */
 723 static void
 724 p_b_term(struct parse *p, cset *cs)
 725 {
 726         char c;
 727         wint_t start, finish;
 728         wint_t i;
 729         locale_t loc = uselocale(NULL);
 730 
 731         /* classify what we've got */
 732         switch ((MORE()) ? PEEK() : '\0') {
 733         case '[':
 734                 c = (MORE2()) ? PEEK2() : '\0';
 735                 break;
 736         case '-':
 737                 SETERROR(REG_ERANGE);
 738                 return;                 /* NOTE RETURN */
 739         default:
 740                 c = '\0';
 741                 break;
 742         }
 743 
 744         switch (c) {
 745         case ':':               /* character class */
 746                 NEXT2();
 747                 (void) REQUIRE(MORE(), REG_EBRACK);
 748                 c = PEEK();
 749                 (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);


 757                 c = PEEK();
 758                 (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 759                 p_b_eclass(p, cs);
 760                 (void) REQUIRE(MORE(), REG_EBRACK);
 761                 (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 762                 break;
 763         default:                /* symbol, ordinary character, or range */
 764                 start = p_b_symbol(p);
 765                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 766                         /* range */
 767                         NEXT();
 768                         if (EAT('-'))
 769                                 finish = '-';
 770                         else
 771                                 finish = p_b_symbol(p);
 772                 } else
 773                         finish = start;
 774                 if (start == finish)
 775                         CHadd(p, cs, start);
 776                 else {
 777                         if (loc->collate->lc_is_posix) {
 778                                 (void) REQUIRE((uch)start <= (uch)finish,
 779                                     REG_ERANGE);
 780                                 CHaddrange(p, cs, start, finish);
 781                         } else {
 782                                 (void) REQUIRE(_collate_range_cmp(start,
 783                                     finish, loc) <= 0, REG_ERANGE);
 784                                 for (i = 0; i <= UCHAR_MAX; i++) {
 785                                         if (_collate_range_cmp(start, i, loc)
 786                                             <= 0 &&
 787                                             _collate_range_cmp(i, finish, loc)
 788                                             <= 0)
 789                                                 CHadd(p, cs, i);
 790                                 }
 791                         }
 792                 }
 793                 break;
 794         }
 795 }
 796 
 797 /*
 798  * p_b_cclass - parse a character-class name and deal with it
 799  */
 800 static void
 801 p_b_cclass(struct parse *p, cset *cs)
 802 {
 803         char *sp = p->next;
 804         size_t len;
 805         wctype_t wct;
 806         char clname[16];
 807 
 808         while (MORE() && isalpha((uch)PEEK()))


1354  *
1355  * This algorithm could do fancy things like analyzing the operands of |
1356  * for common subsequences.  Someday.  This code is simple and finds most
1357  * of the interesting cases.
1358  *
1359  * Note that must and mlen got initialized during setup.
1360  */
1361 static void
1362 findmust(struct parse *p, struct re_guts *g)
1363 {
1364         sop *scan;
1365         sop *start;
1366         sop *newstart;
1367         sopno newlen;
1368         sop s;
1369         char *cp;
1370         int offset;
1371         char buf[MB_LEN_MAX];
1372         size_t clen;
1373         mbstate_t mbs;
1374         locale_t loc = uselocale(NULL);
1375 
1376         /* avoid making error situations worse */
1377         if (p->error != 0)
1378                 return;
1379 
1380         /*
1381          * It's not generally safe to do a ``char'' substring search on
1382          * multibyte character strings, but it's safe for at least
1383          * UTF-8 (see RFC 3629).
1384          */
1385         if (MB_CUR_MAX > 1 &&
1386             strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1387                 return;
1388 
1389         /* find the longest OCHAR sequence in strip */
1390         newlen = 0;
1391         offset = 0;
1392         g->moffset = 0;
1393         scan = g->strip + 1;
1394         do {
1395                 s = *scan++;
1396                 switch (OP(s)) {
1397                 case OCHAR:             /* sequence member */
1398                         if (newlen == 0) {              /* new sequence */
1399                                 (void) memset(&mbs, 0, sizeof (mbs));
1400                                 newstart = scan - 1;
1401                         }
1402                         clen = wcrtomb(buf, OPND(s), &mbs);
1403                         if (clen == (size_t)-1)
1404                                 goto toohard;
1405                         newlen += clen;
1406                         break;