illumos-gate Wdiff usr/src/lib/libc/port/locale/utf8.c

Print this page

2964 need POSIX 2008 locale object support
Reviewed by: Robert Mustacchi <rm@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/lib/libc/port/locale/utf8.c
          +++ new/usr/src/lib/libc/port/locale/utf8.c
   1    1  /*
        2 + * Copyright 2013 Garrett D'Amore <garrett@damore.org>
   2    3   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
   3    4   * Copyright (c) 2002-2004 Tim J. Robbins
   4    5   * All rights reserved.
   5    6   *
   6    7   * Redistribution and use in source and binary forms, with or without
   7    8   * modification, are permitted provided that the following conditions
   8    9   * are met:
   9   10   * 1. Redistributions of source code must retain the above copyright
  10   11   *    notice, this list of conditions and the following disclaimer.
  11   12   * 2. Redistributions in binary form must reproduce the above copyright

  12   13   *    notice, this list of conditions and the following disclaimer in the
  13   14   *    documentation and/or other materials provided with the distribution.
  14   15   *
  15   16   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16   17   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17   18   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18   19   * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19   20   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20   21   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS

↓ open down ↓

9 lines elided

↑ open up ↑

  21   22   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22   23   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23   24   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24   25   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25   26   * SUCH DAMAGE.
  26   27   */
  27   28  
  28   29  #include "lint.h"
  29   30  #include <errno.h>
  30   31  #include <limits.h>
  31      -#include "runetype.h"
  32   32  #include <stdlib.h>
  33   33  #include <string.h>
  34   34  #include <wchar.h>
  35   35  #include "mblocal.h"
       36 +#include "lctype.h"
  36   37  
  37   38  static size_t   _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
  38   39                      const char *_RESTRICT_KYWD,
  39   40                      size_t, mbstate_t *_RESTRICT_KYWD);
  40   41  static int      _UTF8_mbsinit(const mbstate_t *);
  41   42  static size_t   _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
  42   43                      const char **_RESTRICT_KYWD, size_t, size_t,
  43   44                      mbstate_t *_RESTRICT_KYWD);
  44   45  static size_t   _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
  45   46                      mbstate_t *_RESTRICT_KYWD);
  46   47  static size_t   _UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
  47   48                      const wchar_t **_RESTRICT_KYWD,
  48   49                      size_t, size_t, mbstate_t *_RESTRICT_KYWD);
  49   50  
  50   51  typedef struct {        /* decoder state; the mbstate_t pointers in this file are cast to this type */
  51   52          wchar_t ch;     /* bits accumulated so far from a partially read sequence */
  52   53          int     want;   /* bytes still needed to finish that sequence; 0 means initial state */
  53   54          wchar_t lbound; /* smallest code legal for the sequence length (rejects overlong forms) */
  54   55  } _UTF8State;
  55   56  
  56      -int
  57      -_UTF8_init(_RuneLocale *rl)
       57 +void
       58 +_UTF8_init(struct lc_ctype *lct)     /* point lct's conversion hooks at the UTF-8 routines in this file */
  58   59  {
  59      -        __mbrtowc = _UTF8_mbrtowc;
  60      -        __wcrtomb = _UTF8_wcrtomb;
  61      -        __mbsinit = _UTF8_mbsinit;
  62      -        __mbsnrtowcs = _UTF8_mbsnrtowcs;
  63      -        __wcsnrtombs = _UTF8_wcsnrtombs;
  64      -        _CurrentRuneLocale = rl;
  65      -
  66      -        charset_is_ascii = 0;
  67      -
  68      -        /*
  69      -         * In theory up to 6 bytes can be used for the encoding,
  70      -         * but only encodings with more than 4 bytes are illegal.
  71      -         */
  72      -        __ctype[520] = 4;
  73      -        /*
  74      -         * Note that the other CSWIDTH members are nonsensical for this
  75      -         * this coding.  They only are valid with EUC codings.
  76      -         */
  77      -
  78      -        return (0);
       60 +        lct->lc_mbrtowc = _UTF8_mbrtowc;
       61 +        lct->lc_wcrtomb = _UTF8_wcrtomb;
       62 +        lct->lc_mbsinit = _UTF8_mbsinit;
       63 +        lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
       64 +        lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
       65 +        lct->lc_is_ascii = 0;        /* presumably marks the charset as not ASCII-only -- confirm against lctype.h */
       66 +        lct->lc_max_mblen = 4;       /* 5- and 6-byte forms are compiled out (#if 0) in the converters below */
  79   67  }
  80   68  
  81   69  static int
  82   70  _UTF8_mbsinit(const mbstate_t *ps)
  83   71  {
  84   72  
  85   73          return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
  86   74  }
  87   75  
  88   76  static size_t

  89   77  _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
  90   78      size_t n, mbstate_t *_RESTRICT_KYWD ps)
  91   79  {
  92   80          _UTF8State *us;
  93   81          int ch, i, mask, want;
  94   82          wchar_t lbound, wch;
  95   83  
  96   84          us = (_UTF8State *)ps;
  97   85  
  98   86          if (us->want < 0 || us->want > 6) {
  99   87                  errno = EINVAL;
 100   88                  return ((size_t)-1);
 101   89          }
 102   90  
 103   91          if (s == NULL) {
 104   92                  s = "";
 105   93                  n = 1;
 106   94                  pwc = NULL;
 107   95          }
 108   96  
 109   97          if (n == 0)
 110   98                  /* Incomplete multibyte sequence */
 111   99                  return ((size_t)-2);
 112  100  
 113  101          if (us->want == 0) {
 114  102                  /*
 115  103                   * Determine the number of octets that make up this character
 116  104                   * from the first octet, and a mask that extracts the
 117  105                   * interesting bits of the first octet. We already know
 118  106                   * the character is at least two bytes long.
 119  107                   *
 120  108                   * We also specify a lower bound for the character code to
 121  109                   * detect redundant, non-"shortest form" encodings. For
 122  110                   * example, the sequence C0 80 is _not_ a legal representation
 123  111                   * of the null character. This enforces a 1-to-1 mapping
 124  112                   * between character codes and their multibyte representations.
 125  113                   */
 126  114                  ch = (unsigned char)*s;
 127  115                  if ((ch & 0x80) == 0) {
 128  116                          /* Fast path for plain ASCII characters. */
 129  117                          if (pwc != NULL)
 130  118                                  *pwc = ch;
 131  119                          return (ch != '\0' ? 1 : 0);
 132  120                  }
 133  121                  if ((ch & 0xe0) == 0xc0) {
 134  122                          mask = 0x1f;
 135  123                          want = 2;
 136  124                          lbound = 0x80;
 137  125                  } else if ((ch & 0xf0) == 0xe0) {
 138  126                          mask = 0x0f;
 139  127                          want = 3;
 140  128                          lbound = 0x800;
 141  129                  } else if ((ch & 0xf8) == 0xf0) {
 142  130                          mask = 0x07;
 143  131                          want = 4;
 144  132                          lbound = 0x10000;
 145  133  #if 0
 146  134                  /* These would be illegal in the UTF-8 space */
 147  135  
 148  136                  } else if ((ch & 0xfc) == 0xf8) {
 149  137                          mask = 0x03;
 150  138                          want = 5;
 151  139                          lbound = 0x200000;
 152  140                  } else if ((ch & 0xfe) == 0xfc) {
 153  141                          mask = 0x01;
 154  142                          want = 6;
 155  143                          lbound = 0x4000000;
 156  144  #endif
 157  145                  } else {
 158  146                          /*
 159  147                           * Malformed input; input is not UTF-8.
 160  148                           */
 161  149                          errno = EILSEQ;
 162  150                          return ((size_t)-1);
 163  151                  }
 164  152          } else {
 165  153                  want = us->want;
 166  154                  lbound = us->lbound;
 167  155          }
 168  156  
 169  157          /*
 170  158           * Decode the octet sequence representing the character in chunks
 171  159           * of 6 bits, most significant first.
 172  160           */
 173  161          if (us->want == 0)
 174  162                  wch = (unsigned char)*s++ & mask;
 175  163          else
 176  164                  wch = us->ch;
 177  165  
 178  166          for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
 179  167                  if ((*s & 0xc0) != 0x80) {
 180  168                          /*
 181  169                           * Malformed input; bad characters in the middle
 182  170                           * of a character.
 183  171                           */
 184  172                          errno = EILSEQ;
 185  173                          return ((size_t)-1);
 186  174                  }
 187  175                  wch <<= 6;
 188  176                  wch |= *s++ & 0x3f;
 189  177          }
 190  178          if (i < want) {
 191  179                  /* Incomplete multibyte sequence. */
 192  180                  us->want = want - i;
 193  181                  us->lbound = lbound;
 194  182                  us->ch = wch;
 195  183                  return ((size_t)-2);
 196  184          }
 197  185          if (wch < lbound) {
 198  186                  /*
 199  187                   * Malformed input; redundant encoding.
 200  188                   */
 201  189                  errno = EILSEQ;
 202  190                  return ((size_t)-1);
 203  191          }
 204  192          if (pwc != NULL)
 205  193                  *pwc = wch;
 206  194          us->want = 0;
 207  195          return (wch == L'\0' ? 0 : want);
 208  196  }
 209  197  
 210  198  static size_t
 211  199  _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
 212  200      size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 213  201  {
 214  202          _UTF8State *us;
 215  203          const char *s;
 216  204          size_t nchr;
 217  205          wchar_t wc;
 218  206          size_t nb;
 219  207  
 220  208          us = (_UTF8State *)ps;
 221  209  
 222  210          s = *src;
 223  211          nchr = 0;
 224  212  
 225  213          if (dst == NULL) {
 226  214                  /*
 227  215                   * The fast path in the loop below is not safe if an ASCII
 228  216                   * character appears as anything but the first byte of a
 229  217                   * multibyte sequence. Check now to avoid doing it in the loop.
 230  218                   */
 231  219                  if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
 232  220                          errno = EILSEQ;
 233  221                          return ((size_t)-1);
 234  222                  }
 235  223                  for (;;) {
 236  224                          if (nms > 0 && (signed char)*s > 0)
 237  225                                  /*
 238  226                                   * Fast path for plain ASCII characters
 239  227                                   * excluding NUL.
 240  228                                   */
 241  229                                  nb = 1;
 242  230                          else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
 243  231                              (size_t)-1)
 244  232                                  /* Invalid sequence - mbrtowc() sets errno. */
 245  233                                  return ((size_t)-1);
 246  234                          else if (nb == 0 || nb == (size_t)-2)
 247  235                                  return (nchr);
 248  236                          s += nb;
 249  237                          nms -= nb;
 250  238                          nchr++;
 251  239                  }
 252  240                  /*NOTREACHED*/
 253  241          }
 254  242  
 255  243          /*
 256  244           * The fast path in the loop below is not safe if an ASCII
 257  245           * character appears as anything but the first byte of a
 258  246           * multibyte sequence. Check now to avoid doing it in the loop.
 259  247           */
 260  248          if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
 261  249                  errno = EILSEQ;
 262  250                  return ((size_t)-1);
 263  251          }
 264  252          while (len-- > 0) {
 265  253                  if (nms > 0 && (signed char)*s > 0) {
 266  254                          /*
 267  255                           * Fast path for plain ASCII characters
 268  256                           * excluding NUL.
 269  257                           */
 270  258                          *dst = (wchar_t)*s;
 271  259                          nb = 1;
 272  260                  } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
 273  261                      (size_t)-1) {
 274  262                          *src = s;
 275  263                          return ((size_t)-1);
 276  264                  } else if (nb == (size_t)-2) {
 277  265                          *src = s + nms;
 278  266                          return (nchr);
 279  267                  } else if (nb == 0) {
 280  268                          *src = NULL;
 281  269                          return (nchr);
 282  270                  }
 283  271                  s += nb;
 284  272                  nms -= nb;
 285  273                  nchr++;
 286  274                  dst++;
 287  275          }
 288  276          *src = s;
 289  277          return (nchr);
 290  278  }
 291  279  
 292  280  static size_t
 293  281  _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
 294  282  {
 295  283          _UTF8State *us;
 296  284          unsigned char lead;
 297  285          int i, len;
 298  286  
 299  287          us = (_UTF8State *)ps;
 300  288  
 301  289          if (us->want != 0) {
 302  290                  errno = EINVAL;
 303  291                  return ((size_t)-1);
 304  292          }
 305  293  
 306  294          if (s == NULL)
 307  295                  /* Reset to initial shift state (no-op) */
 308  296                  return (1);
 309  297  
 310  298          /*
 311  299           * Determine the number of octets needed to represent this character.
 312  300           * We always output the shortest sequence possible. Also specify the
 313  301           * first few bits of the first octet, which contains the information
 314  302           * about the sequence length.
 315  303           */
 316  304          if ((wc & ~0x7f) == 0) {
 317  305                  /* Fast path for plain ASCII characters. */
 318  306                  *s = (char)wc;
 319  307                  return (1);
 320  308          } else if ((wc & ~0x7ff) == 0) {
 321  309                  lead = 0xc0;
 322  310                  len = 2;
 323  311          } else if ((wc & ~0xffff) == 0) {
 324  312                  lead = 0xe0;
 325  313                  len = 3;
 326  314          } else if ((wc & ~0x1fffff) == 0) {
 327  315                  lead = 0xf0;
 328  316                  len = 4;
 329  317  #if 0
 330  318          /* Again, 5 and 6 byte encodings are simply not permitted */
 331  319          } else if ((wc & ~0x3ffffff) == 0) {
 332  320                  lead = 0xf8;
 333  321                  len = 5;
 334  322          } else if ((wc & ~0x7fffffff) == 0) {
 335  323                  lead = 0xfc;
 336  324                  len = 6;
 337  325  #endif
 338  326          } else {
 339  327                  errno = EILSEQ;
 340  328                  return ((size_t)-1);
 341  329          }
 342  330  
 343  331          /*
 344  332           * Output the octets representing the character in chunks
 345  333           * of 6 bits, least significant last. The first octet is
 346  334           * a special case because it contains the sequence length
 347  335           * information.
 348  336           */
 349  337          for (i = len - 1; i > 0; i--) {
 350  338                  s[i] = (wc & 0x3f) | 0x80;
 351  339                  wc >>= 6;
 352  340          }
 353  341          *s = (wc & 0xff) | lead;
 354  342  
 355  343          return (len);
 356  344  }
 357  345  
 358  346  static size_t
 359  347  _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
 360  348      size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 361  349  {
 362  350          _UTF8State *us;
 363  351          char buf[MB_LEN_MAX];
 364  352          const wchar_t *s;
 365  353          size_t nbytes;
 366  354          size_t nb;
 367  355  
 368  356          us = (_UTF8State *)ps;
 369  357  
 370  358          if (us->want != 0) {
 371  359                  errno = EINVAL;
 372  360                  return ((size_t)-1);
 373  361          }
 374  362  
 375  363          s = *src;
 376  364          nbytes = 0;
 377  365  
 378  366          if (dst == NULL) {
 379  367                  while (nwc-- > 0) {
 380  368                          if (0 <= *s && *s < 0x80)
 381  369                                  /* Fast path for plain ASCII characters. */
 382  370                                  nb = 1;
 383  371                          else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
 384  372                              (size_t)-1)
 385  373                                  /* Invalid character - wcrtomb() sets errno. */
 386  374                                  return ((size_t)-1);
 387  375                          if (*s == L'\0')
 388  376                                  return (nbytes + nb - 1);
 389  377                          s++;
 390  378                          nbytes += nb;
 391  379                  }
 392  380                  return (nbytes);
 393  381          }
 394  382  
 395  383          while (len > 0 && nwc-- > 0) {
 396  384                  if (0 <= *s && *s < 0x80) {
 397  385                          /* Fast path for plain ASCII characters. */
 398  386                          nb = 1;
 399  387                          *dst = *s;
 400  388                  } else if (len > (size_t)MB_CUR_MAX) {
 401  389                          /* Enough space to translate in-place. */
 402  390                          if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
 403  391                                  *src = s;
 404  392                                  return ((size_t)-1);
 405  393                          }
 406  394                  } else {
 407  395                          /*
 408  396                           * May not be enough space; use temp. buffer.
 409  397                           */
 410  398                          if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
 411  399                                  *src = s;
 412  400                                  return ((size_t)-1);
 413  401                          }
 414  402                          if (nb > (int)len)
 415  403                                  /* MB sequence for character won't fit. */
 416  404                                  break;
 417  405                          (void) memcpy(dst, buf, nb);
 418  406                  }
 419  407                  if (*s == L'\0') {
 420  408                          *src = NULL;
 421  409                          return (nbytes + nb - 1);
 422  410                  }
 423  411                  s++;
 424  412                  dst += nb;
 425  413                  len -= nb;
 426  414                  nbytes += nb;
 427  415          }
 428  416          *src = s;
 429  417          return (nbytes);
 430  418  }

↓ open down ↓

342 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX