illumos-localedef Wdiff usr/src/cmd/localedef/wide.c

Print this page

3154 Nonconforming tolower and toupper with UTF-8 locales

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/localedef/wide.c
          +++ new/usr/src/cmd/localedef/wide.c

   1    1  /*
   2    2   * This file and its contents are supplied under the terms of the
   3    3   * Common Development and Distribution License ("CDDL"), version 1.0.
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13   13   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  14   14   * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
  15   15   */
  16   16  
  17   17  /*
  18   18   * The functions in this file convert from the standard multibyte forms
  19   19   * to the wide character forms used internally by libc.  Unfortunately,
  20   20   * this approach means that we need a method for each and every encoding.
  21   21   */
  22   22  
  23   23  #include <stdlib.h>
  24   24  #include <wchar.h>
  25   25  #include <string.h>
  26   26  #include <note.h>
  27   27  #include <sys/types.h>
  28   28  #include "localedef.h"
  29   29  
  30   30  static int towide_none(wchar_t *, const char *, unsigned);
  31   31  static int towide_utf8(wchar_t *, const char *, unsigned);
  32   32  static int towide_big5(wchar_t *, const char *, unsigned);
  33   33  static int towide_gbk(wchar_t *, const char *, unsigned);
  34   34  static int towide_gb2312(wchar_t *, const char *, unsigned);
  35   35  static int towide_gb18030(wchar_t *, const char *, unsigned);
  36   36  static int towide_mskanji(wchar_t *, const char *, unsigned);
  37   37  static int towide_euccn(wchar_t *, const char *, unsigned);
  38   38  static int towide_eucjp(wchar_t *, const char *, unsigned);
  39   39  static int towide_euckr(wchar_t *, const char *, unsigned);
  40   40  static int towide_euctw(wchar_t *, const char *, unsigned);
  41   41  
  42   42  static int tomb_none(char *, wchar_t);
  43   43  static int tomb_utf8(char *, wchar_t);
  44   44  static int tomb_mbs(char *, wchar_t);
  45   45  
  46   46  static int (*_towide)(wchar_t *, const char *, unsigned) = towide_none;
  47   47  static int (*_tomb)(char *, wchar_t) = tomb_none;
  48   48  static const char *_encoding = "NONE";
  49   49  static int _nbits = 7;
  50   50  
  51   51  /*
  52   52   * Table of supported encodings.  We only bother to list the multibyte
  53   53   * encodings here, because single byte locales are handled by "NONE".
  54   54   */
  55   55  static struct {
  56   56          const char *name;
  57   57          /* the name that the underlying libc implementation uses */
  58   58          const char *cname;
  59   59          /* the maximum number of bits required for priorities */
  60   60          int nbits;
  61   61          int (*towide)(wchar_t *, const char *, unsigned);
  62   62          int (*tomb)(char *, wchar_t);
  63   63  } mb_encodings[] = {
  64   64          /*
  65   65           * UTF8 values max out at 0x1fffff (although in theory there could
  66   66           * be later extensions, but it won't happen.)  This means we only need
  67   67           * 21 bits to be able to encode the entire range of priorities.
  68   68           */
  69   69          { "UTF-8",      "UTF-8",        21, towide_utf8, tomb_utf8 },
  70   70          { "UTF8",       "UTF-8",        21, towide_utf8, tomb_utf8 },
  71   71          { "utf8",       "UTF-8",        21, towide_utf8, tomb_utf8 },
  72   72          { "utf-8",      "UTF-8",        21, towide_utf8, tomb_utf8 },
  73   73  
  74   74          { "EUC-CN",     "EUC-CN",       16, towide_euccn, tomb_mbs },
  75   75          { "eucCN",      "EUC-CN",       16, towide_euccn, tomb_mbs },
  76   76          /*
  77   77           * Because the 3-byte form of EUC-JP uses the same leading byte,
  78   78           * only 17 bits required to provide unique priorities.  (The low
  79   79           * bit of that first byte is set.)  By setting this value low,
  80   80           * we can get by with only 3 bytes in the strxfrm expansion.
  81   81           */
  82   82          { "EUC-JP",     "EUC-JP",       17, towide_eucjp, tomb_mbs },
  83   83          { "eucJP",      "EUC-JP",       17, towide_eucjp, tomb_mbs },
  84   84  
  85   85          { "EUC-KR",     "EUC-KR",       16, towide_euckr, tomb_mbs },
  86   86          { "eucKR",      "EUC-KR",       16, towide_euckr, tomb_mbs },
  87   87          /*
  88   88           * EUC-TW uses 2 bytes most of the time, but 4 bytes if the
  89   89           * high order byte is 0x8E.  However, with 4 byte encodings,
  90   90           * the third byte will be A0-B0.  So we only need to consider
  91   91           * the lower order 24 bits for collation.
  92   92           */
  93   93          { "EUC-TW",     "EUC-TW",       24, towide_euctw, tomb_mbs },
  94   94          { "eucTW",      "EUC-TW",       24, towide_euctw, tomb_mbs },
  95   95  
  96   96          { "MS_Kanji",   "MSKanji",      16, towide_mskanji, tomb_mbs },
  97   97          { "MSKanji",    "MSKanji",      16, towide_mskanji, tomb_mbs },
  98   98          { "PCK",        "MSKanji",      16, towide_mskanji, tomb_mbs },
  99   99          { "SJIS",       "MSKanji",      16, towide_mskanji, tomb_mbs },
 100  100          { "Shift_JIS",  "MSKanji",      16, towide_mskanji, tomb_mbs },
 101  101  
 102  102          { "BIG5",       "BIG5",         16, towide_big5, tomb_mbs },
 103  103          { "big5",       "BIG5",         16, towide_big5, tomb_mbs },
 104  104          { "Big5",       "BIG5",         16, towide_big5, tomb_mbs },
 105  105  
 106  106          { "GBK",        "GBK",          16, towide_gbk, tomb_mbs },
 107  107  
 108  108          /*
 109  109           * GB18030 can get away with just 31 bits.  This is because the
 110  110           * high order bit is always set for 4 byte values, and the
 111  111           * at least one of the other bits in that 4 byte value will
 112  112           * be non-zero.
 113  113           */
 114  114          { "GB18030",    "GB18030",      31, towide_gb18030, tomb_mbs },
 115  115  
 116  116          /*
 117  117           * This should probably be an alias for euc-cn, or vice versa.
 118  118           */
 119  119          { "GB2312",     "GB2312",       16, towide_gb2312, tomb_mbs },
 120  120  
 121  121          { NULL, NULL },
 122  122  };
 123  123  
 124  124  static char *
 125  125  show_mb(const char *mb)
 126  126  {
 127  127          static char buf[64];
 128  128  
 129  129          /* ASCII stuff we just print */
 130  130          if (isascii(*mb) && isgraph(*mb)) {
 131  131                  buf[0] = *mb;
 132  132                  buf[1] = 0;
 133  133                  return (buf);
 134  134          }
 135  135          buf[0] = 0;
 136  136          while (*mb != 0) {
 137  137                  char scr[8];
 138  138                  (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb);
 139  139                  (void) strlcat(buf, scr, sizeof (buf));
 140  140                  mb++;
 141  141          }
 142  142          return (buf);
 143  143  }
 144  144  
 145  145  static char     *widemsg;
 146  146  
 147  147  void
 148  148  werr(const char *fmt, ...)
 149  149  {
 150  150          char    *msg;
 151  151  
 152  152          va_list va;
 153  153          va_start(va, fmt);

↓ open down ↓

153 lines elided

↑ open up ↑

 154  154          (void) vasprintf(&msg, fmt, va);
 155  155          va_end(va);
 156  156  
 157  157          free(widemsg);
 158  158          widemsg = msg;
 159  159  }
 160  160  
 161  161  /*
 162  162   * This is used for 8-bit encodings.
 163  163   */
      164 +/* ARGSUSED */
 164  165  int
 165  166  towide_none(wchar_t *c, const char *mb, unsigned n)
 166  167  {
 167  168          _NOTE(ARGUNUSED(n));
 168  169  
 169  170          if (mb_cur_max != 1) {
 170  171                  werr("invalid or unsupported multibyte locale");
 171  172                  return (-1);
 172  173          }
 173  174          *c = (uint8_t)*mb;

 174  175          return (1);
 175  176  }
 176  177  
 177  178  int
 178  179  tomb_none(char *mb, wchar_t wc)
 179  180  {
 180  181          if (mb_cur_max != 1) {
 181  182                  werr("invalid or unsupported multibyte locale");
 182  183                  return (-1);
 183  184          }
 184  185          *(uint8_t *)mb = (wc & 0xff);
 185  186          mb[1] = 0;
 186  187          return (1);
 187  188  }
 188  189  
 189  190  /*
 190  191   * UTF-8 stores wide characters in UTF-32 form.
 191  192   */
 192  193  int
 193  194  towide_utf8(wchar_t *wc, const char *mb, unsigned n)
 194  195  {
 195  196          wchar_t c;
 196  197          int     nb;
 197  198          int     lv;     /* lowest legal value */
 198  199          int     i;
 199  200          const uint8_t *s = (const uint8_t *)mb;
 200  201  
 201  202          c = *s;
 202  203  
 203  204          if ((c & 0x80) == 0) {
 204  205                  /* 7-bit ASCII */
 205  206                  *wc = c;
 206  207                  return (1);
 207  208          } else if ((c & 0xe0) == 0xc0) {
 208  209                  /* u80-u7ff - two bytes encoded */
 209  210                  nb = 2;
 210  211                  lv = 0x80;
 211  212                  c &= ~0xe0;
 212  213          } else if ((c & 0xf0) == 0xe0) {
 213  214                  /* u800-uffff - three bytes encoded */
 214  215                  nb = 3;
 215  216                  lv = 0x800;
 216  217                  c &= ~0xf0;
 217  218          } else if ((c & 0xf8) == 0xf0) {
 218  219                  /* u1000-u1fffff - four bytes encoded */
 219  220                  nb = 4;
 220  221                  lv = 0x1000;
 221  222                  c &= ~0xf8;
 222  223          } else {
 223  224                  /* 5 and 6 byte encodings are not legal unicode */
 224  225                  werr("utf8 encoding too large (%s)", show_mb(mb));
 225  226                  return (-1);
 226  227          }
 227  228          if (nb > n) {
 228  229                  werr("incomplete utf8 sequence (%s)", show_mb(mb));
 229  230                  return (-1);
 230  231          }
 231  232  
 232  233          for (i = 1; i < nb; i++) {
 233  234                  if (((s[i]) & 0xc0) != 0x80) {
 234  235                          werr("illegal utf8 byte (%x)", s[i]);
 235  236                          return (-1);
 236  237                  }
 237  238                  c <<= 6;
 238  239                  c |= (s[i] & 0x3f);
 239  240          }
 240  241  
 241  242          if (c < lv) {
 242  243                  werr("illegal redundant utf8 encoding (%s)", show_mb(mb));
 243  244                  return (-1);
 244  245          }
 245  246          *wc = c;
 246  247          return (nb);
 247  248  }
 248  249  
 249  250  int
 250  251  tomb_utf8(char *mb, wchar_t wc)
 251  252  {
 252  253          uint8_t *s = (uint8_t *)mb;
 253  254          uint8_t msk;
 254  255          int cnt;
 255  256          int i;
 256  257  
 257  258          if (wc <= 0x7f) {
 258  259                  s[0] = wc & 0x7f;
 259  260                  s[1] = 0;
 260  261                  return (1);
 261  262          }
 262  263          if (wc <= 0x7ff) {
 263  264                  cnt = 2;
 264  265                  msk = 0xc0;
 265  266          } else if (wc <= 0xffff) {
 266  267                  cnt = 3;
 267  268                  msk = 0xe0;
 268  269          } else if (wc <= 0x1fffff) {
 269  270                  cnt = 4;
 270  271                  msk = 0xf0;
 271  272          } else {
 272  273                  werr("illegal uf8 char (%x)", wc);
 273  274                  return (-1);
 274  275          }
 275  276          for (i = cnt - 1; i; i--) {
 276  277                  s[i] = (wc & 0x3f) | 0x80;
 277  278                  wc >>= 6;
 278  279          }
 279  280          s[0] = (msk) | wc;
 280  281          s[cnt] = 0;
 281  282          return (cnt);
 282  283  }
 283  284  
 284  285  /*
 285  286   * Several encodings share a simplistic dual byte encoding.  In these
 286  287   * forms, they all indicate that a two byte sequence is to be used if
 287  288   * the first byte has its high bit set.  They all store this simple
 288  289   * encoding as a 16-bit value, although a great many of the possible
 289  290   * code points are not used in most character sets.  This gives a possible
 290  291   * set of just over 32,000 valid code points.
 291  292   *
 292  293   * 0x00 - 0x7f          - 1 byte encoding
 293  294   * 0x80 - 0x7fff        - illegal
 294  295   * 0x8000 - 0xffff      - 2 byte encoding
 295  296   */
 296  297  static int
 297  298  towide_dbcs(wchar_t *wc, const char *mb, unsigned n)
 298  299  {
 299  300          wchar_t c;
 300  301  
 301  302          c = *(uint8_t *)mb;
 302  303  
 303  304          if ((c & 0x80) == 0) {
 304  305                  /* 7-bit */
 305  306                  *wc = c;
 306  307                  return (1);
 307  308          }
 308  309          if (n < 2) {
 309  310                  werr("incomplete character sequence (%s)", show_mb(mb));
 310  311                  return (-1);
 311  312          }
 312  313  
 313  314          /* Store both bytes as a single 16-bit wide. */
 314  315          c <<= 8;
 315  316          c |= (uint8_t)(mb[1]);
 316  317          *wc = c;
 317  318          return (2);
 318  319  }
 319  320  
 320  321  /*
 321  322   * Most multibyte locales just convert the wide character to the multibyte
 322  323   * form by stripping leading null bytes, and writing the 32-bit quantity
 323  324   * in big-endian order.
 324  325   */
 325  326  int
 326  327  tomb_mbs(char *mb, wchar_t wc)
 327  328  {
 328  329          uint8_t *s = (uint8_t *)mb;
 329  330          int     n = 0, c;
 330  331  
 331  332          if ((wc & 0xff000000U) != 0) {
 332  333                  n = 4;
 333  334          } else if ((wc & 0x00ff0000U) != 0) {
 334  335                  n = 3;
 335  336          } else if ((wc & 0x0000ff00U) != 0) {
 336  337                  n = 2;
 337  338          } else {
 338  339                  n = 1;
 339  340          }
 340  341          c = n;
 341  342          while (n) {
 342  343                  n--;
 343  344                  s[n] = wc & 0xff;
 344  345                  wc >>= 8;
 345  346          }
 346  347          /* ensure null termination */
 347  348          s[c] = 0;
 348  349          return (c);
 349  350  }
 350  351  
 351  352  
 352  353  /*
 353  354   * big5 is a simple dual byte character set.
 354  355   */
 355  356  int
 356  357  towide_big5(wchar_t *wc, const char *mb, unsigned n)
 357  358  {
 358  359          return (towide_dbcs(wc, mb, n));
 359  360  }
 360  361  
 361  362  /*
 362  363   * GBK encodes wides in the same way that big5 does, the high order
 363  364   * bit of the first byte indicates a double byte character.
 364  365   */
 365  366  int
 366  367  towide_gbk(wchar_t *wc, const char *mb, unsigned n)
 367  368  {
 368  369          return (towide_dbcs(wc, mb, n));
 369  370  }
 370  371  
 371  372  /*
 372  373   * GB2312 is another DBCS.  Its cleaner than others in that the second
 373  374   * byte does not encode ASCII, but it supports characters.
 374  375   */
 375  376  int
 376  377  towide_gb2312(wchar_t *wc, const char *mb, unsigned n)
 377  378  {
 378  379          return (towide_dbcs(wc, mb, n));
 379  380  }
 380  381  
 381  382  /*
 382  383   * GB18030.  This encodes as 8, 16, or 32-bits.
 383  384   * 7-bit values are in 1 byte,  4 byte sequences are used when
 384  385   * the second byte encodes 0x30-39 and all other sequences are 2 bytes.
 385  386   */
 386  387  int
 387  388  towide_gb18030(wchar_t *wc, const char *mb, unsigned n)
 388  389  {
 389  390          wchar_t c;
 390  391  
 391  392          c = *(uint8_t *)mb;
 392  393  
 393  394          if ((c & 0x80) == 0) {
 394  395                  /* 7-bit */
 395  396                  *wc = c;
 396  397                  return (1);
 397  398          }
 398  399          if (n < 2) {
 399  400                  werr("incomplete character sequence (%s)", show_mb(mb));
 400  401                  return (-1);
 401  402          }
 402  403  
 403  404          /* pull in the second byte */
 404  405          c <<= 8;
 405  406          c |= (uint8_t)(mb[1]);
 406  407  
 407  408          if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) {
 408  409                  if (n < 4) {
 409  410                          werr("incomplete 4-byte character sequence (%s)",
 410  411                              show_mb(mb));
 411  412                          return (-1);
 412  413                  }
 413  414                  c <<= 8;
 414  415                  c |= (uint8_t)(mb[2]);
 415  416                  c <<= 8;
 416  417                  c |= (uint8_t)(mb[3]);
 417  418                  *wc = c;
 418  419                  return (4);
 419  420          }
 420  421  
 421  422          *wc = c;
 422  423          return (2);
 423  424  }
 424  425  
 425  426  /*
 426  427   * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it
 427  428   * also has a range of single byte characters above 0x80.  (0xa1-0xdf).
 428  429   */
 429  430  int
 430  431  towide_mskanji(wchar_t *wc, const char *mb, unsigned n)
 431  432  {
 432  433          wchar_t c;
 433  434  
 434  435          c = *(uint8_t *)mb;
 435  436  
 436  437          if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) {
 437  438                  /* 7-bit */
 438  439                  *wc = c;
 439  440                  return (1);
 440  441          }
 441  442  
 442  443          if (n < 2) {
 443  444                  werr("incomplete character sequence (%s)", show_mb(mb));
 444  445                  return (-1);
 445  446          }
 446  447  
 447  448          /* Store both bytes as a single 16-bit wide. */
 448  449          c <<= 8;
 449  450          c |= (uint8_t)(mb[1]);
 450  451          *wc = c;
 451  452          return (2);
 452  453  }
 453  454  
 454  455  /*
 455  456   * EUC forms.  EUC encodings are "variable".  FreeBSD carries some additional
 456  457   * variable data to encode these, but we're going to treat each as independent
 457  458   * instead.  Its the only way we can sensibly move forward.
 458  459   *
 459  460   * Note that the way in which the different EUC forms vary is how wide
 460  461   * CS2 and CS3 are and what the first byte of them is.
 461  462   */
 462  463  static int
 463  464  towide_euc_impl(wchar_t *wc, const char *mb, unsigned n,
 464  465      uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
 465  466  {
 466  467          int i;
 467  468          int width;
 468  469          wchar_t c;
 469  470  
 470  471          c = *(uint8_t *)mb;
 471  472  
 472  473          /*
 473  474           * All variations of EUC encode 7-bit ASCII as one byte, and use
 474  475           * additional bytes for more than that.
 475  476           */
 476  477          if ((c & 0x80) == 0) {
 477  478                  /* 7-bit */
 478  479                  *wc = c;
 479  480                  return (1);
 480  481          }
 481  482  
 482  483          /*
 483  484           * All EUC variants reserve 0xa1-0xff to identify CS1, which
 484  485           * is always two bytes wide.  Note that unused CS will be zero,
 485  486           * and that cannot be true because we know that the high order
 486  487           * bit must be set.
 487  488           */
 488  489          if (c >= 0xa1) {
 489  490                  width = 2;
 490  491          } else if (c == cs2) {
 491  492                  width = cs2width;
 492  493          } else if (c == cs3) {
 493  494                  width = cs3width;
 494  495          }
 495  496  
 496  497          if (n < width) {
 497  498                  werr("incomplete character sequence (%s)", show_mb(mb));
 498  499                  return (-1);
 499  500          }
 500  501  
 501  502          for (i = 1; i < width; i++) {
 502  503                  /* pull in the next byte */
 503  504                  c <<= 8;
 504  505                  c |= (uint8_t)(mb[i]);
 505  506          }
 506  507  
 507  508          *wc = c;
 508  509          return (width);
 509  510  }
 510  511  
 511  512  /*
 512  513   * EUC-CN encodes as follows:
 513  514   *
 514  515   * Code set 0 (ASCII):                          0x21-0x7E
 515  516   * Code set 1 (CNS 11643-1992 Plane 1):         0xA1A1-0xFEFE
 516  517   * Code set 2:                                  unused
 517  518   * Code set 3:                                  unused
 518  519   */
 519  520  int
 520  521  towide_euccn(wchar_t *wc, const char *mb, unsigned n)
 521  522  {
 522  523          return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0));
 523  524  }
 524  525  
 525  526  /*
 526  527   * EUC-JP encodes as follows:
 527  528   *
 528  529   * Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E
 529  530   * Code set 1 (JIS X 0208):                     0xA1A1-0xFEFE
 530  531   * Code set 2 (half-width katakana):            0x8EA1-0x8EDF
 531  532   * Code set 3 (JIS X 0212-1990):                0x8FA1A1-0x8FFEFE
 532  533   */
 533  534  int
 534  535  towide_eucjp(wchar_t *wc, const char *mb, unsigned n)
 535  536  {
 536  537          return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3));
 537  538  }
 538  539  
 539  540  /*
 540  541   * EUC-KR encodes as follows:
 541  542   *
 542  543   * Code set 0 (ASCII or KS C 5636-1993):        0x21-0x7E
 543  544   * Code set 1 (KS C 5601-1992):                 0xA1A1-0xFEFE
 544  545   * Code set 2:                                  unused
 545  546   * Code set 3:                                  unused
 546  547   */
 547  548  int
 548  549  towide_euckr(wchar_t *wc, const char *mb, unsigned n)
 549  550  {
 550  551          return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0));
 551  552  }
 552  553  
 553  554  /*
 554  555   * EUC-TW encodes as follows:
 555  556   *
 556  557   * Code set 0 (ASCII):                          0x21-0x7E
 557  558   * Code set 1 (CNS 11643-1992 Plane 1):         0xA1A1-0xFEFE
 558  559   * Code set 2 (CNS 11643-1992 Planes 1-16):     0x8EA1A1A1-0x8EB0FEFE
 559  560   * Code set 3:                                  unused
 560  561   */
 561  562  int
 562  563  towide_euctw(wchar_t *wc, const char *mb, unsigned n)
 563  564  {
 564  565          return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0));
 565  566  }
 566  567  
 567  568  /*
 568  569   * Public entry points.
 569  570   */
 570  571  
 571  572  int
 572  573  to_wide(wchar_t *wc, const char *mb)
 573  574  {
 574  575          /* this won't fail hard */
 575  576          return (_towide(wc, mb, strlen(mb)));
 576  577  }
 577  578  
 578  579  int
 579  580  to_mb(char *mb, wchar_t wc)
 580  581  {
 581  582          int     rv;
 582  583  
 583  584          if ((rv = _tomb(mb, wc)) < 0) {
 584  585                  errf(widemsg);
 585  586                  free(widemsg);
 586  587                  widemsg = NULL;
 587  588          }
 588  589          return (rv);
 589  590  }
 590  591  
 591  592  char *
 592  593  to_mb_string(const wchar_t *wcs)
 593  594  {
 594  595          char    *mbs;
 595  596          char    *ptr;
 596  597          int     len;
 597  598  
 598  599          mbs = malloc((wcslen(wcs) * mb_cur_max) + 1);
 599  600          if (mbs == NULL) {
 600  601                  errf("out of memory");
 601  602                  return (NULL);
 602  603          }
 603  604          ptr = mbs;
 604  605          while (*wcs) {
 605  606                  if ((len = to_mb(ptr, *wcs)) < 0) {
 606  607                          INTERR;
 607  608                          free(mbs);
 608  609                          return (NULL);
 609  610                  }
 610  611                  wcs++;
 611  612                  ptr += len;
 612  613          }
 613  614          *ptr = 0;
 614  615          return (mbs);
 615  616  }
 616  617  
 617  618  void
 618  619  set_wide_encoding(const char *encoding)
 619  620  {
 620  621          int i;
 621  622  
 622  623          _towide = towide_none;
 623  624          _tomb = tomb_none;
 624  625          _encoding = "NONE";
 625  626          _nbits = 8;
 626  627  
 627  628          for (i = 0; mb_encodings[i].name; i++) {
 628  629                  if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
 629  630                          _towide = mb_encodings[i].towide;
 630  631                          _tomb = mb_encodings[i].tomb;
 631  632                          _encoding = mb_encodings[i].cname;
 632  633                          _nbits = mb_encodings[i].nbits;
 633  634                          break;
 634  635                  }
 635  636          }
 636  637  }
 637  638  
 638  639  const char *
 639  640  get_wide_encoding(void)
 640  641  {
 641  642          return (_encoding);
 642  643  }
 643  644  
 644  645  int
 645  646  max_wide(void)
 646  647  {
 647  648          return ((int)((1U << _nbits) - 1));
 648  649  }

↓ open down ↓

475 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX