1 /*
   2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   3  * Copyright (c) 2002-2004 Tim J. Robbins
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  */
  27 
  28 /*
  29  * PRC National Standard GB 18030-2000 encoding of Chinese text.
  30  *
  31  * See gb18030(5) for details.
  32  */
  33 
  34 #include "lint.h"
  35 #include <sys/types.h>
  36 #include <errno.h>
  37 #include "runetype.h"
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <wchar.h>
  41 #include "mblocal.h"
  42 
  43 
  44 static size_t   _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
  45                     const char *_RESTRICT_KYWD,
  46                     size_t, mbstate_t *_RESTRICT_KYWD);
  47 static int      _GB18030_mbsinit(const mbstate_t *);
  48 static size_t   _GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
  49                     mbstate_t *_RESTRICT_KYWD);
  50 
/*
 * Per-conversion state for the GB 18030 codec.  The conversion functions
 * in this file cast the caller's mbstate_t pointer to this type, so it
 * describes how that storage is used here.
 */
typedef struct {
        int     count;          /* bytes currently held in bytes[]; 0 means initial state */
        uchar_t bytes[4];       /* bytes of a sequence accumulated across mbrtowc calls */
} _GB18030State;
  55 
  56 int
  57 _GB18030_init(_RuneLocale *rl)
  58 {
  59 
  60         __mbrtowc = _GB18030_mbrtowc;
  61         __wcrtomb = _GB18030_wcrtomb;
  62         __mbsinit = _GB18030_mbsinit;
  63         _CurrentRuneLocale = rl;
  64         __ctype[520] = 4;
  65         charset_is_ascii = 0;
  66 
  67         return (0);
  68 }
  69 
  70 static int
  71 _GB18030_mbsinit(const mbstate_t *ps)
  72 {
  73 
  74         return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
  75 }
  76 
  77 static size_t
  78 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
  79     size_t n, mbstate_t *_RESTRICT_KYWD ps)
  80 {
  81         _GB18030State *gs;
  82         wchar_t wch;
  83         int ch, len, ocount;
  84         size_t ncopy;
  85 
  86         gs = (_GB18030State *)ps;
  87 
  88         if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
  89                 errno = EINVAL;
  90                 return ((size_t)-1);
  91         }
  92 
  93         if (s == NULL) {
  94                 s = "";
  95                 n = 1;
  96                 pwc = NULL;
  97         }
  98 
  99         ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
 100         (void) memcpy(gs->bytes + gs->count, s, ncopy);
 101         ocount = gs->count;
 102         gs->count += ncopy;
 103         s = (char *)gs->bytes;
 104         n = gs->count;
 105 
 106         if (n == 0)
 107                 /* Incomplete multibyte sequence */
 108                 return ((size_t)-2);
 109 
 110         /*
 111          * Single byte:         [00-7f]
 112          * Two byte:            [81-fe][40-7e,80-fe]
 113          * Four byte:           [81-fe][30-39][81-fe][30-39]
 114          */
 115         ch = (unsigned char)*s++;
 116         if (ch <= 0x7f) {
 117                 len = 1;
 118                 wch = ch;
 119         } else if (ch >= 0x81 && ch <= 0xfe) {
 120                 wch = ch;
 121                 if (n < 2)
 122                         return ((size_t)-2);
 123                 ch = (unsigned char)*s++;
 124                 if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
 125                         wch = (wch << 8) | ch;
 126                         len = 2;
 127                 } else if (ch >= 0x30 && ch <= 0x39) {
 128                         /*
 129                          * Strip high bit off the wide character we will
 130                          * eventually output so that it is positive when
 131                          * cast to wint_t on 32-bit twos-complement machines.
 132                          */
 133                         wch = ((wch & 0x7f) << 8) | ch;
 134                         if (n < 3)
 135                                 return ((size_t)-2);
 136                         ch = (unsigned char)*s++;
 137                         if (ch < 0x81 || ch > 0xfe)
 138                                 goto ilseq;
 139                         wch = (wch << 8) | ch;
 140                         if (n < 4)
 141                                 return ((size_t)-2);
 142                         ch = (unsigned char)*s++;
 143                         if (ch < 0x30 || ch > 0x39)
 144                                 goto ilseq;
 145                         wch = (wch << 8) | ch;
 146                         len = 4;
 147                 } else
 148                         goto ilseq;
 149         } else
 150                 goto ilseq;
 151 
 152         if (pwc != NULL)
 153                 *pwc = wch;
 154         gs->count = 0;
 155         return (wch == L'\0' ? 0 : len - ocount);
 156 ilseq:
 157         errno = EILSEQ;
 158         return ((size_t)-1);
 159 }
 160 
 161 static size_t
 162 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
 163     mbstate_t *_RESTRICT_KYWD ps)
 164 {
 165         _GB18030State *gs;
 166         size_t len;
 167         int c;
 168 
 169         gs = (_GB18030State *)ps;
 170 
 171         if (gs->count != 0) {
 172                 errno = EINVAL;
 173                 return ((size_t)-1);
 174         }
 175 
 176         if (s == NULL)
 177                 /* Reset to initial shift state (no-op) */
 178                 return (1);
 179         if ((wc & ~0x7fffffff) != 0)
 180                 goto ilseq;
 181         if (wc & 0x7f000000) {
 182                 /* Replace high bit that mbrtowc() removed. */
 183                 wc |= 0x80000000;
 184                 c = (wc >> 24) & 0xff;
 185                 if (c < 0x81 || c > 0xfe)
 186                         goto ilseq;
 187                 *s++ = c;
 188                 c = (wc >> 16) & 0xff;
 189                 if (c < 0x30 || c > 0x39)
 190                         goto ilseq;
 191                 *s++ = c;
 192                 c = (wc >> 8) & 0xff;
 193                 if (c < 0x81 || c > 0xfe)
 194                         goto ilseq;
 195                 *s++ = c;
 196                 c = wc & 0xff;
 197                 if (c < 0x30 || c > 0x39)
 198                         goto ilseq;
 199                 *s++ = c;
 200                 len = 4;
 201         } else if (wc & 0x00ff0000)
 202                 goto ilseq;
 203         else if (wc & 0x0000ff00) {
 204                 c = (wc >> 8) & 0xff;
 205                 if (c < 0x81 || c > 0xfe)
 206                         goto ilseq;
 207                 *s++ = c;
 208                 c = wc & 0xff;
 209                 if (c < 0x40 || c == 0x7f || c == 0xff)
 210                         goto ilseq;
 211                 *s++ = c;
 212                 len = 2;
 213         } else if (wc <= 0x7f) {
 214                 *s++ = wc;
 215                 len = 1;
 216         } else
 217                 goto ilseq;
 218 
 219         return (len);
 220 ilseq:
 221         errno = EILSEQ;
 222         return ((size_t)-1);
 223 }