1 /*
   2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
   3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
   4  * Copyright (c) 2002-2004 Tim J. Robbins
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28 
  29 /*
  30  * PRC National Standard GB 18030-2000 encoding of Chinese text.
  31  *
  32  * See gb18030(5) for details.
  33  */
  34 
  35 #include "lint.h"
  36 #include <sys/types.h>
  37 #include <errno.h>
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <wchar.h>
  41 #include "mblocal.h"
  42 #include "lctype.h"
  43 
  44 
  45 static size_t   _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
  46                     const char *_RESTRICT_KYWD,
  47                     size_t, mbstate_t *_RESTRICT_KYWD);
  48 static int      _GB18030_mbsinit(const mbstate_t *);
  49 static size_t   _GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
  50                     mbstate_t *_RESTRICT_KYWD);
  51 static size_t   _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
  52                     const char **_RESTRICT_KYWD, size_t, size_t,
  53                     mbstate_t *_RESTRICT_KYWD);
  54 static size_t   _GB18030_wcsnrtombs(char *_RESTRICT_KYWD,
  55                     const wchar_t **_RESTRICT_KYWD, size_t, size_t,
  56                     mbstate_t *_RESTRICT_KYWD);
  57 
  58 
  59 typedef struct {
  60         int     count;
  61         uchar_t bytes[4];
  62 } _GB18030State;
  63 
  64 void
  65 _GB18030_init(struct lc_ctype *lct)
  66 {
  67 
  68         lct->lc_mbrtowc = _GB18030_mbrtowc;
  69         lct->lc_wcrtomb = _GB18030_wcrtomb;
  70         lct->lc_mbsinit = _GB18030_mbsinit;
  71         lct->lc_mbsnrtowcs = _GB18030_mbsnrtowcs;
  72         lct->lc_wcsnrtombs = _GB18030_wcsnrtombs;
  73         lct->lc_max_mblen = 4;
  74         lct->lc_is_ascii = 0;
  75 }
  76 
  77 static int
  78 _GB18030_mbsinit(const mbstate_t *ps)
  79 {
  80 
  81         return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
  82 }
  83 
  84 static size_t
  85 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
  86     size_t n, mbstate_t *_RESTRICT_KYWD ps)
  87 {
  88         _GB18030State *gs;
  89         wchar_t wch;
  90         int ch, len, ocount;
  91         size_t ncopy;
  92 
  93         gs = (_GB18030State *)ps;
  94 
  95         if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
  96                 errno = EINVAL;
  97                 return ((size_t)-1);
  98         }
  99 
 100         if (s == NULL) {
 101                 s = "";
 102                 n = 1;
 103                 pwc = NULL;
 104         }
 105 
 106         ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
 107         (void) memcpy(gs->bytes + gs->count, s, ncopy);
 108         ocount = gs->count;
 109         gs->count += ncopy;
 110         s = (char *)gs->bytes;
 111         n = gs->count;
 112 
 113         if (n == 0)
 114                 /* Incomplete multibyte sequence */
 115                 return ((size_t)-2);
 116 
 117         /*
 118          * Single byte:         [00-7f]
 119          * Two byte:            [81-fe][40-7e,80-fe]
 120          * Four byte:           [81-fe][30-39][81-fe][30-39]
 121          */
 122         ch = (unsigned char)*s++;
 123         if (ch <= 0x7f) {
 124                 len = 1;
 125                 wch = ch;
 126         } else if (ch >= 0x81 && ch <= 0xfe) {
 127                 wch = ch;
 128                 if (n < 2)
 129                         return ((size_t)-2);
 130                 ch = (unsigned char)*s++;
 131                 if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
 132                         wch = (wch << 8) | ch;
 133                         len = 2;
 134                 } else if (ch >= 0x30 && ch <= 0x39) {
 135                         /*
 136                          * Strip high bit off the wide character we will
 137                          * eventually output so that it is positive when
 138                          * cast to wint_t on 32-bit twos-complement machines.
 139                          */
 140                         wch = ((wch & 0x7f) << 8) | ch;
 141                         if (n < 3)
 142                                 return ((size_t)-2);
 143                         ch = (unsigned char)*s++;
 144                         if (ch < 0x81 || ch > 0xfe)
 145                                 goto ilseq;
 146                         wch = (wch << 8) | ch;
 147                         if (n < 4)
 148                                 return ((size_t)-2);
 149                         ch = (unsigned char)*s++;
 150                         if (ch < 0x30 || ch > 0x39)
 151                                 goto ilseq;
 152                         wch = (wch << 8) | ch;
 153                         len = 4;
 154                 } else
 155                         goto ilseq;
 156         } else
 157                 goto ilseq;
 158 
 159         if (pwc != NULL)
 160                 *pwc = wch;
 161         gs->count = 0;
 162         return (wch == L'\0' ? 0 : len - ocount);
 163 ilseq:
 164         errno = EILSEQ;
 165         return ((size_t)-1);
 166 }
 167 
 168 static size_t
 169 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
 170     mbstate_t *_RESTRICT_KYWD ps)
 171 {
 172         _GB18030State *gs;
 173         size_t len;
 174         int c;
 175 
 176         gs = (_GB18030State *)ps;
 177 
 178         if (gs->count != 0) {
 179                 errno = EINVAL;
 180                 return ((size_t)-1);
 181         }
 182 
 183         if (s == NULL)
 184                 /* Reset to initial shift state (no-op) */
 185                 return (1);
 186         if ((wc & ~0x7fffffff) != 0)
 187                 goto ilseq;
 188         if (wc & 0x7f000000) {
 189                 /* Replace high bit that mbrtowc() removed. */
 190                 wc |= 0x80000000;
 191                 c = (wc >> 24) & 0xff;
 192                 if (c < 0x81 || c > 0xfe)
 193                         goto ilseq;
 194                 *s++ = c;
 195                 c = (wc >> 16) & 0xff;
 196                 if (c < 0x30 || c > 0x39)
 197                         goto ilseq;
 198                 *s++ = c;
 199                 c = (wc >> 8) & 0xff;
 200                 if (c < 0x81 || c > 0xfe)
 201                         goto ilseq;
 202                 *s++ = c;
 203                 c = wc & 0xff;
 204                 if (c < 0x30 || c > 0x39)
 205                         goto ilseq;
 206                 *s++ = c;
 207                 len = 4;
 208         } else if (wc & 0x00ff0000)
 209                 goto ilseq;
 210         else if (wc & 0x0000ff00) {
 211                 c = (wc >> 8) & 0xff;
 212                 if (c < 0x81 || c > 0xfe)
 213                         goto ilseq;
 214                 *s++ = c;
 215                 c = wc & 0xff;
 216                 if (c < 0x40 || c == 0x7f || c == 0xff)
 217                         goto ilseq;
 218                 *s++ = c;
 219                 len = 2;
 220         } else if (wc <= 0x7f) {
 221                 *s++ = wc;
 222                 len = 1;
 223         } else
 224                 goto ilseq;
 225 
 226         return (len);
 227 ilseq:
 228         errno = EILSEQ;
 229         return ((size_t)-1);
 230 }
 231 
 232 static size_t
 233 _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
 234     const char **_RESTRICT_KYWD src, size_t nms, size_t len,
 235     mbstate_t *_RESTRICT_KYWD ps)
 236 {
 237         return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB18030_mbrtowc));
 238 }
 239 
 240 static size_t
 241 _GB18030_wcsnrtombs(char *_RESTRICT_KYWD dst,
 242     const wchar_t **_RESTRICT_KYWD src, size_t nwc, size_t len,
 243     mbstate_t *_RESTRICT_KYWD ps)
 244 {
 245         return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB18030_wcrtomb));
 246 }