1 /*
2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright (c) 2002-2004 Tim J. Robbins
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 /*
29 * PRC National Standard GB 18030-2000 encoding of Chinese text.
30 *
31 * See gb18030(5) for details.
32 */
33
34 #include "lint.h"
35 #include <sys/types.h>
36 #include <errno.h>
37 #include "runetype.h"
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42
43
44 static size_t _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
45 const char *_RESTRICT_KYWD,
46 size_t, mbstate_t *_RESTRICT_KYWD);
47 static int _GB18030_mbsinit(const mbstate_t *);
48 static size_t _GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
49 mbstate_t *_RESTRICT_KYWD);
50
/*
 * Conversion state kept between calls.  The routines in this file access
 * it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	int count;		/* bytes currently held in bytes[]; 0 means initial state */
	uchar_t bytes[4];	/* bytes of the sequence gathered so far (at most four) */
} _GB18030State;
55
/*
 * Activate the GB18030 encoding: install this file's conversion routines
 * in the __mbrtowc, __wcrtomb and __mbsinit hooks and make `rl' the
 * current rune locale.
 *
 * Always returns 0.
 */
int
_GB18030_init(_RuneLocale *rl)
{

	__mbrtowc = _GB18030_mbrtowc;
	__wcrtomb = _GB18030_wcrtomb;
	__mbsinit = _GB18030_mbsinit;
	_CurrentRuneLocale = rl;
	/*
	 * NOTE(review): slot 520 of __ctype presumably backs MB_CUR_MAX
	 * (the longest GB18030 sequence is four bytes) -- confirm against
	 * the definition of the __ctype table.
	 */
	__ctype[520] = 4;
	/* Clear the charset_is_ascii flag for this encoding. */
	charset_is_ascii = 0;

	return (0);
}
69
70 static int
71 _GB18030_mbsinit(const mbstate_t *ps)
72 {
73
74 return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
75 }
76
77 static size_t
78 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
79 size_t n, mbstate_t *_RESTRICT_KYWD ps)
80 {
81 _GB18030State *gs;
82 wchar_t wch;
83 int ch, len, ocount;
84 size_t ncopy;
85
86 gs = (_GB18030State *)ps;
87
88 if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
89 errno = EINVAL;
90 return ((size_t)-1);
91 }
92
93 if (s == NULL) {
94 s = "";
95 n = 1;
96 pwc = NULL;
97 }
98
99 ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
100 (void) memcpy(gs->bytes + gs->count, s, ncopy);
101 ocount = gs->count;
102 gs->count += ncopy;
103 s = (char *)gs->bytes;
104 n = gs->count;
105
106 if (n == 0)
107 /* Incomplete multibyte sequence */
108 return ((size_t)-2);
109
110 /*
111 * Single byte: [00-7f]
112 * Two byte: [81-fe][40-7e,80-fe]
113 * Four byte: [81-fe][30-39][81-fe][30-39]
114 */
115 ch = (unsigned char)*s++;
116 if (ch <= 0x7f) {
117 len = 1;
118 wch = ch;
119 } else if (ch >= 0x81 && ch <= 0xfe) {
120 wch = ch;
121 if (n < 2)
122 return ((size_t)-2);
123 ch = (unsigned char)*s++;
124 if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
125 wch = (wch << 8) | ch;
126 len = 2;
127 } else if (ch >= 0x30 && ch <= 0x39) {
128 /*
129 * Strip high bit off the wide character we will
130 * eventually output so that it is positive when
131 * cast to wint_t on 32-bit twos-complement machines.
132 */
133 wch = ((wch & 0x7f) << 8) | ch;
134 if (n < 3)
135 return ((size_t)-2);
136 ch = (unsigned char)*s++;
137 if (ch < 0x81 || ch > 0xfe)
138 goto ilseq;
139 wch = (wch << 8) | ch;
140 if (n < 4)
141 return ((size_t)-2);
142 ch = (unsigned char)*s++;
143 if (ch < 0x30 || ch > 0x39)
144 goto ilseq;
145 wch = (wch << 8) | ch;
146 len = 4;
147 } else
148 goto ilseq;
149 } else
150 goto ilseq;
151
152 if (pwc != NULL)
153 *pwc = wch;
154 gs->count = 0;
155 return (wch == L'\0' ? 0 : len - ocount);
156 ilseq:
157 errno = EILSEQ;
158 return ((size_t)-1);
159 }
160
161 static size_t
162 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
163 mbstate_t *_RESTRICT_KYWD ps)
164 {
165 _GB18030State *gs;
166 size_t len;
167 int c;
168
169 gs = (_GB18030State *)ps;
170
171 if (gs->count != 0) {
172 errno = EINVAL;
173 return ((size_t)-1);
174 }
175
176 if (s == NULL)
177 /* Reset to initial shift state (no-op) */
178 return (1);
179 if ((wc & ~0x7fffffff) != 0)
180 goto ilseq;
181 if (wc & 0x7f000000) {
182 /* Replace high bit that mbrtowc() removed. */
183 wc |= 0x80000000;
184 c = (wc >> 24) & 0xff;
185 if (c < 0x81 || c > 0xfe)
186 goto ilseq;
187 *s++ = c;
188 c = (wc >> 16) & 0xff;
189 if (c < 0x30 || c > 0x39)
190 goto ilseq;
191 *s++ = c;
192 c = (wc >> 8) & 0xff;
193 if (c < 0x81 || c > 0xfe)
194 goto ilseq;
195 *s++ = c;
196 c = wc & 0xff;
197 if (c < 0x30 || c > 0x39)
198 goto ilseq;
199 *s++ = c;
200 len = 4;
201 } else if (wc & 0x00ff0000)
202 goto ilseq;
203 else if (wc & 0x0000ff00) {
204 c = (wc >> 8) & 0xff;
205 if (c < 0x81 || c > 0xfe)
206 goto ilseq;
207 *s++ = c;
208 c = wc & 0xff;
209 if (c < 0x40 || c == 0x7f || c == 0xff)
210 goto ilseq;
211 *s++ = c;
212 len = 2;
213 } else if (wc <= 0x7f) {
214 *s++ = wc;
215 len = 1;
216 } else
217 goto ilseq;
218
219 return (len);
220 ilseq:
221 errno = EILSEQ;
222 return ((size_t)-1);
223 }