1 /*
2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
3 * Copyright (c) 2002-2004 Tim J. Robbins
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include "lint.h"
29 #include <errno.h>
30 #include <limits.h>
31 #include "runetype.h"
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "mblocal.h"
36
37 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
38 const char *_RESTRICT_KYWD,
39 size_t, mbstate_t *_RESTRICT_KYWD);
40 static int _UTF8_mbsinit(const mbstate_t *);
41 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
42 const char **_RESTRICT_KYWD, size_t, size_t,
43 mbstate_t *_RESTRICT_KYWD);
44 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
45 mbstate_t *_RESTRICT_KYWD);
46 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
47 const wchar_t **_RESTRICT_KYWD,
48 size_t, size_t, mbstate_t *_RESTRICT_KYWD);
49
/*
 * Conversion state for the UTF-8 routines.  The functions in this file
 * access it by casting the caller's mbstate_t pointer to _UTF8State *.
 * A "want" of zero means no multibyte sequence is in progress.
 */
typedef struct {
	wchar_t	ch;	/* bits of the partially decoded character so far */
	int	want;	/* number of bytes still needed to complete it */
	wchar_t	lbound;	/* smallest value legal for this sequence length */
} _UTF8State;
55
/*
 * Install the UTF-8 conversion routines as the active multibyte handlers
 * and make rl the current rune locale.  Always returns 0.
 */
int
_UTF8_init(_RuneLocale *rl)
{
	__mbrtowc = _UTF8_mbrtowc;
	__wcrtomb = _UTF8_wcrtomb;
	__mbsinit = _UTF8_mbsinit;
	__mbsnrtowcs = _UTF8_mbsnrtowcs;
	__wcsnrtombs = _UTF8_wcsnrtombs;
	_CurrentRuneLocale = rl;

	charset_is_ascii = 0;

	/*
	 * The original UTF-8 scheme allowed up to 6 bytes per character,
	 * but encodings longer than 4 bytes are illegal, so the maximum
	 * length recorded here is 4.
	 * NOTE(review): slot 520 of __ctype is presumably the one that
	 * MB_CUR_MAX reads -- confirm against the ctype headers.
	 */
	__ctype[520] = 4;
	/*
	 * Note that the other CSWIDTH members are nonsensical for this
	 * coding.  They are only valid with EUC codings.
	 */

	return (0);
}
80
81 static int
82 _UTF8_mbsinit(const mbstate_t *ps)
83 {
84
85 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
86 }
87
88 static size_t
89 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
90 size_t n, mbstate_t *_RESTRICT_KYWD ps)
91 {
92 _UTF8State *us;
93 int ch, i, mask, want;
94 wchar_t lbound, wch;
95
96 us = (_UTF8State *)ps;
97
98 if (us->want < 0 || us->want > 6) {
99 errno = EINVAL;
100 return ((size_t)-1);
101 }
102
103 if (s == NULL) {
104 s = "";
105 n = 1;
106 pwc = NULL;
107 }
108
109 if (n == 0)
110 /* Incomplete multibyte sequence */
111 return ((size_t)-2);
112
113 if (us->want == 0) {
114 /*
115 * Determine the number of octets that make up this character
116 * from the first octet, and a mask that extracts the
117 * interesting bits of the first octet. We already know
118 * the character is at least two bytes long.
119 *
120 * We also specify a lower bound for the character code to
121 * detect redundant, non-"shortest form" encodings. For
122 * example, the sequence C0 80 is _not_ a legal representation
123 * of the null character. This enforces a 1-to-1 mapping
124 * between character codes and their multibyte representations.
125 */
126 ch = (unsigned char)*s;
127 if ((ch & 0x80) == 0) {
128 /* Fast path for plain ASCII characters. */
129 if (pwc != NULL)
130 *pwc = ch;
131 return (ch != '\0' ? 1 : 0);
132 }
133 if ((ch & 0xe0) == 0xc0) {
134 mask = 0x1f;
135 want = 2;
136 lbound = 0x80;
137 } else if ((ch & 0xf0) == 0xe0) {
138 mask = 0x0f;
139 want = 3;
140 lbound = 0x800;
141 } else if ((ch & 0xf8) == 0xf0) {
142 mask = 0x07;
143 want = 4;
144 lbound = 0x10000;
145 #if 0
146 /* These would be illegal in the UTF-8 space */
147
148 } else if ((ch & 0xfc) == 0xf8) {
149 mask = 0x03;
150 want = 5;
151 lbound = 0x200000;
152 } else if ((ch & 0xfe) == 0xfc) {
153 mask = 0x01;
154 want = 6;
155 lbound = 0x4000000;
156 #endif
157 } else {
158 /*
159 * Malformed input; input is not UTF-8.
160 */
161 errno = EILSEQ;
162 return ((size_t)-1);
163 }
164 } else {
165 want = us->want;
166 lbound = us->lbound;
167 }
168
169 /*
170 * Decode the octet sequence representing the character in chunks
171 * of 6 bits, most significant first.
172 */
173 if (us->want == 0)
174 wch = (unsigned char)*s++ & mask;
175 else
176 wch = us->ch;
177
178 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
179 if ((*s & 0xc0) != 0x80) {
180 /*
181 * Malformed input; bad characters in the middle
182 * of a character.
183 */
184 errno = EILSEQ;
185 return ((size_t)-1);
186 }
187 wch <<= 6;
188 wch |= *s++ & 0x3f;
189 }
190 if (i < want) {
191 /* Incomplete multibyte sequence. */
192 us->want = want - i;
193 us->lbound = lbound;
194 us->ch = wch;
195 return ((size_t)-2);
196 }
197 if (wch < lbound) {
198 /*
199 * Malformed input; redundant encoding.
200 */
201 errno = EILSEQ;
202 return ((size_t)-1);
203 }
204 if (pwc != NULL)
205 *pwc = wch;
206 us->want = 0;
207 return (wch == L'\0' ? 0 : want);
208 }
209
210 static size_t
211 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
212 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
213 {
214 _UTF8State *us;
215 const char *s;
216 size_t nchr;
217 wchar_t wc;
218 size_t nb;
219
220 us = (_UTF8State *)ps;
221
222 s = *src;
223 nchr = 0;
224
225 if (dst == NULL) {
226 /*
227 * The fast path in the loop below is not safe if an ASCII
228 * character appears as anything but the first byte of a
229 * multibyte sequence. Check now to avoid doing it in the loop.
230 */
231 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
232 errno = EILSEQ;
233 return ((size_t)-1);
234 }
235 for (;;) {
236 if (nms > 0 && (signed char)*s > 0)
237 /*
238 * Fast path for plain ASCII characters
239 * excluding NUL.
240 */
241 nb = 1;
242 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
243 (size_t)-1)
244 /* Invalid sequence - mbrtowc() sets errno. */
245 return ((size_t)-1);
246 else if (nb == 0 || nb == (size_t)-2)
247 return (nchr);
248 s += nb;
249 nms -= nb;
250 nchr++;
251 }
252 /*NOTREACHED*/
253 }
254
255 /*
256 * The fast path in the loop below is not safe if an ASCII
257 * character appears as anything but the first byte of a
258 * multibyte sequence. Check now to avoid doing it in the loop.
259 */
260 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
261 errno = EILSEQ;
262 return ((size_t)-1);
263 }
264 while (len-- > 0) {
265 if (nms > 0 && (signed char)*s > 0) {
266 /*
267 * Fast path for plain ASCII characters
268 * excluding NUL.
269 */
270 *dst = (wchar_t)*s;
271 nb = 1;
272 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
273 (size_t)-1) {
274 *src = s;
275 return ((size_t)-1);
276 } else if (nb == (size_t)-2) {
277 *src = s + nms;
278 return (nchr);
279 } else if (nb == 0) {
280 *src = NULL;
281 return (nchr);
282 }
283 s += nb;
284 nms -= nb;
285 nchr++;
286 dst++;
287 }
288 *src = s;
289 return (nchr);
290 }
291
292 static size_t
293 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
294 {
295 _UTF8State *us;
296 unsigned char lead;
297 int i, len;
298
299 us = (_UTF8State *)ps;
300
301 if (us->want != 0) {
302 errno = EINVAL;
303 return ((size_t)-1);
304 }
305
306 if (s == NULL)
307 /* Reset to initial shift state (no-op) */
308 return (1);
309
310 /*
311 * Determine the number of octets needed to represent this character.
312 * We always output the shortest sequence possible. Also specify the
313 * first few bits of the first octet, which contains the information
314 * about the sequence length.
315 */
316 if ((wc & ~0x7f) == 0) {
317 /* Fast path for plain ASCII characters. */
318 *s = (char)wc;
319 return (1);
320 } else if ((wc & ~0x7ff) == 0) {
321 lead = 0xc0;
322 len = 2;
323 } else if ((wc & ~0xffff) == 0) {
324 lead = 0xe0;
325 len = 3;
326 } else if ((wc & ~0x1fffff) == 0) {
327 lead = 0xf0;
328 len = 4;
329 #if 0
330 /* Again, 5 and 6 byte encodings are simply not permitted */
331 } else if ((wc & ~0x3ffffff) == 0) {
332 lead = 0xf8;
333 len = 5;
334 } else if ((wc & ~0x7fffffff) == 0) {
335 lead = 0xfc;
336 len = 6;
337 #endif
338 } else {
339 errno = EILSEQ;
340 return ((size_t)-1);
341 }
342
343 /*
344 * Output the octets representing the character in chunks
345 * of 6 bits, least significant last. The first octet is
346 * a special case because it contains the sequence length
347 * information.
348 */
349 for (i = len - 1; i > 0; i--) {
350 s[i] = (wc & 0x3f) | 0x80;
351 wc >>= 6;
352 }
353 *s = (wc & 0xff) | lead;
354
355 return (len);
356 }
357
358 static size_t
359 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
360 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
361 {
362 _UTF8State *us;
363 char buf[MB_LEN_MAX];
364 const wchar_t *s;
365 size_t nbytes;
366 size_t nb;
367
368 us = (_UTF8State *)ps;
369
370 if (us->want != 0) {
371 errno = EINVAL;
372 return ((size_t)-1);
373 }
374
375 s = *src;
376 nbytes = 0;
377
378 if (dst == NULL) {
379 while (nwc-- > 0) {
380 if (0 <= *s && *s < 0x80)
381 /* Fast path for plain ASCII characters. */
382 nb = 1;
383 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
384 (size_t)-1)
385 /* Invalid character - wcrtomb() sets errno. */
386 return ((size_t)-1);
387 if (*s == L'\0')
388 return (nbytes + nb - 1);
389 s++;
390 nbytes += nb;
391 }
392 return (nbytes);
393 }
394
395 while (len > 0 && nwc-- > 0) {
396 if (0 <= *s && *s < 0x80) {
397 /* Fast path for plain ASCII characters. */
398 nb = 1;
399 *dst = *s;
400 } else if (len > (size_t)MB_CUR_MAX) {
401 /* Enough space to translate in-place. */
402 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
403 *src = s;
404 return ((size_t)-1);
405 }
406 } else {
407 /*
408 * May not be enough space; use temp. buffer.
409 */
410 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
411 *src = s;
412 return ((size_t)-1);
413 }
414 if (nb > (int)len)
415 /* MB sequence for character won't fit. */
416 break;
417 (void) memcpy(dst, buf, nb);
418 }
419 if (*s == L'\0') {
420 *src = NULL;
421 return (nbytes + nb - 1);
422 }
423 s++;
424 dst += nb;
425 len -= nb;
426 nbytes += nb;
427 }
428 *src = s;
429 return (nbytes);
430 }