Print this page
2964 need POSIX 2008 locale object support
Reviewed by: Robert Mustacchi <rm@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libc/port/locale/utf8.c
+++ new/usr/src/lib/libc/port/locale/utf8.c
1 1 /*
2 + * Copyright 2013 Garrett D'Amore <garrett@damore.org>
2 3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
3 4 * Copyright (c) 2002-2004 Tim J. Robbins
4 5 * All rights reserved.
5 6 *
6 7 * Redistribution and use in source and binary forms, with or without
7 8 * modification, are permitted provided that the following conditions
8 9 * are met:
9 10 * 1. Redistributions of source code must retain the above copyright
10 11 * notice, this list of conditions and the following disclaimer.
11 12 * 2. Redistributions in binary form must reproduce the above copyright
12 13 * notice, this list of conditions and the following disclaimer in the
13 14 * documentation and/or other materials provided with the distribution.
14 15 *
15 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
↓ open down ↓ |
9 lines elided |
↑ open up ↑ |
21 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 26 * SUCH DAMAGE.
26 27 */
27 28
28 29 #include "lint.h"
29 30 #include <errno.h>
30 31 #include <limits.h>
31 -#include "runetype.h"
32 32 #include <stdlib.h>
33 33 #include <string.h>
34 34 #include <wchar.h>
35 35 #include "mblocal.h"
36 +#include "lctype.h"
36 37
37 38 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
38 39 const char *_RESTRICT_KYWD,
39 40 size_t, mbstate_t *_RESTRICT_KYWD);
40 41 static int _UTF8_mbsinit(const mbstate_t *);
41 42 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
42 43 const char **_RESTRICT_KYWD, size_t, size_t,
43 44 mbstate_t *_RESTRICT_KYWD);
44 45 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
45 46 mbstate_t *_RESTRICT_KYWD);
46 47 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
47 48 const wchar_t **_RESTRICT_KYWD,
48 49 size_t, size_t, mbstate_t *_RESTRICT_KYWD);
49 50
/*
 * Decoder progress carried between calls.  The converters in this file
 * view the caller's mbstate_t through this layout.
 */
typedef struct {
	wchar_t	ch;	/* bits gathered so far for an unfinished character */
	int	want;	/* bytes still needed to finish it; 0 = no partial */
	wchar_t	lbound;	/* smallest value allowed for the sequence length */
} _UTF8State;
55 56
56 -int
57 -_UTF8_init(_RuneLocale *rl)
57 +void
58 +_UTF8_init(struct lc_ctype *lct)
58 59 {
59 - __mbrtowc = _UTF8_mbrtowc;
60 - __wcrtomb = _UTF8_wcrtomb;
61 - __mbsinit = _UTF8_mbsinit;
62 - __mbsnrtowcs = _UTF8_mbsnrtowcs;
63 - __wcsnrtombs = _UTF8_wcsnrtombs;
64 - _CurrentRuneLocale = rl;
65 -
66 - charset_is_ascii = 0;
67 -
68 - /*
69 - * In theory up to 6 bytes can be used for the encoding,
70 - * but only encodings with more than 4 bytes are illegal.
71 - */
72 - __ctype[520] = 4;
73 - /*
74 - * Note that the other CSWIDTH members are nonsensical for this
75 - * this coding. They only are valid with EUC codings.
76 - */
77 -
78 - return (0);
60 + lct->lc_mbrtowc = _UTF8_mbrtowc;
61 + lct->lc_wcrtomb = _UTF8_wcrtomb;
62 + lct->lc_mbsinit = _UTF8_mbsinit;
63 + lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
64 + lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
65 + lct->lc_is_ascii = 0;
66 + lct->lc_max_mblen = 4;
79 67 }
80 68
81 69 static int
82 70 _UTF8_mbsinit(const mbstate_t *ps)
83 71 {
84 72
85 73 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
86 74 }
87 75
88 76 static size_t
89 77 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
90 78 size_t n, mbstate_t *_RESTRICT_KYWD ps)
91 79 {
92 80 _UTF8State *us;
93 81 int ch, i, mask, want;
94 82 wchar_t lbound, wch;
95 83
96 84 us = (_UTF8State *)ps;
97 85
98 86 if (us->want < 0 || us->want > 6) {
99 87 errno = EINVAL;
100 88 return ((size_t)-1);
101 89 }
102 90
103 91 if (s == NULL) {
104 92 s = "";
105 93 n = 1;
106 94 pwc = NULL;
107 95 }
108 96
109 97 if (n == 0)
110 98 /* Incomplete multibyte sequence */
111 99 return ((size_t)-2);
112 100
113 101 if (us->want == 0) {
114 102 /*
115 103 * Determine the number of octets that make up this character
116 104 * from the first octet, and a mask that extracts the
117 105 * interesting bits of the first octet. We already know
118 106 * the character is at least two bytes long.
119 107 *
120 108 * We also specify a lower bound for the character code to
121 109 * detect redundant, non-"shortest form" encodings. For
122 110 * example, the sequence C0 80 is _not_ a legal representation
123 111 * of the null character. This enforces a 1-to-1 mapping
124 112 * between character codes and their multibyte representations.
125 113 */
126 114 ch = (unsigned char)*s;
127 115 if ((ch & 0x80) == 0) {
128 116 /* Fast path for plain ASCII characters. */
129 117 if (pwc != NULL)
130 118 *pwc = ch;
131 119 return (ch != '\0' ? 1 : 0);
132 120 }
133 121 if ((ch & 0xe0) == 0xc0) {
134 122 mask = 0x1f;
135 123 want = 2;
136 124 lbound = 0x80;
137 125 } else if ((ch & 0xf0) == 0xe0) {
138 126 mask = 0x0f;
139 127 want = 3;
140 128 lbound = 0x800;
141 129 } else if ((ch & 0xf8) == 0xf0) {
142 130 mask = 0x07;
143 131 want = 4;
144 132 lbound = 0x10000;
145 133 #if 0
146 134 /* These would be illegal in the UTF-8 space */
147 135
148 136 } else if ((ch & 0xfc) == 0xf8) {
149 137 mask = 0x03;
150 138 want = 5;
151 139 lbound = 0x200000;
152 140 } else if ((ch & 0xfe) == 0xfc) {
153 141 mask = 0x01;
154 142 want = 6;
155 143 lbound = 0x4000000;
156 144 #endif
157 145 } else {
158 146 /*
159 147 * Malformed input; input is not UTF-8.
160 148 */
161 149 errno = EILSEQ;
162 150 return ((size_t)-1);
163 151 }
164 152 } else {
165 153 want = us->want;
166 154 lbound = us->lbound;
167 155 }
168 156
169 157 /*
170 158 * Decode the octet sequence representing the character in chunks
171 159 * of 6 bits, most significant first.
172 160 */
173 161 if (us->want == 0)
174 162 wch = (unsigned char)*s++ & mask;
175 163 else
176 164 wch = us->ch;
177 165
178 166 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
179 167 if ((*s & 0xc0) != 0x80) {
180 168 /*
181 169 * Malformed input; bad characters in the middle
182 170 * of a character.
183 171 */
184 172 errno = EILSEQ;
185 173 return ((size_t)-1);
186 174 }
187 175 wch <<= 6;
188 176 wch |= *s++ & 0x3f;
189 177 }
190 178 if (i < want) {
191 179 /* Incomplete multibyte sequence. */
192 180 us->want = want - i;
193 181 us->lbound = lbound;
194 182 us->ch = wch;
195 183 return ((size_t)-2);
196 184 }
197 185 if (wch < lbound) {
198 186 /*
199 187 * Malformed input; redundant encoding.
200 188 */
201 189 errno = EILSEQ;
202 190 return ((size_t)-1);
203 191 }
204 192 if (pwc != NULL)
205 193 *pwc = wch;
206 194 us->want = 0;
207 195 return (wch == L'\0' ? 0 : want);
208 196 }
209 197
210 198 static size_t
211 199 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
212 200 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
213 201 {
214 202 _UTF8State *us;
215 203 const char *s;
216 204 size_t nchr;
217 205 wchar_t wc;
218 206 size_t nb;
219 207
220 208 us = (_UTF8State *)ps;
221 209
222 210 s = *src;
223 211 nchr = 0;
224 212
225 213 if (dst == NULL) {
226 214 /*
227 215 * The fast path in the loop below is not safe if an ASCII
228 216 * character appears as anything but the first byte of a
229 217 * multibyte sequence. Check now to avoid doing it in the loop.
230 218 */
231 219 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
232 220 errno = EILSEQ;
233 221 return ((size_t)-1);
234 222 }
235 223 for (;;) {
236 224 if (nms > 0 && (signed char)*s > 0)
237 225 /*
238 226 * Fast path for plain ASCII characters
239 227 * excluding NUL.
240 228 */
241 229 nb = 1;
242 230 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
243 231 (size_t)-1)
244 232 /* Invalid sequence - mbrtowc() sets errno. */
245 233 return ((size_t)-1);
246 234 else if (nb == 0 || nb == (size_t)-2)
247 235 return (nchr);
248 236 s += nb;
249 237 nms -= nb;
250 238 nchr++;
251 239 }
252 240 /*NOTREACHED*/
253 241 }
254 242
255 243 /*
256 244 * The fast path in the loop below is not safe if an ASCII
257 245 * character appears as anything but the first byte of a
258 246 * multibyte sequence. Check now to avoid doing it in the loop.
259 247 */
260 248 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
261 249 errno = EILSEQ;
262 250 return ((size_t)-1);
263 251 }
264 252 while (len-- > 0) {
265 253 if (nms > 0 && (signed char)*s > 0) {
266 254 /*
267 255 * Fast path for plain ASCII characters
268 256 * excluding NUL.
269 257 */
270 258 *dst = (wchar_t)*s;
271 259 nb = 1;
272 260 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
273 261 (size_t)-1) {
274 262 *src = s;
275 263 return ((size_t)-1);
276 264 } else if (nb == (size_t)-2) {
277 265 *src = s + nms;
278 266 return (nchr);
279 267 } else if (nb == 0) {
280 268 *src = NULL;
281 269 return (nchr);
282 270 }
283 271 s += nb;
284 272 nms -= nb;
285 273 nchr++;
286 274 dst++;
287 275 }
288 276 *src = s;
289 277 return (nchr);
290 278 }
291 279
292 280 static size_t
293 281 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
294 282 {
295 283 _UTF8State *us;
296 284 unsigned char lead;
297 285 int i, len;
298 286
299 287 us = (_UTF8State *)ps;
300 288
301 289 if (us->want != 0) {
302 290 errno = EINVAL;
303 291 return ((size_t)-1);
304 292 }
305 293
306 294 if (s == NULL)
307 295 /* Reset to initial shift state (no-op) */
308 296 return (1);
309 297
310 298 /*
311 299 * Determine the number of octets needed to represent this character.
312 300 * We always output the shortest sequence possible. Also specify the
313 301 * first few bits of the first octet, which contains the information
314 302 * about the sequence length.
315 303 */
316 304 if ((wc & ~0x7f) == 0) {
317 305 /* Fast path for plain ASCII characters. */
318 306 *s = (char)wc;
319 307 return (1);
320 308 } else if ((wc & ~0x7ff) == 0) {
321 309 lead = 0xc0;
322 310 len = 2;
323 311 } else if ((wc & ~0xffff) == 0) {
324 312 lead = 0xe0;
325 313 len = 3;
326 314 } else if ((wc & ~0x1fffff) == 0) {
327 315 lead = 0xf0;
328 316 len = 4;
329 317 #if 0
330 318 /* Again, 5 and 6 byte encodings are simply not permitted */
331 319 } else if ((wc & ~0x3ffffff) == 0) {
332 320 lead = 0xf8;
333 321 len = 5;
334 322 } else if ((wc & ~0x7fffffff) == 0) {
335 323 lead = 0xfc;
336 324 len = 6;
337 325 #endif
338 326 } else {
339 327 errno = EILSEQ;
340 328 return ((size_t)-1);
341 329 }
342 330
343 331 /*
344 332 * Output the octets representing the character in chunks
345 333 * of 6 bits, least significant last. The first octet is
346 334 * a special case because it contains the sequence length
347 335 * information.
348 336 */
349 337 for (i = len - 1; i > 0; i--) {
350 338 s[i] = (wc & 0x3f) | 0x80;
351 339 wc >>= 6;
352 340 }
353 341 *s = (wc & 0xff) | lead;
354 342
355 343 return (len);
356 344 }
357 345
358 346 static size_t
359 347 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
360 348 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
361 349 {
362 350 _UTF8State *us;
363 351 char buf[MB_LEN_MAX];
364 352 const wchar_t *s;
365 353 size_t nbytes;
366 354 size_t nb;
367 355
368 356 us = (_UTF8State *)ps;
369 357
370 358 if (us->want != 0) {
371 359 errno = EINVAL;
372 360 return ((size_t)-1);
373 361 }
374 362
375 363 s = *src;
376 364 nbytes = 0;
377 365
378 366 if (dst == NULL) {
379 367 while (nwc-- > 0) {
380 368 if (0 <= *s && *s < 0x80)
381 369 /* Fast path for plain ASCII characters. */
382 370 nb = 1;
383 371 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
384 372 (size_t)-1)
385 373 /* Invalid character - wcrtomb() sets errno. */
386 374 return ((size_t)-1);
387 375 if (*s == L'\0')
388 376 return (nbytes + nb - 1);
389 377 s++;
390 378 nbytes += nb;
391 379 }
392 380 return (nbytes);
393 381 }
394 382
395 383 while (len > 0 && nwc-- > 0) {
396 384 if (0 <= *s && *s < 0x80) {
397 385 /* Fast path for plain ASCII characters. */
398 386 nb = 1;
399 387 *dst = *s;
400 388 } else if (len > (size_t)MB_CUR_MAX) {
401 389 /* Enough space to translate in-place. */
402 390 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
403 391 *src = s;
404 392 return ((size_t)-1);
405 393 }
406 394 } else {
407 395 /*
408 396 * May not be enough space; use temp. buffer.
409 397 */
410 398 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
411 399 *src = s;
412 400 return ((size_t)-1);
413 401 }
414 402 if (nb > (int)len)
415 403 /* MB sequence for character won't fit. */
416 404 break;
417 405 (void) memcpy(dst, buf, nb);
418 406 }
419 407 if (*s == L'\0') {
420 408 *src = NULL;
421 409 return (nbytes + nb - 1);
422 410 }
423 411 s++;
424 412 dst += nb;
425 413 len -= nb;
426 414 nbytes += nb;
427 415 }
428 416 *src = s;
429 417 return (nbytes);
430 418 }
↓ open down ↓ |
342 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX