1 /* 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. 4 * Copyright (c) 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Paul Borman at Krystal Technologies. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include "lint.h" 36 #include <errno.h> 37 #include <limits.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <wchar.h> 41 #include <sys/types.h> 42 #include <sys/euc.h> 43 #include "runetype.h" 44 #include "mblocal.h" 45 46 static size_t _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD, 47 const char *_RESTRICT_KYWD, 48 size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t); 49 static size_t _EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t, 50 mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t); 51 52 static size_t _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD, 53 const char *_RESTRICT_KYWD, 54 size_t, mbstate_t *_RESTRICT_KYWD); 55 static size_t _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD, 56 const char *_RESTRICT_KYWD, 57 size_t, mbstate_t *_RESTRICT_KYWD); 58 static size_t _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD, 59 const char *_RESTRICT_KYWD, 60 size_t, mbstate_t *_RESTRICT_KYWD); 61 static size_t _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD, 62 const char *_RESTRICT_KYWD, 63 size_t, mbstate_t *_RESTRICT_KYWD); 64 static size_t _EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 65 mbstate_t *_RESTRICT_KYWD); 66 static size_t _EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 67 mbstate_t *_RESTRICT_KYWD); 68 static size_t _EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 69 mbstate_t *_RESTRICT_KYWD); 70 static size_t _EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 71 mbstate_t *_RESTRICT_KYWD); 72 static int _EUC_mbsinit(const mbstate_t *); 73 74 typedef struct { 75 wchar_t ch; 76 int set; 77 int want; 78 } _EucState; 79 80 static int 81 _EUC_mbsinit(const mbstate_t *ps) 82 { 83 84 return (ps == NULL || ((const _EucState *)ps)->want == 0); 85 } 86 87 /* 88 * EUC-CN uses CS0, CS1 and CS2 (4 bytes). 89 */ 90 int 91 _EUC_CN_init(_RuneLocale *rl) 92 { 93 __mbrtowc = _EUC_CN_mbrtowc; 94 __wcrtomb = _EUC_CN_wcrtomb; 95 __mbsinit = _EUC_mbsinit; 96 97 _CurrentRuneLocale = rl; 98 99 __ctype[520] = 4; 100 charset_is_ascii = 0; 101 return (0); 102 } 103 104 static size_t 105 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 106 size_t n, mbstate_t *_RESTRICT_KYWD ps) 107 { 108 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); 109 } 110 111 static size_t 112 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 113 mbstate_t *_RESTRICT_KYWD ps) 114 { 115 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 116 } 117 118 /* 119 * EUC-KR uses only CS0 and CS1. 120 */ 121 int 122 _EUC_KR_init(_RuneLocale *rl) 123 { 124 __mbrtowc = _EUC_KR_mbrtowc; 125 __wcrtomb = _EUC_KR_wcrtomb; 126 __mbsinit = _EUC_mbsinit; 127 128 _CurrentRuneLocale = rl; 129 130 __ctype[520] = 2; 131 charset_is_ascii = 0; 132 return (0); 133 } 134 135 static size_t 136 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 137 size_t n, mbstate_t *_RESTRICT_KYWD ps) 138 { 139 return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0)); 140 } 141 142 static size_t 143 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 144 mbstate_t *_RESTRICT_KYWD ps) 145 { 146 return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0)); 147 } 148 149 /* 150 * EUC-JP uses CS0, CS1, CS2, and CS3. 151 */ 152 int 153 _EUC_JP_init(_RuneLocale *rl) 154 { 155 __mbrtowc = _EUC_JP_mbrtowc; 156 __wcrtomb = _EUC_JP_wcrtomb; 157 __mbsinit = _EUC_mbsinit; 158 159 _CurrentRuneLocale = rl; 160 161 __ctype[520] = 3; 162 charset_is_ascii = 0; 163 return (0); 164 } 165 166 static size_t 167 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 168 size_t n, mbstate_t *_RESTRICT_KYWD ps) 169 { 170 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3)); 171 } 172 173 static size_t 174 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 175 mbstate_t *_RESTRICT_KYWD ps) 176 { 177 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3)); 178 } 179 180 /* 181 * EUC-TW uses CS0, CS1, and CS2. 182 */ 183 int 184 _EUC_TW_init(_RuneLocale *rl) 185 { 186 __mbrtowc = _EUC_TW_mbrtowc; 187 __wcrtomb = _EUC_TW_wcrtomb; 188 __mbsinit = _EUC_mbsinit; 189 190 _CurrentRuneLocale = rl; 191 192 __ctype[520] = 4; 193 charset_is_ascii = 0; 194 return (0); 195 } 196 197 static size_t 198 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 199 size_t n, mbstate_t *_RESTRICT_KYWD ps) 200 { 201 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); 202 } 203 204 static size_t 205 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 206 mbstate_t *_RESTRICT_KYWD ps) 207 { 208 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 209 } 210 211 /* 212 * Common EUC code. 213 */ 214 215 static size_t 216 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 217 size_t n, mbstate_t *_RESTRICT_KYWD ps, 218 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 219 { 220 _EucState *es; 221 int i, want; 222 wchar_t wc; 223 unsigned char ch; 224 225 es = (_EucState *)ps; 226 227 if (es->want < 0 || es->want > MB_CUR_MAX) { 228 errno = EINVAL; 229 return ((size_t)-1); 230 } 231 232 if (s == NULL) { 233 s = ""; 234 n = 1; 235 pwc = NULL; 236 } 237 238 if (n == 0) 239 /* Incomplete multibyte sequence */ 240 return ((size_t)-2); 241 242 if (es->want == 0) { 243 /* Fast path for plain ASCII (CS0) */ 244 if (((ch = (unsigned char)*s) & 0x80) == 0) { 245 if (pwc != NULL) 246 *pwc = ch; 247 return (ch != '\0' ? 1 : 0); 248 } 249 250 if (ch >= 0xa1) { 251 /* CS1 */ 252 want = 2; 253 } else if (ch == cs2) { 254 want = cs2width; 255 } else if (ch == cs3) { 256 want = cs3width; 257 } else { 258 errno = EILSEQ; 259 return ((size_t)-1); 260 } 261 262 263 es->want = want; 264 es->ch = 0; 265 } else { 266 want = es->want; 267 wc = es->ch; 268 } 269 270 for (i = 0; i < MIN(want, n); i++) { 271 wc <<= 8; 272 wc |= *s; 273 s++; 274 } 275 if (i < want) { 276 /* Incomplete multibyte sequence */ 277 es->want = want - i; 278 es->ch = wc; 279 return ((size_t)-2); 280 } 281 if (pwc != NULL) 282 *pwc = wc; 283 es->want = 0; 284 return (wc == L'\0' ? 0 : want); 285 } 286 287 static size_t 288 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc, 289 mbstate_t *_RESTRICT_KYWD ps, 290 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 291 { 292 _EucState *es; 293 int i, len; 294 wchar_t nm; 295 296 es = (_EucState *)ps; 297 298 if (es->want != 0) { 299 errno = EINVAL; 300 return ((size_t)-1); 301 } 302 303 if (s == NULL) 304 /* Reset to initial shift state (no-op) */ 305 return (1); 306 307 if ((wc & ~0x7f) == 0) { 308 /* Fast path for plain ASCII (CS0) */ 309 *s = (char)wc; 310 return (1); 311 } 312 313 /* Determine the "length" */ 314 if ((unsigned)wc > 0xffffff) { 315 len = 4; 316 } else if ((unsigned)wc > 0xffff) { 317 len = 3; 318 } else if ((unsigned)wc > 0xff) { 319 len = 2; 320 } else { 321 len = 1; 322 } 323 324 if (len > MB_CUR_MAX) { 325 errno = EILSEQ; 326 return ((size_t)-1); 327 } 328 329 /* This first check excludes CS1, which is implicitly valid. */ 330 if ((wc < 0xa100) || (wc > 0xffff)) { 331 /* Check for valid CS2 or CS3 */ 332 nm = (wc >> ((len - 1) * 8)); 333 if (nm == cs2) { 334 if (len != cs2width) { 335 errno = EILSEQ; 336 return ((size_t)-1); 337 } 338 } else if (nm == cs3) { 339 if (len != cs3width) { 340 errno = EILSEQ; 341 return ((size_t)-1); 342 } 343 } else { 344 errno = EILSEQ; 345 return ((size_t)-1); 346 } 347 } 348 349 /* Stash the bytes, least significant last */ 350 for (i = len - 1; i >= 0; i--) { 351 s[i] = (wc & 0xff); 352 wc >>= 8; 353 } 354 return (len); 355 }