1 /*
   2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
   3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
   4  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
   5  * Copyright (c) 1993
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * Paul Borman at Krystal Technologies.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35 
  36 #include "lint.h"
  37 #include <errno.h>
  38 #include <limits.h>
  39 #include <stdlib.h>
  40 #include <string.h>
  41 #include <wchar.h>
  42 #include <sys/types.h>
  43 #include <sys/euc.h>
  44 #include "mblocal.h"
  45 #include "lctype.h"
  46 
/*
 * Shared conversion engines.  The per-locale wrappers declared below call
 * these, supplying the CS2/CS3 lead bytes and the byte widths of those
 * code sets as the trailing four uint8_t arguments.
 */
static size_t   _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
    const char *_RESTRICT_KYWD,
    size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
static size_t   _EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
    mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);

/* Per-locale multibyte -> wide, single character. */
static size_t   _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);

/* Per-locale wide -> multibyte, single character. */
static size_t   _EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);

/* Per-locale multibyte -> wide, string form. */
static size_t   _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
                    const char **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
                    const char **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
                    const char **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
                    const char **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);

/* Per-locale wide -> multibyte, string form. */
static size_t   _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD,
                    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD,
                    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD,
                    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD,
                    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
                    mbstate_t *_RESTRICT_KYWD);

/* Initial-state test shared by all four EUC locales. */
static int      _EUC_mbsinit(const mbstate_t *);
 102 
/*
 * Conversion state.  The routines in this file obtain it by casting the
 * caller's mbstate_t pointer to _EucState *.
 */
typedef struct {
        wchar_t ch;     /* bytes of a partial character accumulated so far */
        int     set;    /* not referenced by the routines in this file */
        int     want;   /* bytes still needed; 0 means initial state */
} _EucState;
 108 
 109 int
 110 _EUC_mbsinit(const mbstate_t *ps)
 111 {
 112 
 113         return (ps == NULL || ((const _EucState *)ps)->want == 0);
 114 }
 115 
 116 /*
 117  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
 118  */
 119 void
 120 _EUC_CN_init(struct lc_ctype *lct)
 121 {
 122         lct->lc_mbrtowc = _EUC_CN_mbrtowc;
 123         lct->lc_wcrtomb = _EUC_CN_wcrtomb;
 124         lct->lc_mbsnrtowcs = _EUC_CN_mbsnrtowcs;
 125         lct->lc_wcsnrtombs = _EUC_CN_wcsnrtombs;
 126         lct->lc_mbsinit = _EUC_mbsinit;
 127 
 128         lct->lc_max_mblen = 4;
 129         lct->lc_is_ascii = 0;
 130 }
 131 
 132 static size_t
 133 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
 134     size_t n, mbstate_t *_RESTRICT_KYWD ps)
 135 {
 136         return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
 137 }
 138 
 139 static size_t
 140 _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
 141     const char **_RESTRICT_KYWD src,
 142     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 143 {
 144         return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
 145 }
 146 
 147 static size_t
 148 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
 149     mbstate_t *_RESTRICT_KYWD ps)
 150 {
 151         return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
 152 }
 153 
 154 static size_t
 155 _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
 156         size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 157 {
 158         return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
 159 }
 160 
 161 /*
 162  * EUC-KR uses only CS0 and CS1.
 163  */
 164 void
 165 _EUC_KR_init(struct lc_ctype *lct)
 166 {
 167         lct->lc_mbrtowc = _EUC_KR_mbrtowc;
 168         lct->lc_wcrtomb = _EUC_KR_wcrtomb;
 169         lct->lc_mbsnrtowcs = _EUC_KR_mbsnrtowcs;
 170         lct->lc_wcsnrtombs = _EUC_KR_wcsnrtombs;
 171         lct->lc_mbsinit = _EUC_mbsinit;
 172 
 173         lct->lc_max_mblen = 2;
 174         lct->lc_is_ascii = 0;
 175 }
 176 
 177 static size_t
 178 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
 179     size_t n, mbstate_t *_RESTRICT_KYWD ps)
 180 {
 181         return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0));
 182 }
 183 
 184 static size_t
 185 _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
 186     const char **_RESTRICT_KYWD src,
 187     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 188 {
 189         return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
 190 }
 191 
 192 static size_t
 193 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
 194         mbstate_t *_RESTRICT_KYWD ps)
 195 {
 196         return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
 197 }
 198 
 199 static size_t
 200 _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
 201         size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 202 {
 203         return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
 204 }
 205 
 206 /*
 207  * EUC-JP uses CS0, CS1, CS2, and CS3.
 208  */
 209 void
 210 _EUC_JP_init(struct lc_ctype *lct)
 211 {
 212         lct->lc_mbrtowc = _EUC_JP_mbrtowc;
 213         lct->lc_wcrtomb = _EUC_JP_wcrtomb;
 214         lct->lc_mbsnrtowcs = _EUC_JP_mbsnrtowcs;
 215         lct->lc_wcsnrtombs = _EUC_JP_wcsnrtombs;
 216         lct->lc_mbsinit = _EUC_mbsinit;
 217 
 218         lct->lc_max_mblen = 3;
 219         lct->lc_is_ascii = 0;
 220 }
 221 
 222 static size_t
 223 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
 224     size_t n, mbstate_t *_RESTRICT_KYWD ps)
 225 {
 226         return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
 227 }
 228 
 229 static size_t
 230 _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
 231     const char **_RESTRICT_KYWD src,
 232     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 233 {
 234         return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
 235 }
 236 
 237 static size_t
 238 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
 239     mbstate_t *_RESTRICT_KYWD ps)
 240 {
 241         return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
 242 }
 243 
 244 static size_t
 245 _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
 246         size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 247 {
 248         return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
 249 }
 250 
 251 /*
 252  * EUC-TW uses CS0, CS1, and CS2.
 253  */
 254 void
 255 _EUC_TW_init(struct lc_ctype *lct)
 256 {
 257         lct->lc_mbrtowc = _EUC_TW_mbrtowc;
 258         lct->lc_wcrtomb = _EUC_TW_wcrtomb;
 259         lct->lc_mbsnrtowcs = _EUC_TW_mbsnrtowcs;
 260         lct->lc_wcsnrtombs = _EUC_TW_wcsnrtombs;
 261         lct->lc_mbsinit = _EUC_mbsinit;
 262 
 263         lct->lc_max_mblen = 4;
 264         lct->lc_is_ascii = 0;
 265 }
 266 
 267 static size_t
 268 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
 269         size_t n, mbstate_t *_RESTRICT_KYWD ps)
 270 {
 271         return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
 272 }
 273 
 274 static size_t
 275 _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
 276         const char **_RESTRICT_KYWD src,
 277         size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 278 {
 279         return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
 280 }
 281 
 282 static size_t
 283 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
 284         mbstate_t *_RESTRICT_KYWD ps)
 285 {
 286         return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
 287 }
 288 
 289 static size_t
 290 _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
 291         size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
 292 {
 293         return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
 294 }
 295 
 296 /*
 297  * Common EUC code.
 298  */
 299 
 300 static size_t
 301 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
 302         size_t n, mbstate_t *_RESTRICT_KYWD ps,
 303         uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
 304 {
 305         _EucState *es;
 306         int i, want;
 307         wchar_t wc;
 308         unsigned char ch;
 309 
 310         es = (_EucState *)ps;
 311 
 312         if (es->want < 0 || es->want > MB_CUR_MAX) {
 313                 errno = EINVAL;
 314                 return ((size_t)-1);
 315         }
 316 
 317         if (s == NULL) {
 318                 s = "";
 319                 n = 1;
 320                 pwc = NULL;
 321         }
 322 
 323         if (n == 0)
 324                 /* Incomplete multibyte sequence */
 325                 return ((size_t)-2);
 326 
 327         if (es->want == 0) {
 328                 /* Fast path for plain ASCII (CS0) */
 329                 if (((ch = (unsigned char)*s) & 0x80) == 0) {
 330                         if (pwc != NULL)
 331                                 *pwc = ch;
 332                         return (ch != '\0' ? 1 : 0);
 333                 }
 334 
 335                 if (ch >= 0xa1) {
 336                         /* CS1 */
 337                         want = 2;
 338                 } else if (ch == cs2) {
 339                         want = cs2width;
 340                 } else if (ch == cs3) {
 341                         want = cs3width;
 342                 } else {
 343                         errno = EILSEQ;
 344                         return ((size_t)-1);
 345                 }
 346 
 347 
 348                 es->want = want;
 349                 es->ch = 0;
 350         } else {
 351                 want = es->want;
 352                 wc = es->ch;
 353         }
 354 
 355         for (i = 0; i < MIN(want, n); i++) {
 356                 wc <<= 8;
 357                 wc |= *s;
 358                 s++;
 359         }
 360         if (i < want) {
 361                 /* Incomplete multibyte sequence */
 362                 es->want = want - i;
 363                 es->ch = wc;
 364                 return ((size_t)-2);
 365         }
 366         if (pwc != NULL)
 367                 *pwc = wc;
 368         es->want = 0;
 369         return (wc == L'\0' ? 0 : want);
 370 }
 371 
 372 static size_t
 373 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
 374     mbstate_t *_RESTRICT_KYWD ps,
 375     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
 376 {
 377         _EucState *es;
 378         int i, len;
 379         wchar_t nm;
 380 
 381         es = (_EucState *)ps;
 382 
 383         if (es->want != 0) {
 384                 errno = EINVAL;
 385                 return ((size_t)-1);
 386         }
 387 
 388         if (s == NULL)
 389                 /* Reset to initial shift state (no-op) */
 390                 return (1);
 391 
 392         if ((wc & ~0x7f) == 0) {
 393                 /* Fast path for plain ASCII (CS0) */
 394                 *s = (char)wc;
 395                 return (1);
 396         }
 397 
 398         /* Determine the "length" */
 399         if ((unsigned)wc > 0xffffff) {
 400                 len = 4;
 401         } else if ((unsigned)wc > 0xffff) {
 402                 len = 3;
 403         } else if ((unsigned)wc > 0xff) {
 404                 len = 2;
 405         } else {
 406                 len = 1;
 407         }
 408 
 409         if (len > MB_CUR_MAX) {
 410                 errno = EILSEQ;
 411                 return ((size_t)-1);
 412         }
 413 
 414         /* This first check excludes CS1, which is implicitly valid. */
 415         if ((wc < 0xa100) || (wc > 0xffff)) {
 416                 /* Check for valid CS2 or CS3 */
 417                 nm = (wc >> ((len - 1) * 8));
 418                 if (nm == cs2) {
 419                         if (len != cs2width) {
 420                                 errno = EILSEQ;
 421                                 return ((size_t)-1);
 422                         }
 423                 } else if (nm == cs3) {
 424                         if (len != cs3width) {
 425                                 errno = EILSEQ;
 426                                 return ((size_t)-1);
 427                         }
 428                 } else {
 429                         errno = EILSEQ;
 430                         return ((size_t)-1);
 431                 }
 432         }
 433 
 434         /* Stash the bytes, least significant last */
 435         for (i = len - 1; i >= 0; i--) {
 436                 s[i] = (wc & 0xff);
 437                 wc >>= 8;
 438         }
 439         return (len);
 440 }