1 /*
   2  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
   3  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
   4  * Copyright (c) 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Paul Borman at Krystal Technologies.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34 
  35 #include "lint.h"
  36 #include <errno.h>
  37 #include <limits.h>
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <wchar.h>
  41 #include <sys/types.h>
  42 #include <sys/euc.h>
  43 #include "runetype.h"
  44 #include "mblocal.h"
  45 
/*
 * Shared conversion engines.  The four trailing uint8_t arguments are, in
 * order, the CS2 lead byte, the CS2 character width, the CS3 lead byte and
 * the CS3 character width.
 */
static size_t   _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
    const char *_RESTRICT_KYWD,
    size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
static size_t   _EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
    mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);

/* Per-encoding entry points; each forwards to one of the engines above. */
static size_t   _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
                    const char *_RESTRICT_KYWD,
                    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);
static size_t   _EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
                    mbstate_t *_RESTRICT_KYWD);
/* State test shared by all four encodings. */
static int      _EUC_mbsinit(const mbstate_t *);
  73 
/*
 * Conversion state overlaid on the caller's mbstate_t so that a character
 * split across several mbrtowc calls can be resumed.
 */
typedef struct {
        wchar_t ch;     /* bytes of the partial character gathered so far */
        int     set;    /* not referenced by the code in this file */
        int     want;   /* bytes still missing; 0 is the initial state */
} _EucState;
  79 
  80 static int
  81 _EUC_mbsinit(const mbstate_t *ps)
  82 {
  83 
  84         return (ps == NULL || ((const _EucState *)ps)->want == 0);
  85 }
  86 
/*
 * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
 *
 * Install the EUC-CN conversion routines and make rl the current rune
 * locale.  Always returns 0.
 */
int
_EUC_CN_init(_RuneLocale *rl)
{
        __mbrtowc = _EUC_CN_mbrtowc;
        __wcrtomb = _EUC_CN_wcrtomb;
        __mbsinit = _EUC_mbsinit;

        _CurrentRuneLocale = rl;

        /*
         * Longest character is 4 bytes.  NOTE(review): this slot presumably
         * backs MB_CUR_MAX, which the converters check against -- confirm.
         */
        __ctype[520] = 4;
        charset_is_ascii = 0;
        return (0);
}
 103 
/*
 * EUC-CN multibyte-to-wide conversion: the common decoder with SS2 as the
 * lead byte of 4-byte CS2 characters.  CS3 is passed as 0/0, which the
 * decoder can never match because it only compares bytes with the high
 * bit set.
 */
static size_t
_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
    size_t n, mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
}
 110 
/*
 * EUC-CN wide-to-multibyte conversion: the common encoder with SS2 as the
 * lead byte of 4-byte CS2 characters and no CS3 (0/0).
 */
static size_t
_EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
    mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
}
 117 
/*
 * EUC-KR uses only CS0 and CS1.
 *
 * Install the EUC-KR conversion routines and make rl the current rune
 * locale.  Always returns 0.
 */
int
_EUC_KR_init(_RuneLocale *rl)
{
        __mbrtowc = _EUC_KR_mbrtowc;
        __wcrtomb = _EUC_KR_wcrtomb;
        __mbsinit = _EUC_mbsinit;

        _CurrentRuneLocale = rl;

        /*
         * Longest character is 2 bytes.  NOTE(review): this slot presumably
         * backs MB_CUR_MAX, which the converters check against -- confirm.
         */
        __ctype[520] = 2;
        charset_is_ascii = 0;
        return (0);
}
 134 
/*
 * EUC-KR multibyte-to-wide conversion: the common decoder with both CS2
 * and CS3 passed as 0/0, leaving only ASCII and 2-byte CS1 characters.
 */
static size_t
_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
    size_t n, mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0));
}
 141 
/*
 * EUC-KR wide-to-multibyte conversion: the common encoder with both CS2
 * and CS3 passed as 0/0.
 */
static size_t
_EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
    mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
}
 148 
/*
 * EUC-JP uses CS0, CS1, CS2, and CS3.
 *
 * Install the EUC-JP conversion routines and make rl the current rune
 * locale.  Always returns 0.
 */
int
_EUC_JP_init(_RuneLocale *rl)
{
        __mbrtowc = _EUC_JP_mbrtowc;
        __wcrtomb = _EUC_JP_wcrtomb;
        __mbsinit = _EUC_mbsinit;

        _CurrentRuneLocale = rl;

        /*
         * Longest character is 3 bytes (CS3).  NOTE(review): this slot
         * presumably backs MB_CUR_MAX, which the converters check against
         * -- confirm.
         */
        __ctype[520] = 3;
        charset_is_ascii = 0;
        return (0);
}
 165 
/*
 * EUC-JP multibyte-to-wide conversion: the common decoder with SS2 as the
 * lead byte of 2-byte CS2 characters and SS3 as the lead byte of 3-byte
 * CS3 characters.
 */
static size_t
_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
    size_t n, mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
}
 172 
/*
 * EUC-JP wide-to-multibyte conversion: the common encoder with SS2 as the
 * lead byte of 2-byte CS2 characters and SS3 as the lead byte of 3-byte
 * CS3 characters.
 */
static size_t
_EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
    mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
}
 179 
/*
 * EUC-TW uses CS0, CS1, and CS2.
 *
 * Install the EUC-TW conversion routines and make rl the current rune
 * locale.  Always returns 0.
 */
int
_EUC_TW_init(_RuneLocale *rl)
{
        __mbrtowc = _EUC_TW_mbrtowc;
        __wcrtomb = _EUC_TW_wcrtomb;
        __mbsinit = _EUC_mbsinit;

        _CurrentRuneLocale = rl;

        /*
         * Longest character is 4 bytes (CS2).  NOTE(review): this slot
         * presumably backs MB_CUR_MAX, which the converters check against
         * -- confirm.
         */
        __ctype[520] = 4;
        charset_is_ascii = 0;
        return (0);
}
 196 
/*
 * EUC-TW multibyte-to-wide conversion: the common decoder with SS2 as the
 * lead byte of 4-byte CS2 characters and no CS3 (0/0).
 */
static size_t
_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
    size_t n, mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
}
 203 
/*
 * EUC-TW wide-to-multibyte conversion: the common encoder with SS2 as the
 * lead byte of 4-byte CS2 characters and no CS3 (0/0).
 */
static size_t
_EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
    mbstate_t *_RESTRICT_KYWD ps)
{
        return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
}
 210 
 211 /*
 212  * Common EUC code.
 213  */
 214 
 215 static size_t
 216 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
 217     size_t n, mbstate_t *_RESTRICT_KYWD ps,
 218     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
 219 {
 220         _EucState *es;
 221         int i, want;
 222         wchar_t wc;
 223         unsigned char ch;
 224 
 225         es = (_EucState *)ps;
 226 
 227         if (es->want < 0 || es->want > MB_CUR_MAX) {
 228                 errno = EINVAL;
 229                 return ((size_t)-1);
 230         }
 231 
 232         if (s == NULL) {
 233                 s = "";
 234                 n = 1;
 235                 pwc = NULL;
 236         }
 237 
 238         if (n == 0)
 239                 /* Incomplete multibyte sequence */
 240                 return ((size_t)-2);
 241 
 242         if (es->want == 0) {
 243                 /* Fast path for plain ASCII (CS0) */
 244                 if (((ch = (unsigned char)*s) & 0x80) == 0) {
 245                         if (pwc != NULL)
 246                                 *pwc = ch;
 247                         return (ch != '\0' ? 1 : 0);
 248                 }
 249 
 250                 if (ch >= 0xa1) {
 251                         /* CS1 */
 252                         want = 2;
 253                 } else if (ch == cs2) {
 254                         want = cs2width;
 255                 } else if (ch == cs3) {
 256                         want = cs3width;
 257                 } else {
 258                         errno = EILSEQ;
 259                         return ((size_t)-1);
 260                 }
 261 
 262 
 263                 es->want = want;
 264                 es->ch = 0;
 265         } else {
 266                 want = es->want;
 267                 wc = es->ch;
 268         }
 269 
 270         for (i = 0; i < MIN(want, n); i++) {
 271                 wc <<= 8;
 272                 wc |= *s;
 273                 s++;
 274         }
 275         if (i < want) {
 276                 /* Incomplete multibyte sequence */
 277                 es->want = want - i;
 278                 es->ch = wc;
 279                 return ((size_t)-2);
 280         }
 281         if (pwc != NULL)
 282                 *pwc = wc;
 283         es->want = 0;
 284         return (wc == L'\0' ? 0 : want);
 285 }
 286 
 287 static size_t
 288 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
 289     mbstate_t *_RESTRICT_KYWD ps,
 290     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
 291 {
 292         _EucState *es;
 293         int i, len;
 294         wchar_t nm;
 295 
 296         es = (_EucState *)ps;
 297 
 298         if (es->want != 0) {
 299                 errno = EINVAL;
 300                 return ((size_t)-1);
 301         }
 302 
 303         if (s == NULL)
 304                 /* Reset to initial shift state (no-op) */
 305                 return (1);
 306 
 307         if ((wc & ~0x7f) == 0) {
 308                 /* Fast path for plain ASCII (CS0) */
 309                 *s = (char)wc;
 310                 return (1);
 311         }
 312 
 313         /* Determine the "length" */
 314         if ((unsigned)wc > 0xffffff) {
 315                 len = 4;
 316         } else if ((unsigned)wc > 0xffff) {
 317                 len = 3;
 318         } else if ((unsigned)wc > 0xff) {
 319                 len = 2;
 320         } else {
 321                 len = 1;
 322         }
 323 
 324         if (len > MB_CUR_MAX) {
 325                 errno = EILSEQ;
 326                 return ((size_t)-1);
 327         }
 328 
 329         /* This first check excludes CS1, which is implicitly valid. */
 330         if ((wc < 0xa100) || (wc > 0xffff)) {
 331                 /* Check for valid CS2 or CS3 */
 332                 nm = (wc >> ((len - 1) * 8));
 333                 if (nm == cs2) {
 334                         if (len != cs2width) {
 335                                 errno = EILSEQ;
 336                                 return ((size_t)-1);
 337                         }
 338                 } else if (nm == cs3) {
 339                         if (len != cs3width) {
 340                                 errno = EILSEQ;
 341                                 return ((size_t)-1);
 342                         }
 343                 } else {
 344                         errno = EILSEQ;
 345                         return ((size_t)-1);
 346                 }
 347         }
 348 
 349         /* Stash the bytes, least significant last */
 350         for (i = len - 1; i >= 0; i--) {
 351                 s[i] = (wc & 0xff);
 352                 wc >>= 8;
 353         }
 354         return (len);
 355 }