1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/sysmacros.h> 31 #include <sys/systm.h> 32 #include <sys/debug.h> 33 #include <sys/kmem.h> 34 #include <sys/sunddi.h> 35 #include <sys/byteorder.h> 36 #include <sys/errno.h> 37 #include <sys/modctl.h> 38 #include <sys/kiconv.h> 39 #include <sys/u8_textprep.h> 40 #include <sys/kiconv_cck_common.h> 41 #include <sys/kiconv_sc.h> 42 #include <sys/kiconv_gb18030_utf8.h> 43 #include <sys/kiconv_gb2312_utf8.h> 44 #include <sys/kiconv_utf8_gb18030.h> 45 #include <sys/kiconv_utf8_gb2312.h> 46 47 static int8_t gb2312_to_utf8(uchar_t byte1, uchar_t byte2, uchar_t *ob, 48 uchar_t *obtail, size_t *ret_val); 49 static int8_t gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail, 50 size_t *ret_val, boolean_t isgbk4); 51 static int8_t utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 52 uchar_t *ob, uchar_t *obtail, size_t *ret); 53 static int8_t utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t 
*ibtail, 54 uchar_t *ob, uchar_t *obtail, size_t *ret); 55 static int8_t utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 56 uchar_t *ob, uchar_t *obtail, size_t *ret); 57 58 #define KICONV_SC_GB18030 (0x01) 59 #define KICONV_SC_GBK (0x02) 60 #define KICONV_SC_EUCCN (0x03) 61 #define KICONV_SC_MAX_MAGIC_ID (0x03) 62 63 static void * 64 open_fr_gb18030() 65 { 66 return ((void *)KICONV_SC_GB18030); 67 } 68 69 static void * 70 open_fr_gbk() 71 { 72 return ((void *)KICONV_SC_GBK); 73 } 74 75 static void * 76 open_fr_euccn() 77 { 78 return ((void *)KICONV_SC_EUCCN); 79 } 80 81 static int 82 close_fr_sc(void *s) 83 { 84 if ((uintptr_t)s > KICONV_SC_MAX_MAGIC_ID) 85 return (EBADF); 86 87 return (0); 88 } 89 90 /* 91 * Encoding convertor from UTF-8 to GB18030. 92 */ 93 size_t 94 kiconv_to_gb18030(void *kcd, char **inbuf, size_t *inbytesleft, 95 char **outbuf, size_t *outbytesleft, int *errno) 96 { 97 98 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 99 outbytesleft, errno, utf8_to_gb18030); 100 } 101 102 /* 103 * String based encoding convertor from UTF-8 to GB18030. 104 */ 105 size_t 106 kiconvstr_to_gb18030(char *inarray, size_t *inlen, char *outarray, 107 size_t *outlen, int flag, int *errno) 108 { 109 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 110 (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb18030); 111 } 112 113 /* 114 * Encoding convertor from GB18030 to UTF-8. 115 */ 116 size_t 117 kiconv_fr_gb18030(void *kcd, char **inbuf, size_t *inbytesleft, 118 char **outbuf, size_t *outbytesleft, int *errno) 119 { 120 uchar_t *ib; 121 uchar_t *ob; 122 uchar_t *ibtail; 123 uchar_t *obtail; 124 size_t ret_val; 125 int8_t sz; 126 uint32_t gb_val; 127 boolean_t isgbk4; 128 129 /* Check on the kiconv code conversion descriptor. */ 130 if (kcd == NULL || kcd == (void *)-1) { 131 *errno = EBADF; 132 return ((size_t)-1); 133 } 134 135 /* If this is a state reset request, process and return. 
*/ 136 if (inbuf == NULL || *inbuf == NULL) { 137 return (0); 138 } 139 140 ret_val = 0; 141 ib = (uchar_t *)*inbuf; 142 ob = (uchar_t *)*outbuf; 143 ibtail = ib + *inbytesleft; 144 obtail = ob + *outbytesleft; 145 146 while (ib < ibtail) { 147 if (KICONV_IS_ASCII(*ib)) { 148 if (ob >= obtail) { 149 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 150 } 151 152 *ob++ = *ib++; 153 continue; 154 } 155 156 /* 157 * Issue EILSEQ error if the first byte is not a 158 * valid GB18030 leading byte. 159 */ 160 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) { 161 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 162 } 163 164 isgbk4 = (ibtail - ib < 2) ? B_FALSE : 165 KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)); 166 167 if (isgbk4) { 168 if (ibtail - ib < 4) { 169 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 170 } 171 172 if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) && 173 KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) && 174 KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) { 175 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 176 } 177 178 gb_val = (uint32_t)(*ib) << 24 | 179 (uint32_t)(*(ib + 1)) << 16 | 180 (uint32_t)(*(ib + 2)) << 8 | *(ib + 3); 181 } else { 182 if (ibtail - ib < 2) { 183 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 184 } 185 186 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) { 187 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 188 } 189 190 gb_val = (uint32_t)(*ib) << 8 | *(ib + 1); 191 } 192 193 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4); 194 if (sz < 0) { 195 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 196 } 197 198 ib += isgbk4 ? 4 : 2; 199 ob += sz; 200 } 201 202 *inbuf = (char *)ib; 203 *inbytesleft = ibtail - ib; 204 *outbuf = (char *)ob; 205 *outbytesleft = obtail - ob; 206 207 return (ret_val); 208 } 209 210 /* 211 * String based encoding convertor from GB18030 to UTF-8. 
 *
 * Converts up to *inlen bytes of inarray into UTF-8 in outarray, which
 * has room for *outlen bytes.  On return *inlen and *outlen hold the
 * number of unconsumed input bytes and unused output bytes.
 *
 * Unless KICONV_IGNORE_NULL is set in flag, conversion stops at the
 * first '\0' byte.  When KICONV_REPLACE_INVALID is set, a truncated
 * trailing character is replaced by the three replacement bytes and
 * counted in the return value instead of being reported as EINVAL.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK() and
 * KICONV_SET_ERRNO_WITH_FLAG() are defined elsewhere.  From their use
 * here they presumably refer to the locals ib, flag, errno and ret_val
 * by name, and the latter presumably advances ib by its first argument
 * and jumps to REPLACE_INVALID when KICONV_REPLACE_INVALID is set -
 * confirm against the macro definitions.
 */
size_t
kiconvstr_fr_gb18030(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;
	boolean_t	isgbk4;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL byte ends the conversion unless told otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember the character start for the E2BIG rollback. */
		oldib = ib;

		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* The second byte selects the two or four byte form. */
		isgbk4 = (ibtail - ib < 2) ? B_FALSE :
		    KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1));

		if (isgbk4) {
			if (ibtail - ib < 4) {
				/* Truncated: swallow the rest of the input. */
				if (flag & KICONV_REPLACE_INVALID) {
					ib = ibtail;
					goto REPLACE_INVALID;
				}

				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) &&
			    KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) &&
			    KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) {
				KICONV_SET_ERRNO_WITH_FLAG(4, EILSEQ);
			}

			/* Pack the four bytes, first byte most significant. */
			gb_val = (uint32_t)(*ib) << 24 |
			    (uint32_t)(*(ib + 1)) << 16 |
			    (uint32_t)(*(ib + 2)) << 8 | *(ib + 3);
		} else {
			if (ibtail - ib < 2) {
				/* Truncated: swallow the rest of the input. */
				if (flag & KICONV_REPLACE_INVALID) {
					ib = ibtail;
					goto REPLACE_INVALID;
				}

				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
				KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
			}

			gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
		}

		/* A negative size means the output buffer is too small. */
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += isgbk4 ? 4 : 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Without room for the replacement bytes, rewind the input
		 * to the start of the offending character and fail.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}

/*
 * Encoding convertor from UTF-8 to GBK.
 *
 * Thin wrapper: hands all arguments to kiconv_utf8_to_cck() together
 * with utf8_to_gbk() as the per-character conversion routine.
 */
size_t
kiconv_to_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno)
{

	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
	    outbytesleft, errno, utf8_to_gbk);
}

/*
 * String based encoding convertor from UTF-8 to GBK.
 *
 * Thin wrapper: hands all arguments to kiconvstr_utf8_to_cck() together
 * with utf8_to_gbk() as the per-character conversion routine.
 */
size_t
kiconvstr_to_gbk(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gbk);
}

/*
 * Encoding convertor from GBK to UTF-8.
 *
 * Converts the *inbytesleft bytes at *inbuf into UTF-8 at *outbuf, which
 * has room for *outbytesleft bytes, and updates all four on return.
 * Returns (size_t)-1 with *errno set to EBADF for a NULL or (void *)-1
 * descriptor and 0 for a state reset request; otherwise returns ret_val,
 * which gbk_to_utf8() increments for every non-identical conversion.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK() is defined elsewhere and
 * presumably uses the locals errno and ret_val by name - confirm.
 */
size_t
kiconv_fr_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;

	/* Check on the kiconv code conversion descriptor. */
	if (kcd == NULL || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (inbuf == NULL || *inbuf == NULL) {
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	while (ib < ibtail) {
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is not a
		 * valid GBK leading byte.
		 */
		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/*
		 * Issue EINVAL error if input buffer has an incomplete
		 * character at the end of the buffer.
		 */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		/*
		 * Issue EILSEQ error if the remaining byte is not
		 * a valid GBK byte.
		 */
		if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/* Now we have a valid GBK character. */
		gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE);

		/* A negative size means the output buffer is too small. */
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
	}

	/* Report back how far the conversion got. */
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}

/*
 * String based encoding convertor from GBK to UTF-8.
431 */ 432 size_t 433 kiconvstr_fr_gbk(char *inarray, size_t *inlen, char *outarray, 434 size_t *outlen, int flag, int *errno) 435 { 436 uchar_t *ib; 437 uchar_t *ob; 438 uchar_t *ibtail; 439 uchar_t *obtail; 440 uchar_t *oldib; 441 size_t ret_val; 442 int8_t sz; 443 uint32_t gb_val; 444 boolean_t do_not_ignore_null; 445 446 ret_val = 0; 447 ib = (uchar_t *)inarray; 448 ob = (uchar_t *)outarray; 449 ibtail = ib + *inlen; 450 obtail = ob + *outlen; 451 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 452 453 while (ib < ibtail) { 454 if (*ib == '\0' && do_not_ignore_null) 455 break; 456 457 if (KICONV_IS_ASCII(*ib)) { 458 if (ob >= obtail) { 459 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 460 } 461 462 *ob++ = *ib++; 463 continue; 464 } 465 466 oldib = ib; 467 468 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) { 469 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 470 } 471 472 if (ibtail - ib < 2) { 473 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 474 } 475 476 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) { 477 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 478 } 479 480 gb_val = (uint32_t)(*ib << 8) | *(ib + 1); 481 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE); 482 483 if (sz < 0) { 484 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 485 } 486 487 ib += 2; 488 ob += sz; 489 continue; 490 491 REPLACE_INVALID: 492 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 493 ib = oldib; 494 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 495 } 496 497 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 498 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 499 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 500 ret_val++; 501 } 502 503 *inlen = ibtail - ib; 504 *outlen = obtail - ob; 505 506 return (ret_val); 507 } 508 509 /* 510 * Encoding convertor from UTF-8 to EUC-CN. 
 *
 * Thin wrapper: hands all arguments to kiconv_utf8_to_cck() together
 * with utf8_to_gb2312() as the per-character conversion routine.
 */
size_t
kiconv_to_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno)
{
	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
	    outbytesleft, errno, utf8_to_gb2312);
}

/*
 * String based encoding convertor from UTF-8 to EUC-CN.
 *
 * Thin wrapper: hands all arguments to kiconvstr_utf8_to_cck() together
 * with utf8_to_gb2312() as the per-character conversion routine.
 */
size_t
kiconvstr_to_euccn(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb2312);
}

/*
 * Encoding convertor from EUC-CN to UTF-8.
 *
 * Converts the *inbytesleft bytes at *inbuf into UTF-8 at *outbuf, which
 * has room for *outbytesleft bytes, and updates all four on return.
 * Returns (size_t)-1 with *errno set to EBADF for a NULL or (void *)-1
 * descriptor and 0 for a state reset request; otherwise returns ret_val,
 * which gb2312_to_utf8() increments for every replacement character.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK() is defined elsewhere and
 * presumably uses the locals errno and ret_val by name - confirm.
 */
size_t
kiconv_fr_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	size_t		ret_val;
	int8_t		sz;

	/* Check on the kiconv code conversion descriptor. */
	if (kcd == NULL || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (inbuf == NULL || *inbuf == NULL) {
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	while (ib < ibtail) {
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is not a
		 * valid GB2312 leading byte.
		 */
		if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/*
		 * Issue EINVAL error if input buffer has an incomplete
		 * character at the end of the buffer.
		 */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		/*
		 * Issue EILSEQ error if the remaining byte is not
		 * a valid GB2312 byte.
		 */
		if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/* Now we have a valid GB2312 character */
		sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
	}

	/* Report back how far the conversion got. */
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}

/*
 * String based encoding convertor from EUC-CN to UTF-8.
 *
 * Converts up to *inlen bytes of inarray into UTF-8 in outarray, which
 * has room for *outlen bytes.  On return *inlen and *outlen hold the
 * number of unconsumed input bytes and unused output bytes.  Unless
 * KICONV_IGNORE_NULL is set in flag, conversion stops at the first
 * '\0' byte.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK() and
 * KICONV_SET_ERRNO_WITH_FLAG() are defined elsewhere.  From their use
 * here they presumably refer to the locals ib, flag, errno and ret_val
 * by name, and the latter presumably advances ib by its first argument
 * and jumps to REPLACE_INVALID when KICONV_REPLACE_INVALID is set -
 * confirm against the macro definitions.
 */
size_t
kiconvstr_fr_euccn(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL byte ends the conversion unless told otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember the character start for the E2BIG rollback. */
		oldib = ib;

		if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* Only the leading byte of a two byte character is left. */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
		}

		if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
		}

		/* A negative size means the output buffer is too small. */
		sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Without room for the replacement bytes, rewind the input
		 * to the start of the offending character and fail.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}

/*
 * Convert single GB2312 character to UTF-8.
 * Return: > 0  - Converted successfully
 *         = -1 - E2BIG
 *
 * b1 and b2 are the two bytes of the character.  The UTF-8 bytes are
 * written at ob, which may be filled up to but not including obtail.
 * On E2BIG *ret_val is set to (size_t)-1; otherwise it is incremented
 * when the bytes written are the replacement character.
 */
static int8_t
gb2312_to_utf8(uchar_t b1, uchar_t b2, uchar_t *ob, uchar_t *obtail,
    size_t *ret_val)
{
	size_t	index;
	int8_t	sz;
	uchar_t	*u8;

	/*
	 * index = (b1 - KICONV_EUC_START) * 94 + b2 - KICONV_EUC_START;
	 * The constant 0x3BBF equals 0xA1 * 94 + 0xA1, so the folded form
	 * below matches the formula above for a start value of 0xA1.
	 */
	index = b1 * 94 + b2 - 0x3BBF;

	/* Anything past the table is sent to its last entry. */
	if (index >= KICONV_GB2312_UTF8_MAX)
		index = KICONV_GB2312_UTF8_MAX - 1;	/* Map to 0xEFBFBD */

	u8 = kiconv_gb2312_utf8[index];
	sz = u8_number_of_bytes[u8[0]];

	if (obtail - ob < sz) {
		*ret_val = (size_t)-1;
		return (-1);
	}

	for (index = 0; index < sz; index++)
		*ob++ = u8[index];

	/*
	 * As kiconv_gb2312_utf8 contains multiple
	 * KICONV_UTF8_REPLACEMENT_CHAR elements, the index alone cannot
	 * tell whether the conversion was identical, so check the bytes.
	 */
	if (sz == KICONV_UTF8_REPLACEMENT_CHAR_LEN &&
	    u8[0] == KICONV_UTF8_REPLACEMENT_CHAR1 &&
	    u8[1] == KICONV_UTF8_REPLACEMENT_CHAR2 &&
	    u8[2] == KICONV_UTF8_REPLACEMENT_CHAR3)
		(*ret_val)++;

	return (sz);
}

/*
 * Convert single GB18030 or GBK character to UTF-8.
736 * Return: > 0 - Converted successfully 737 * = -1 - E2BIG 738 */ 739 static int8_t 740 gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val, 741 boolean_t isgbk4) 742 { 743 size_t index; 744 int8_t sz; 745 uchar_t u8array[4]; 746 uchar_t *u8; 747 748 if (isgbk4) { 749 if (gbk_val >= KICONV_SC_PLANE1_GB18030_START) { 750 uint32_t u32; 751 752 /* 753 * u32 = ((gbk_val >> 24) - 0x90) * 12600 + 754 * (((gbk_val & 0xFF0000) >> 16) - 0x30) * 1260 + 755 * (((gbk_val & 0xFF00) >> 8) - 0x81) * 10 + 756 * (gbk_val & 0xFF - 0x30)+ 757 * KICONV_SC_PLANE1_UCS4_START; 758 */ 759 u32 = (gbk_val >> 24) * 12600 + 760 ((gbk_val & 0xFF0000) >> 16) * 1260 + 761 ((gbk_val & 0xFF00) >> 8) * 10 + 762 (gbk_val & 0xFF) - 0x1BA0FA; 763 u8array[0] = (uchar_t)(0xF0 | ((u32 & 0x1C0000) >> 18)); 764 u8array[1] = (uchar_t)(0x80 | ((u32 & 0x03F000) >> 12)); 765 u8array[2] = (uchar_t)(0x80 | ((u32 & 0x000FC0) >> 6)); 766 u8array[3] = (uchar_t)(0x80 | (u32 & 0x00003F)); 767 u8 = u8array; 768 index = 1; 769 } else { 770 index = kiconv_binsearch(gbk_val, 771 kiconv_gbk4_utf8, KICONV_GBK4_UTF8_MAX); 772 u8 = kiconv_gbk4_utf8[index].u8; 773 } 774 } else { 775 index = kiconv_binsearch(gbk_val, 776 kiconv_gbk_utf8, KICONV_GBK_UTF8_MAX); 777 u8 = kiconv_gbk_utf8[index].u8; 778 } 779 780 sz = u8_number_of_bytes[u8[0]]; 781 if (obtail - ob < sz) { 782 *ret_val = (size_t)-1; 783 return (-1); 784 } 785 786 if (index == 0) 787 (*ret_val)++; /* Non-identical conversion */ 788 789 for (index = 0; index < sz; index++) 790 *ob++ = u8[index]; 791 792 return (sz); 793 } 794 795 /* 796 * Convert single UTF-8 character to GB18030. 
797 * Return: > 0 - Converted successfully 798 * = -1 - E2BIG 799 */ 800 /* ARGSUSED */ 801 static int8_t 802 utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 803 uchar_t *ob, uchar_t *obtail, size_t *ret) 804 { 805 size_t index; 806 int8_t gbklen; 807 uint32_t gbkcode; 808 809 if (utf8 >= KICONV_SC_PLANE1_UTF8_START) { 810 /* Four bytes GB18030 [0x90308130, 0xe339fe39] handling. */ 811 uint32_t u32; 812 813 u32 = (((utf8 & 0x07000000) >> 6) | ((utf8 & 0x3F0000) >> 4) | 814 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) - 815 KICONV_SC_PLANE1_UCS4_START; 816 gbkcode = ((u32 / 12600 + 0x90) << 24) | 817 (((u32 % 12600) / 1260 + 0x30) << 16) | 818 (((u32 % 1260) / 10 + 0x81) << 8) | (u32 % 10 + 0x30); 819 gbklen = 4; 820 index = 1; 821 } else { 822 index = kiconv_binsearch(utf8, kiconv_utf8_gb18030, 823 KICONV_UTF8_GB18030_MAX); 824 gbkcode = kiconv_utf8_gb18030[index].value; 825 KICONV_SC_GET_GB_LEN(gbkcode, gbklen); 826 } 827 828 if (obtail - ob < gbklen) { 829 *ret = (size_t)-1; 830 return (-1); 831 } 832 833 if (index == 0) 834 (*ret)++; /* Non-identical conversion */ 835 836 if (gbklen == 2) { 837 *ob++ = (uchar_t)(gbkcode >> 8); 838 } else if (gbklen == 4) { 839 *ob++ = (uchar_t)(gbkcode >> 24); 840 *ob++ = (uchar_t)(gbkcode >> 16); 841 *ob++ = (uchar_t)(gbkcode >> 8); 842 } 843 *ob = (uchar_t)(gbkcode & 0xFF); 844 845 return (gbklen); 846 } 847 848 /* 849 * Convert single UTF-8 character to GBK. 850 * Return: > 0 - Converted successfully 851 * = -1 - E2BIG 852 */ 853 /* ARGSUSED */ 854 static int8_t 855 utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 856 uchar_t *ob, uchar_t *obtail, size_t *ret) 857 { 858 size_t index; 859 int8_t gbklen; 860 uint32_t gbkcode; 861 862 index = kiconv_binsearch(utf8, kiconv_utf8_gb18030, 863 KICONV_UTF8_GB18030_MAX); 864 gbkcode = kiconv_utf8_gb18030[index].value; 865 KICONV_SC_GET_GB_LEN(gbkcode, gbklen); 866 867 /* GBK and GB18030 share the same table, so check the length. 
*/ 868 if (gbklen == 4) { 869 index = 0; 870 gbkcode = kiconv_utf8_gb18030[index].value; 871 gbklen = 1; 872 } 873 874 if (obtail - ob < gbklen) { 875 *ret = (size_t)-1; 876 return (-1); 877 } 878 879 if (index == 0) 880 (*ret)++; /* Non-identical conversion */ 881 882 if (gbklen > 1) 883 *ob++ = (uchar_t)(gbkcode >> 8); 884 *ob = (uchar_t)(gbkcode & 0xFF); 885 886 return (gbklen); 887 } 888 889 /* 890 * Convert single UTF-8 character to GB2312. 891 * Return: > 0 - Converted successfully 892 * = -1 - E2BIG 893 */ 894 /* ARGSUSED */ 895 static int8_t 896 utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *intail, 897 uchar_t *ob, uchar_t *obtail, size_t *ret) 898 { 899 size_t index; 900 int8_t gblen; 901 uint32_t gbcode; 902 903 index = kiconv_binsearch(utf8, kiconv_utf8_gb2312, 904 KICONV_UTF8_GB2312_MAX); 905 gbcode = kiconv_utf8_gb2312[index].value; 906 gblen = (gbcode <= 0xFF) ? 1 : 2; 907 908 if (obtail - ob < gblen) { 909 *ret = (size_t)-1; 910 return (-1); 911 } 912 913 if (index == 0) 914 (*ret)++; 915 916 if (gblen > 1) 917 *ob++ = (uchar_t)(gbcode >> 8); 918 *ob = (uchar_t)(gbcode & 0xFF); 919 920 return (gblen); 921 } 922 923 static kiconv_ops_t kiconv_sc_ops_tbl[] = { 924 { 925 "gb18030", "utf-8", kiconv_open_to_cck, kiconv_to_gb18030, 926 kiconv_close_to_cck, kiconvstr_to_gb18030 927 }, 928 { 929 "utf-8", "gb18030", open_fr_gb18030, kiconv_fr_gb18030, 930 close_fr_sc, kiconvstr_fr_gb18030 931 }, 932 { 933 "gbk", "utf-8", kiconv_open_to_cck, kiconv_to_gbk, 934 kiconv_close_to_cck, kiconvstr_to_gbk 935 }, 936 { 937 "utf-8", "gbk", open_fr_gbk, kiconv_fr_gbk, 938 close_fr_sc, kiconvstr_fr_gbk 939 }, 940 { 941 "euccn", "utf-8", kiconv_open_to_cck, kiconv_to_euccn, 942 kiconv_close_to_cck, kiconvstr_to_euccn 943 }, 944 { 945 "utf-8", "euccn", open_fr_euccn, kiconv_fr_euccn, 946 close_fr_sc, kiconvstr_fr_euccn 947 }, 948 }; 949 950 static kiconv_module_info_t kiconv_sc_info = { 951 "kiconv_sc", /* module name */ 952 sizeof (kiconv_sc_ops_tbl) / sizeof 
(kiconv_sc_ops_tbl[0]), 953 kiconv_sc_ops_tbl, 954 0, 955 NULL, 956 NULL, 957 0 958 }; 959 960 static struct modlkiconv modlkiconv_sc = { 961 &mod_kiconvops, 962 "kiconv Simplified Chinese module 1.0", 963 &kiconv_sc_info 964 }; 965 966 static struct modlinkage modlinkage = { 967 MODREV_1, 968 (void *)&modlkiconv_sc, 969 NULL 970 }; 971 972 int 973 _init(void) 974 { 975 int err; 976 977 err = mod_install(&modlinkage); 978 if (err) 979 cmn_err(CE_WARN, "kiconv_sc: failed to load kernel module"); 980 981 return (err); 982 } 983 984 int 985 _fini(void) 986 { 987 int err; 988 989 /* 990 * If this module is being used, then, we cannot remove the module. 991 * The following checking will catch pretty much all usual cases. 992 * 993 * Any remaining will be catached by the kiconv_unregister_module() 994 * during mod_remove() at below. 995 */ 996 if (kiconv_module_ref_count(KICONV_MODULE_ID_SC)) 997 return (EBUSY); 998 999 err = mod_remove(&modlinkage); 1000 if (err) 1001 cmn_err(CE_WARN, "kiconv_sc: failed to remove kernel module"); 1002 1003 return (err); 1004 } 1005 1006 int 1007 _info(struct modinfo *modinfop) 1008 { 1009 return (mod_info(&modlinkage, modinfop)); 1010 }