1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/sysmacros.h> 30 #include <sys/systm.h> 31 #include <sys/debug.h> 32 #include <sys/kmem.h> 33 #include <sys/sunddi.h> 34 #include <sys/byteorder.h> 35 #include <sys/errno.h> 36 #include <sys/modctl.h> 37 #include <sys/u8_textprep.h> 38 #include <sys/kiconv.h> 39 #include <sys/kiconv_cck_common.h> 40 #include <sys/kiconv_ko.h> 41 #include <sys/kiconv_uhc_utf8.h> 42 #include <sys/kiconv_utf8_uhc.h> 43 #include <sys/kiconv_euckr_utf8.h> 44 #include <sys/kiconv_utf8_euckr.h> 45 46 static int8_t utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 47 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 48 static int8_t utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 49 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 50 static int8_t ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail, 51 size_t *ret_val, kiconv_table_array_t *table, size_t nitems); 52 53 54 #define KICONV_KO_EUCKR (0x01) 55 #define KICONV_KO_UHC (0x02) 56 #define KICONV_KO_MAX_MAGIC_ID (0x02) 57 58 static void * 59 open_fr_euckr() 60 { 61 return ((void *)KICONV_KO_EUCKR); 62 } 63 64 static void * 65 open_fr_uhc() 66 { 67 return ((void *)KICONV_KO_UHC); 68 } 69 70 static int 71 close_fr_ko(void *s) 72 { 73 if ((uintptr_t)s > KICONV_KO_MAX_MAGIC_ID) 74 return (EBADF); 75 76 return (0); 77 } 78 79 /* 80 * Encoding convertor from EUC-KR to UTF-8. 81 */ 82 static size_t 83 kiconv_fr_euckr(void *kcd, char **inbuf, size_t *inbufleft, 84 char **outbuf, size_t *outbufleft, int *errno) 85 { 86 uchar_t *ib; 87 uchar_t *ob; 88 uchar_t *ibtail; 89 uchar_t *obtail; 90 size_t ret_val; 91 int8_t sz; 92 uint32_t euckr_val; 93 94 /* Check on the kiconv code conversion descriptor. */ 95 if (kcd == NULL || kcd == (void *)-1) { 96 *errno = EBADF; 97 return ((size_t)-1); 98 } 99 100 /* If this is a state reset request, process and return. */ 101 if (inbuf == NULL || *inbuf == NULL) { 102 return (0); 103 } 104 105 ret_val = 0; 106 ib = (uchar_t *)*inbuf; 107 ob = (uchar_t *)*outbuf; 108 ibtail = ib + *inbufleft; 109 obtail = ob + *outbufleft; 110 111 while (ib < ibtail) { 112 if (KICONV_IS_ASCII(*ib)) { 113 if (ob >= obtail) { 114 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 115 } 116 117 *ob++ = *ib++; 118 continue; 119 } 120 121 /* 122 * Issue EILSEQ error if the first byte is not a 123 * valid EUC-KR leading byte. 124 */ 125 if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) { 126 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 127 } 128 129 /* 130 * Issue EINVAL error if input buffer has an incomplete 131 * character at the end of the buffer. 132 */ 133 if (ibtail - ib < 2) { 134 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 135 } 136 137 /* 138 * Issue EILSEQ error if the remaining byte is not 139 * a valid EUC-KR byte. 140 */ 141 if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) { 142 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 143 } 144 145 euckr_val = (uint32_t)(*ib) << 8 | *(ib + 1); 146 sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val, 147 kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX); 148 149 if (sz < 0) { 150 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 151 } 152 153 ib += 2; 154 ob += sz; 155 } 156 157 *inbuf = (char *)ib; 158 *inbufleft = ibtail - ib; 159 *outbuf = (char *)ob; 160 *outbufleft = obtail - ob; 161 162 return (ret_val); 163 } 164 165 /* 166 * String based encoding convertor from EUC-KR to UTF-8. 167 */ 168 static size_t 169 kiconvstr_fr_euckr(char *inarray, size_t *inlen, char *outarray, 170 size_t *outlen, int flag, int *errno) 171 { 172 uchar_t *ib; 173 uchar_t *ob; 174 uchar_t *ibtail; 175 uchar_t *obtail; 176 uchar_t *oldib; 177 size_t ret_val; 178 int8_t sz; 179 uint32_t euckr_val; 180 boolean_t do_not_ignore_null; 181 182 ret_val = 0; 183 ib = (uchar_t *)inarray; 184 ob = (uchar_t *)outarray; 185 ibtail = ib + *inlen; 186 obtail = ob + *outlen; 187 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 188 189 while (ib < ibtail) { 190 if (*ib == '\0' && do_not_ignore_null) 191 break; 192 193 if (KICONV_IS_ASCII(*ib)) { 194 if (ob >= obtail) { 195 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 196 } 197 198 *ob++ = *ib++; 199 continue; 200 } 201 202 oldib = ib; 203 204 if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) { 205 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 206 } 207 208 if (ibtail - ib < 2) { 209 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 210 } 211 212 if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) { 213 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 214 } 215 216 euckr_val = *ib++; 217 euckr_val = (euckr_val << 8) | *ib++; 218 sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val, 219 kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX); 220 221 if (sz < 0) { 222 ib = oldib; 223 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 224 } 225 226 ob += sz; 227 continue; 228 229 REPLACE_INVALID: 230 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 231 ib = oldib; 232 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 233 } 234 235 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 236 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 237 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 238 ret_val++; 239 } 240 241 *inlen = ibtail - ib; 242 *outlen = obtail - ob; 243 244 return (ret_val); 245 } 246 247 /* 248 * Encoding convertor from Unified Hangul Code to UTF-8. 249 */ 250 static size_t 251 kiconv_fr_uhc(void *kcd, char **inbuf, size_t *inbufleft, 252 char **outbuf, size_t *outbufleft, int *errno) 253 { 254 uchar_t *ib; 255 uchar_t *ob; 256 uchar_t *ibtail; 257 uchar_t *obtail; 258 size_t ret_val; 259 int8_t sz; 260 uint32_t uhc_val; 261 262 /* Check on the kiconv code conversion descriptor. */ 263 if (kcd == NULL || kcd == (void *)-1) { 264 *errno = EBADF; 265 return ((size_t)-1); 266 } 267 268 /* If this is a state reset request, process and return. */ 269 if (inbuf == NULL || *inbuf == NULL) { 270 return (0); 271 } 272 273 ret_val = 0; 274 ib = (uchar_t *)*inbuf; 275 ob = (uchar_t *)*outbuf; 276 ibtail = ib + *inbufleft; 277 obtail = ob + *outbufleft; 278 279 while (ib < ibtail) { 280 if (KICONV_IS_ASCII(*ib)) { 281 if (ob >= obtail) { 282 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 283 } 284 285 *ob++ = *ib++; 286 continue; 287 } 288 289 /* 290 * Issue EILSEQ error if the first byte is not a 291 * valid UHC leading byte. 292 */ 293 if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) { 294 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 295 } 296 297 /* 298 * Issue EINVAL error if input buffer has an incomplete 299 * character at the end of the buffer. 300 */ 301 if (ibtail - ib < 2) { 302 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 303 } 304 305 /* 306 * Issue EILSEQ error if the remaining byte is not 307 * a valid UHC byte. 308 */ 309 if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) { 310 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 311 } 312 313 uhc_val = (uint32_t)(*ib) << 8 | *(ib + 1); 314 sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val, 315 kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX); 316 317 if (sz < 0) { 318 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 319 } 320 321 ib += 2; 322 ob += sz; 323 } 324 325 *inbuf = (char *)ib; 326 *inbufleft = ibtail - ib; 327 *outbuf = (char *)ob; 328 *outbufleft = obtail - ob; 329 330 return (ret_val); 331 } 332 333 /* 334 * String based encoding convertor from Unified Hangul Code to UTF-8. 335 */ 336 static size_t 337 kiconvstr_fr_uhc(char *inarray, size_t *inlen, char *outarray, 338 size_t *outlen, int flag, int *errno) 339 { 340 uchar_t *ib; 341 uchar_t *ob; 342 uchar_t *ibtail; 343 uchar_t *obtail; 344 uchar_t *oldib; 345 size_t ret_val; 346 int8_t sz; 347 uint32_t uhc_val; 348 boolean_t do_not_ignore_null; 349 350 ret_val = 0; 351 ib = (uchar_t *)inarray; 352 ob = (uchar_t *)outarray; 353 ibtail = ib + *inlen; 354 obtail = ob + *outlen; 355 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 356 357 while (ib < ibtail) { 358 if (*ib == '\0' && do_not_ignore_null) 359 break; 360 361 if (KICONV_IS_ASCII(*ib)) { 362 if (ob >= obtail) { 363 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 364 } 365 366 *ob++ = *ib++; 367 continue; 368 } 369 370 oldib = ib; 371 372 if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) { 373 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 374 } 375 376 if (ibtail - ib < 2) { 377 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 378 } 379 380 if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) { 381 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 382 } 383 384 uhc_val = *ib++; 385 uhc_val = (uhc_val << 8) | *ib++; 386 sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val, 387 kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX); 388 389 if (sz < 0) { 390 ib = oldib; 391 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 392 } 393 394 ob += sz; 395 continue; 396 397 REPLACE_INVALID: 398 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 399 ib = oldib; 400 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 401 } 402 403 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 404 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 405 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 406 ret_val++; 407 } 408 409 *inlen = ibtail - ib; 410 *outlen = obtail - ob; 411 412 return (ret_val); 413 } 414 415 /* 416 * Encoding convertor from UTF-8 to EUC-KR. 417 */ 418 static size_t 419 kiconv_to_euckr(void *kcd, char **inbuf, size_t *inbytesleft, 420 char **outbuf, size_t *outbytesleft, int *errno) 421 { 422 return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 423 outbytesleft, errno, utf8_to_euckr)); 424 } 425 426 /* 427 * Encoding convertor from UTF-8 to Unified Hangul Code. 428 */ 429 static size_t 430 kiconv_to_uhc(void *kcd, char **inbuf, size_t *inbytesleft, 431 char **outbuf, size_t *outbytesleft, int *errno) 432 { 433 return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 434 outbytesleft, errno, utf8_to_uhc)); 435 } 436 437 /* 438 * String based encoding convertor from UTF-8 to EUC-KR. 439 */ 440 static size_t 441 kiconvstr_to_euckr(char *inarray, size_t *inlen, char *outarray, 442 size_t *outlen, int flag, int *errno) 443 { 444 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 445 (uchar_t *)outarray, outlen, flag, errno, utf8_to_euckr); 446 } 447 448 /* 449 * String based encoding convertor from UTF-8 to Unified Hangul Code. 450 */ 451 static size_t 452 kiconvstr_to_uhc(char *inarray, size_t *inlen, char *outarray, 453 size_t *outlen, int flag, int *errno) 454 { 455 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 456 (uchar_t *)outarray, outlen, flag, errno, utf8_to_uhc); 457 } 458 459 /* 460 * Convert an UTF-8 character to a character of ko encodings 461 * (EUC-KR or UHC). 462 */ 463 static int8_t 464 utf8_to_ko(uint32_t utf8, uchar_t *ob, uchar_t *obtail, size_t *ret_val, 465 kiconv_table_t *table, size_t nitems) 466 { 467 size_t index; 468 size_t kocode; 469 int8_t kolen; 470 471 if (KICONV_KO_IS_UDC_IN_UTF8(utf8)) { 472 /* User Definable Area handing. */ 473 kocode = (((utf8 & 0xF0000) >> 4) | ((utf8 & 0x3F00) >> 2) | 474 (utf8 & 0x3F)) - KICONV_KO_UDA_UCS4_START; 475 if (kocode < KICONV_KO_UDA_RANGE) { 476 kocode = (KICONV_KO_UDA_EUC_SEG1 << 8) | 477 (kocode + KICONV_KO_UDA_OFFSET_START); 478 } else { 479 /* 0x43 = 0xA1 - 0x5E */ 480 kocode = (KICONV_KO_UDA_EUC_SEG2 << 8) | 481 (kocode + 0x43); 482 } 483 484 index = 1; 485 } else { 486 index = kiconv_binsearch(utf8, table, nitems); 487 kocode = table[index].value; 488 } 489 490 kolen = (kocode <= 0xFF) ? 1 : 2; 491 492 if (obtail - ob < kolen) { 493 *ret_val = (size_t)-1; 494 return (-1); 495 } 496 497 if (index == 0) 498 (*ret_val)++; 499 500 if (kolen > 1) 501 *ob++ = (uchar_t)(kocode >> 8); 502 *ob = (uchar_t)(kocode & 0xFF); 503 504 return (kolen); 505 } 506 507 /* 508 * Convert an UTF-8 character to Unified Hangual Code. 509 */ 510 /* ARGSUSED */ 511 static int8_t 512 utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 513 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 514 { 515 return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_uhc, 516 KICONV_UTF8_UHC_MAX)); 517 } 518 519 /* 520 * Convert an UTF-8 character to EUC-KR. 521 */ 522 /* ARGSUSED */ 523 static int8_t 524 utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 525 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 526 { 527 return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_euckr, 528 KICONV_UTF8_EUCKR_MAX)); 529 } 530 531 /* 532 * Convert a single ko encoding (EUC-KR or UHC) character to UTF-8. 533 */ 534 static int8_t 535 ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val, 536 kiconv_table_array_t *table, size_t nitems) 537 { 538 size_t index; 539 int8_t sz; 540 uchar_t udc[3]; 541 uchar_t *u8; 542 543 if (KICONV_KO_IS_UDC_IN_EUC(ko_val)) { 544 /* UDA(User Definable Area) handling. */ 545 uint32_t u32; 546 547 u32 = (ko_val & 0xFF) + (((ko_val & 0xFF00) == 0xC900) ? 548 KICONV_KO_UDA_OFFSET_1 : KICONV_KO_UDA_OFFSET_2); 549 udc[0] = 0xEF; 550 udc[1] = (uchar_t)(0x80 | (u32 & 0x00000FC0) >> 6); 551 udc[2] = (uchar_t)(0x80 | (u32 & 0x0000003F)); 552 u8 = udc; 553 index = 1; 554 } else { 555 index = kiconv_binsearch(ko_val, table, nitems); 556 u8 = table[index].u8; 557 } 558 559 sz = u8_number_of_bytes[u8[0]]; 560 561 if (obtail - ob < sz) { 562 *ret_val = (size_t)-1; 563 return (-1); 564 } 565 566 if (index == 0) 567 (*ret_val)++; /* Non-identical conversion */ 568 569 for (index = 0; index < sz; index++) 570 *ob++ = u8[index]; 571 572 return (sz); 573 } 574 575 static kiconv_ops_t kiconv_ko_ops_tbl[] = { 576 { 577 "euc-kr", "utf-8", kiconv_open_to_cck, kiconv_to_euckr, 578 kiconv_close_to_cck, kiconvstr_to_euckr 579 }, 580 { 581 "utf-8", "euc-kr", open_fr_euckr, kiconv_fr_euckr, 582 close_fr_ko, kiconvstr_fr_euckr 583 }, 584 { 585 "unifiedhangul", "utf-8", kiconv_open_to_cck, kiconv_to_uhc, 586 kiconv_close_to_cck, kiconvstr_to_uhc 587 }, 588 { 589 "utf-8", "unifiedhangul", open_fr_uhc, kiconv_fr_uhc, 590 close_fr_ko, kiconvstr_fr_uhc 591 } 592 }; 593 594 static kiconv_module_info_t kiconv_ko_info = { 595 "kiconv_ko", /* module name */ 596 sizeof (kiconv_ko_ops_tbl) / sizeof (kiconv_ko_ops_tbl[0]), 597 kiconv_ko_ops_tbl, 598 0, 599 NULL, 600 NULL, 601 0 602 }; 603 604 static struct modlkiconv modlkiconv_ko = { 605 &mod_kiconvops, 606 "kiconv korean module 1.0", 607 &kiconv_ko_info 608 }; 609 610 static struct modlinkage modlinkage = { 611 MODREV_1, 612 { (void *)&modlkiconv_ko, NULL } 613 }; 614 615 int 616 _init(void) 617 { 618 int err; 619 620 err = mod_install(&modlinkage); 621 if (err) 622 cmn_err(CE_WARN, "kiconv_ko: failed to load kernel module"); 623 624 return (err); 625 } 626 627 int 628 _fini(void) 629 { 630 int err; 631 632 /* 633 * If this module is being used, then, we cannot remove the module. 634 * The following checking will catch pretty much all usual cases. 635 * 636 * Any remaining will be catached by the kiconv_unregister_module() 637 * during mod_remove() at below. 638 */ 639 if (kiconv_module_ref_count(KICONV_MODULE_ID_KO)) 640 return (EBUSY); 641 642 err = mod_remove(&modlinkage); 643 if (err) 644 cmn_err(CE_WARN, "kiconv_ko: failed to remove kernel module"); 645 646 return (err); 647 } 648 649 int 650 _info(struct modinfo *modinfop) 651 { 652 return (mod_info(&modlinkage, modinfop)); 653 }