1 /* 2 * Copright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 4 * at Electronni Visti IA, Kiev, Ukraine. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include "lint.h" 30 #include "file64.h" 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <stddef.h> 34 #include <string.h> 35 #include <wchar.h> 36 #include <errno.h> 37 #include <unistd.h> 38 #include <ctype.h> 39 #include <unistd.h> 40 #include <fcntl.h> 41 #include <assert.h> 42 #include <sys/stat.h> 43 #include <sys/mman.h> 44 45 #include "collate.h" 46 #include "setlocale.h" 47 #include "ldpart.h" 48 49 /* 50 * See the comments in usr/src/cmd/localedef/collate.c for further 51 * information. It would also be very helpful to have a copy of the 52 * POSIX standard for collation (in the locale format manual page) 53 * handy (www.opengroup.org). 54 */ 55 56 static collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; 57 static collate_char_t *char_pri_table; 58 static collate_large_t *large_pri_table; 59 static collate_chain_t *chain_pri_table; 60 static char *cache = NULL; 61 static size_t cachesz; 62 static char collate_encoding[ENCODING_LEN + 1]; 63 64 /* Exposed externally to other parts of libc. */ 65 collate_info_t *_collate_info; 66 int _collate_load_error = 1; 67 68 struct xlocale_collate __xlocale_global_collate = { 69 {{0}, "C"}, 1, 0 70 }; 71 72 struct xlocale_collate __xlocale_C_collate = { 73 {{0}, "C"}, 1, 0 74 }; 75 76 static void 77 destruct_collate(void *t) 78 { 79 struct xlocale_collate *table = t; 80 81 /* XXX */; 82 } 83 84 void * 85 __collate_load(const char *encoding, locale_t unused) 86 { 87 struct xlocale_collate *table; 88 89 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 90 return &__xlocale_C_collate; 91 } 92 93 table = calloc(sizeof(struct xlocale_collate), 1); 94 if (table == NULL) { 95 /* XXX */ 96 } 97 98 return (table); 99 } 100 101 int 102 _collate_load_tables(const char *encoding) 103 { 104 int i, chains, z; 105 char buf[PATH_MAX]; 106 char *TMP; 107 char *map; 108 collate_info_t *info; 109 struct stat sbuf; 110 int fd; 111 112 /* 'encoding' must be already checked. */ 113 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 114 _collate_load_error = 1; 115 return (_LDP_CACHE); 116 } 117 118 /* 119 * If the locale name is the same as our cache, use the cache. 120 */ 121 if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) { 122 _collate_load_error = 0; 123 return (_LDP_CACHE); 124 } 125 126 /* 127 * Slurp the locale file into the cache. 128 */ 129 130 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA", 131 _PathLocale, encoding); 132 133 if ((fd = open(buf, O_RDONLY)) < 0) 134 return (_LDP_ERROR); 135 if (fstat(fd, &sbuf) < 0) { 136 (void) close(fd); 137 return (_LDP_ERROR); 138 } 139 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 140 (void) close(fd); 141 errno = EINVAL; 142 return (_LDP_ERROR); 143 } 144 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 145 (void) close(fd); 146 if ((TMP = map) == NULL) { 147 return (_LDP_ERROR); 148 } 149 150 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 151 (void) munmap(map, sbuf.st_size); 152 errno = EINVAL; 153 return (_LDP_ERROR); 154 } 155 TMP += COLLATE_STR_LEN; 156 157 info = (void *)TMP; 158 TMP += sizeof (*info); 159 160 if ((info->directive_count < 1) || 161 (info->directive_count >= COLL_WEIGHTS_MAX) || 162 ((chains = info->chain_count) < 0)) { 163 (void) munmap(map, sbuf.st_size); 164 errno = EINVAL; 165 return (_LDP_ERROR); 166 } 167 168 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 169 (sizeof (collate_chain_t) * chains) + 170 (sizeof (collate_large_t) * info->large_count); 171 for (z = 0; z < (info->directive_count); z++) { 172 i += sizeof (collate_subst_t) * info->subst_count[z]; 173 } 174 if (i != (sbuf.st_size - (TMP - map))) { 175 (void) munmap(map, sbuf.st_size); 176 errno = EINVAL; 177 return (_LDP_ERROR); 178 } 179 180 char_pri_table = (void *)TMP; 181 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 182 183 for (z = 0; z < info->directive_count; z++) { 184 if (info->subst_count[z] > 0) { 185 subst_table[z] = (void *)TMP; 186 TMP += info->subst_count[z] * sizeof (collate_subst_t); 187 } else { 188 subst_table[z] = NULL; 189 } 190 } 191 192 if (chains > 0) { 193 chain_pri_table = (void *)TMP; 194 TMP += chains * sizeof (collate_chain_t); 195 } else 196 chain_pri_table = NULL; 197 if (info->large_count > 0) 198 large_pri_table = (void *)TMP; 199 else 200 large_pri_table = NULL; 201 202 (void) strlcpy(collate_encoding, encoding, ENCODING_LEN); 203 _collate_info = info; 204 205 if (cache) 206 (void) munmap(cache, cachesz); 207 208 cache = map; 209 cachesz = sbuf.st_size; 210 _collate_load_error = 0; 211 212 return (_LDP_LOADED); 213 } 214 215 static int32_t * 216 substsearch(const wchar_t key, int pass) 217 { 218 collate_subst_t *p; 219 int n = _collate_info->subst_count[pass]; 220 221 if (n == 0) 222 return (NULL); 223 224 if (pass >= _collate_info->directive_count) 225 return (NULL); 226 227 if (!(key & COLLATE_SUBST_PRIORITY)) 228 return (NULL); 229 230 p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 231 assert(p->key == key); 232 return (p->pri); 233 } 234 235 /* 236 * Note: for performance reasons, we have expanded bsearch here. This avoids 237 * function call overhead with each comparison. 238 */ 239 240 static collate_chain_t * 241 chainsearch(const wchar_t *key, int *len) 242 { 243 int low; 244 int high; 245 int next, compar, l; 246 collate_chain_t *p; 247 collate_chain_t *tab; 248 249 if (_collate_info->chain_count == 0) 250 return (NULL); 251 252 low = 0; 253 high = _collate_info->chain_count - 1; 254 tab = chain_pri_table; 255 256 while (low <= high) { 257 next = (low + high) / 2; 258 p = tab + next; 259 compar = *key - *p->str; 260 if (compar == 0) { 261 l = wcsnlen(p->str, COLLATE_STR_LEN); 262 compar = wcsncmp(key, p->str, l); 263 if (compar == 0) { 264 *len = l; 265 return (p); 266 } 267 } 268 if (compar > 0) 269 low = next + 1; 270 else 271 high = next - 1; 272 } 273 return (NULL); 274 } 275 276 static collate_large_t * 277 largesearch(const wchar_t key) 278 { 279 int low = 0; 280 int high = _collate_info->large_count - 1; 281 int next, compar; 282 collate_large_t *p; 283 collate_large_t *tab = large_pri_table; 284 285 if (_collate_info->large_count == 0) 286 return (NULL); 287 288 while (low <= high) { 289 next = (low + high) / 2; 290 p = tab + next; 291 compar = key - p->val; 292 if (compar == 0) 293 return (p); 294 if (compar > 0) 295 low = next + 1; 296 else 297 high = next - 1; 298 } 299 return (NULL); 300 } 301 302 void 303 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state) 304 { 305 collate_chain_t *p2; 306 collate_large_t *match; 307 collate_info_t *info = _collate_info; 308 int p, l; 309 int *sptr; 310 311 /* 312 * If this is the "last" pass for the UNDEFINED, then 313 * we just return the priority itself. 314 */ 315 if (which >= info->directive_count) { 316 *pri = *t; 317 *len = 1; 318 *state = NULL; 319 return; 320 } 321 322 /* 323 * If we have remaining substitution data from a previous 324 * call, consume it first. 325 */ 326 if ((sptr = *state) != NULL) { 327 *pri = *sptr; 328 sptr++; 329 *state = *sptr ? sptr : NULL; 330 *len = 0; 331 return; 332 } 333 334 /* No active substitutions */ 335 *len = 1; 336 337 /* 338 * Check for composites such as dipthongs that collate as a 339 * single element (aka chains or collating-elements). 340 */ 341 if (((p2 = chainsearch(t, &l)) != NULL) && 342 ((p = p2->pri[which]) >= 0)) { 343 344 *len = l; 345 *pri = p; 346 347 } else if (*t <= UCHAR_MAX) { 348 349 /* 350 * Character is a small (8-bit) character. 351 * We just look these up directly for speed. 352 */ 353 *pri = char_pri_table[*t].pri[which]; 354 355 } else if ((info->large_count > 0) && 356 ((match = largesearch(*t)) != NULL)) { 357 358 /* 359 * Character was found in the extended table. 360 */ 361 *pri = match->pri.pri[which]; 362 363 } else { 364 /* 365 * Character lacks a specific definition. 366 */ 367 if (info->directive[which] & DIRECTIVE_UNDEFINED) { 368 /* Mask off sign bit to prevent ordering confusion. */ 369 *pri = (*t & COLLATE_MAX_PRIORITY); 370 } else { 371 *pri = info->undef_pri[which]; 372 } 373 /* No substitutions for undefined characters! */ 374 return; 375 } 376 377 /* 378 * Try substituting (expanding) the character. We are 379 * currently doing this *after* the chain compression. I 380 * think it should not matter, but this way might be slightly 381 * faster. 382 * 383 * We do this after the priority search, as this will help us 384 * to identify a single key value. In order for this to work, 385 * its important that the priority assigned to a given element 386 * to be substituted be unique for that level. The localedef 387 * code ensures this for us. 388 */ 389 if ((sptr = substsearch(*pri, which)) != NULL) { 390 if ((*pri = *sptr) != 0) { 391 sptr++; 392 *state = *sptr ? sptr : NULL; 393 } 394 } 395 396 } 397 398 /* 399 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 400 * NOT NULL terminate. That is left to the caller. 401 */ 402 size_t 403 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room) 404 { 405 int pri; 406 int len; 407 const wchar_t *t; 408 wchar_t *tr = NULL; 409 int direc; 410 int pass; 411 int32_t *state; 412 size_t want = 0; 413 size_t need = 0; 414 415 assert(src); 416 417 for (pass = 0; pass <= _collate_info->directive_count; pass++) { 418 419 state = NULL; 420 421 if (pass != 0) { 422 /* insert level separator from the previous pass */ 423 if (room) { 424 *xf++ = 1; 425 room--; 426 } 427 want++; 428 } 429 430 /* special pass for undefined */ 431 if (pass == _collate_info->directive_count) { 432 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 433 } else { 434 direc = _collate_info->directive[pass]; 435 } 436 437 t = src; 438 439 if (direc & DIRECTIVE_BACKWARD) { 440 wchar_t *bp, *fp, c; 441 if (tr) 442 free(tr); 443 if ((tr = wcsdup(t)) == NULL) { 444 errno = ENOMEM; 445 goto fail; 446 } 447 bp = tr; 448 fp = tr + wcslen(tr) - 1; 449 while (bp < fp) { 450 c = *bp; 451 *bp++ = *fp; 452 *fp-- = c; 453 } 454 t = (const wchar_t *)tr; 455 } 456 457 if (direc & DIRECTIVE_POSITION) { 458 while (*t || state) { 459 _collate_lookup(t, &len, &pri, pass, &state); 460 t += len; 461 if (pri <= 0) { 462 if (pri < 0) { 463 errno = EINVAL; 464 goto fail; 465 } 466 pri = COLLATE_MAX_PRIORITY; 467 } 468 if (room) { 469 *xf++ = pri; 470 room--; 471 } 472 want++; 473 need = want; 474 } 475 } else { 476 while (*t || state) { 477 _collate_lookup(t, &len, &pri, pass, &state); 478 t += len; 479 if (pri <= 0) { 480 if (pri < 0) { 481 errno = EINVAL; 482 goto fail; 483 } 484 continue; 485 } 486 if (room) { 487 *xf++ = pri; 488 room--; 489 } 490 want++; 491 need = want; 492 } 493 } 494 } 495 496 end: 497 if (tr) 498 free(tr); 499 return (need); 500 501 fail: 502 if (tr) 503 free(tr); 504 return ((size_t)(-1)); 505 } 506 507 /* 508 * In the non-POSIX case, we transform each character into a string of 509 * characters representing the character's priority. Since char is usually 510 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 511 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 512 * bits per byte. 513 * 514 * It turns out that we sometimes have real priorities that are 515 * 31-bits wide. (But: be careful using priorities where the high 516 * order bit is set -- i.e. the priority is negative. The sort order 517 * may be surprising!) 518 * 519 * TODO: This would be a good area to optimize somewhat. It turns out 520 * that real prioririties *except for the last UNDEFINED pass* are generally 521 * very small. We need the localedef code to precalculate the max 522 * priority for us, and ideally also give us a mask, and then we could 523 * severely limit what we expand to. 524 */ 525 #define XFRM_BYTES 6 526 #define XFRM_OFFSET ('0') /* make all printable characters */ 527 #define XFRM_SHIFT 6 528 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 529 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 530 531 static int 532 xfrm(unsigned char *p, int pri, int pass) 533 { 534 /* we use unsigned to ensure zero fill on right shift */ 535 uint32_t val = (uint32_t)_collate_info->pri_count[pass]; 536 int nc = 0; 537 538 while (val) { 539 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 540 pri >>= XFRM_SHIFT; 541 val >>= XFRM_SHIFT; 542 p++; 543 nc++; 544 } 545 return (nc); 546 } 547 548 size_t 549 _collate_sxfrm(const wchar_t *src, char *xf, size_t room) 550 { 551 int pri; 552 int len; 553 const wchar_t *t; 554 wchar_t *tr = NULL; 555 int direc; 556 int pass; 557 int32_t *state; 558 size_t want = 0; 559 size_t need = 0; 560 int b; 561 uint8_t buf[XFRM_BYTES]; 562 563 assert(src); 564 565 for (pass = 0; pass <= _collate_info->directive_count; pass++) { 566 567 state = NULL; 568 569 if (pass != 0) { 570 /* insert level separator from the previous pass */ 571 if (room) { 572 *xf++ = XFRM_SEP; 573 room--; 574 } 575 want++; 576 } 577 578 /* special pass for undefined */ 579 if (pass == _collate_info->directive_count) { 580 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 581 } else { 582 direc = _collate_info->directive[pass]; 583 } 584 585 t = src; 586 587 if (direc & DIRECTIVE_BACKWARD) { 588 wchar_t *bp, *fp, c; 589 if (tr) 590 free(tr); 591 if ((tr = wcsdup(t)) == NULL) { 592 errno = ENOMEM; 593 goto fail; 594 } 595 bp = tr; 596 fp = tr + wcslen(tr) - 1; 597 while (bp < fp) { 598 c = *bp; 599 *bp++ = *fp; 600 *fp-- = c; 601 } 602 t = (const wchar_t *)tr; 603 } 604 605 if (direc & DIRECTIVE_POSITION) { 606 while (*t || state) { 607 608 _collate_lookup(t, &len, &pri, pass, &state); 609 t += len; 610 if (pri <= 0) { 611 if (pri < 0) { 612 errno = EINVAL; 613 goto fail; 614 } 615 pri = COLLATE_MAX_PRIORITY; 616 } 617 618 b = xfrm(buf, pri, pass); 619 want += b; 620 if (room) { 621 while (b) { 622 b--; 623 if (room) { 624 *xf++ = buf[b]; 625 room--; 626 } 627 } 628 } 629 need = want; 630 } 631 } else { 632 while (*t || state) { 633 _collate_lookup(t, &len, &pri, pass, &state); 634 t += len; 635 if (pri <= 0) { 636 if (pri < 0) { 637 errno = EINVAL; 638 goto fail; 639 } 640 continue; 641 } 642 643 b = xfrm(buf, pri, pass); 644 want += b; 645 if (room) { 646 647 while (b) { 648 b--; 649 if (room) { 650 *xf++ = buf[b]; 651 room--; 652 } 653 } 654 } 655 need = want; 656 } 657 } 658 } 659 660 end: 661 if (tr) 662 free(tr); 663 return (need); 664 665 fail: 666 if (tr) 667 free(tr); 668 return ((size_t)(-1)); 669 }