1 /* 2 * Copright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 4 * at Electronni Visti IA, Kiev, Ukraine. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include "lint.h" 30 #include "file64.h" 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <stddef.h> 34 #include <string.h> 35 #include <wchar.h> 36 #include <errno.h> 37 #include <unistd.h> 38 #include <ctype.h> 39 #include <unistd.h> 40 #include <fcntl.h> 41 #include <assert.h> 42 #include <sys/stat.h> 43 #include <sys/mman.h> 44 45 #include "collate.h" 46 #include "setlocale.h" 47 #include "ldpart.h" 48 49 /* 50 * See the comments in usr/src/cmd/localedef/collate.c for further 51 * information. It would also be very helpful to have a copy of the 52 * POSIX standard for collation (in the locale format manual page) 53 * handy (www.opengroup.org). 54 */ 55 56 static collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; 57 static collate_char_t *char_pri_table; 58 static collate_large_t *large_pri_table; 59 static collate_chain_t *chain_pri_table; 60 static char *cache = NULL; 61 static size_t cachesz; 62 static char collate_encoding[ENCODING_LEN + 1]; 63 64 /* Exposed externally to other parts of libc. */ 65 collate_info_t *_collate_info; 66 int _collate_load_error = 1; 67 68 69 int 70 _collate_load_tables(const char *encoding) 71 { 72 int i, chains, z; 73 char buf[PATH_MAX]; 74 char *TMP; 75 char *map; 76 collate_info_t *info; 77 struct stat sbuf; 78 int fd; 79 80 /* 'encoding' must be already checked. */ 81 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 82 _collate_load_error = 1; 83 return (_LDP_CACHE); 84 } 85 86 /* 87 * If the locale name is the same as our cache, use the cache. 88 */ 89 if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) { 90 _collate_load_error = 0; 91 return (_LDP_CACHE); 92 } 93 94 /* 95 * Slurp the locale file into the cache. 96 */ 97 98 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA", 99 _PathLocale, encoding); 100 101 if ((fd = open(buf, O_RDONLY)) < 0) 102 return (_LDP_ERROR); 103 if (fstat(fd, &sbuf) < 0) { 104 (void) close(fd); 105 return (_LDP_ERROR); 106 } 107 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 108 (void) close(fd); 109 errno = EINVAL; 110 return (_LDP_ERROR); 111 } 112 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 113 (void) close(fd); 114 if ((TMP = map) == NULL) { 115 return (_LDP_ERROR); 116 } 117 118 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 119 (void) munmap(map, sbuf.st_size); 120 errno = EINVAL; 121 return (_LDP_ERROR); 122 } 123 TMP += COLLATE_STR_LEN; 124 125 info = (void *)TMP; 126 TMP += sizeof (*info); 127 128 if ((info->directive_count < 1) || 129 (info->directive_count >= COLL_WEIGHTS_MAX) || 130 ((chains = info->chain_count) < 0)) { 131 (void) munmap(map, sbuf.st_size); 132 errno = EINVAL; 133 return (_LDP_ERROR); 134 } 135 136 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 137 (sizeof (collate_chain_t) * chains) + 138 (sizeof (collate_large_t) * info->large_count); 139 for (z = 0; z < (info->directive_count); z++) { 140 i += sizeof (collate_subst_t) * info->subst_count[z]; 141 } 142 if (i != (sbuf.st_size - (TMP - map))) { 143 (void) munmap(map, sbuf.st_size); 144 errno = EINVAL; 145 return (_LDP_ERROR); 146 } 147 148 char_pri_table = (void *)TMP; 149 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 150 151 for (z = 0; z < info->directive_count; z++) { 152 if (info->subst_count[z] > 0) { 153 subst_table[z] = (void *)TMP; 154 TMP += info->subst_count[z] * sizeof (collate_subst_t); 155 } else { 156 subst_table[z] = NULL; 157 } 158 } 159 160 if (chains > 0) { 161 chain_pri_table = (void *)TMP; 162 TMP += chains * sizeof (collate_chain_t); 163 } else 164 chain_pri_table = NULL; 165 if (info->large_count > 0) 166 large_pri_table = (void *)TMP; 167 else 168 large_pri_table = NULL; 169 170 (void) strlcpy(collate_encoding, encoding, ENCODING_LEN); 171 _collate_info = info; 172 173 if (cache) 174 (void) munmap(cache, cachesz); 175 176 cache = map; 177 cachesz = sbuf.st_size; 178 _collate_load_error = 0; 179 180 return (_LDP_LOADED); 181 } 182 183 static int32_t * 184 substsearch(const wchar_t key, int pass) 185 { 186 collate_subst_t *p; 187 int n = _collate_info->subst_count[pass]; 188 189 if (n == 0) 190 return (NULL); 191 192 if (pass >= _collate_info->directive_count) 193 return (NULL); 194 195 if (!(key & COLLATE_SUBST_PRIORITY)) 196 return (NULL); 197 198 p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 199 assert(p->key == key); 200 return (p->pri); 201 } 202 203 /* 204 * Note: for performance reasons, we have expanded bsearch here. This avoids 205 * function call overhead with each comparison. 206 */ 207 208 static collate_chain_t * 209 chainsearch(const wchar_t *key, int *len) 210 { 211 int low; 212 int high; 213 int next, compar, l; 214 collate_chain_t *p; 215 collate_chain_t *tab; 216 217 if (_collate_info->chain_count == 0) 218 return (NULL); 219 220 low = 0; 221 high = _collate_info->chain_count - 1; 222 tab = chain_pri_table; 223 224 while (low <= high) { 225 next = (low + high) / 2; 226 p = tab + next; 227 compar = *key - *p->str; 228 if (compar == 0) { 229 l = wcsnlen(p->str, COLLATE_STR_LEN); 230 compar = wcsncmp(key, p->str, l); 231 if (compar == 0) { 232 *len = l; 233 return (p); 234 } 235 } 236 if (compar > 0) 237 low = next + 1; 238 else 239 high = next - 1; 240 } 241 return (NULL); 242 } 243 244 static collate_large_t * 245 largesearch(const wchar_t key) 246 { 247 int low = 0; 248 int high = _collate_info->large_count - 1; 249 int next, compar; 250 collate_large_t *p; 251 collate_large_t *tab = large_pri_table; 252 253 if (_collate_info->large_count == 0) 254 return (NULL); 255 256 while (low <= high) { 257 next = (low + high) / 2; 258 p = tab + next; 259 compar = key - p->val; 260 if (compar == 0) 261 return (p); 262 if (compar > 0) 263 low = next + 1; 264 else 265 high = next - 1; 266 } 267 return (NULL); 268 } 269 270 void 271 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state) 272 { 273 collate_chain_t *p2; 274 collate_large_t *match; 275 collate_info_t *info = _collate_info; 276 int p, l; 277 int *sptr; 278 279 /* 280 * If this is the "last" pass for the UNDEFINED, then 281 * we just return the priority itself. 282 */ 283 if (which >= info->directive_count) { 284 *pri = *t; 285 *len = 1; 286 *state = NULL; 287 return; 288 } 289 290 /* 291 * If we have remaining substitution data from a previous 292 * call, consume it first. 293 */ 294 if ((sptr = *state) != NULL) { 295 *pri = *sptr; 296 sptr++; 297 *state = *sptr ? sptr : NULL; 298 *len = 0; 299 return; 300 } 301 302 /* No active substitutions */ 303 *len = 1; 304 305 /* 306 * Check for composites such as dipthongs that collate as a 307 * single element (aka chains or collating-elements). 308 */ 309 if (((p2 = chainsearch(t, &l)) != NULL) && 310 ((p = p2->pri[which]) >= 0)) { 311 312 *len = l; 313 *pri = p; 314 315 } else if (*t <= UCHAR_MAX) { 316 317 /* 318 * Character is a small (8-bit) character. 319 * We just look these up directly for speed. 320 */ 321 *pri = char_pri_table[*t].pri[which]; 322 323 } else if ((info->large_count > 0) && 324 ((match = largesearch(*t)) != NULL)) { 325 326 /* 327 * Character was found in the extended table. 328 */ 329 *pri = match->pri.pri[which]; 330 331 } else { 332 /* 333 * Character lacks a specific definition. 334 */ 335 if (info->directive[which] & DIRECTIVE_UNDEFINED) { 336 /* Mask off sign bit to prevent ordering confusion. */ 337 *pri = (*t & COLLATE_MAX_PRIORITY); 338 } else { 339 *pri = info->undef_pri[which]; 340 } 341 /* No substitutions for undefined characters! */ 342 return; 343 } 344 345 /* 346 * Try substituting (expanding) the character. We are 347 * currently doing this *after* the chain compression. I 348 * think it should not matter, but this way might be slightly 349 * faster. 350 * 351 * We do this after the priority search, as this will help us 352 * to identify a single key value. In order for this to work, 353 * its important that the priority assigned to a given element 354 * to be substituted be unique for that level. The localedef 355 * code ensures this for us. 356 */ 357 if ((sptr = substsearch(*pri, which)) != NULL) { 358 if ((*pri = *sptr) != 0) { 359 sptr++; 360 *state = *sptr ? sptr : NULL; 361 } 362 } 363 364 } 365 366 /* 367 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 368 * NOT NULL terminate. That is left to the caller. 369 */ 370 size_t 371 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room) 372 { 373 int pri; 374 int len; 375 const wchar_t *t; 376 wchar_t *tr = NULL; 377 int direc; 378 int pass; 379 int32_t *state; 380 size_t want = 0; 381 size_t need = 0; 382 383 assert(src); 384 385 for (pass = 0; pass <= _collate_info->directive_count; pass++) { 386 387 state = NULL; 388 389 if (pass != 0) { 390 /* insert level separator from the previous pass */ 391 if (room) { 392 *xf++ = 1; 393 room--; 394 } 395 want++; 396 } 397 398 /* special pass for undefined */ 399 if (pass == _collate_info->directive_count) { 400 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 401 } else { 402 direc = _collate_info->directive[pass]; 403 } 404 405 t = src; 406 407 if (direc & DIRECTIVE_BACKWARD) { 408 wchar_t *bp, *fp, c; 409 if (tr) 410 free(tr); 411 if ((tr = wcsdup(t)) == NULL) { 412 errno = ENOMEM; 413 goto fail; 414 } 415 bp = tr; 416 fp = tr + wcslen(tr) - 1; 417 while (bp < fp) { 418 c = *bp; 419 *bp++ = *fp; 420 *fp-- = c; 421 } 422 t = (const wchar_t *)tr; 423 } 424 425 if (direc & DIRECTIVE_POSITION) { 426 while (*t || state) { 427 _collate_lookup(t, &len, &pri, pass, &state); 428 t += len; 429 if (pri <= 0) { 430 if (pri < 0) { 431 errno = EINVAL; 432 goto fail; 433 } 434 pri = COLLATE_MAX_PRIORITY; 435 } 436 if (room) { 437 *xf++ = pri; 438 room--; 439 } 440 want++; 441 need = want; 442 } 443 } else { 444 while (*t || state) { 445 _collate_lookup(t, &len, &pri, pass, &state); 446 t += len; 447 if (pri <= 0) { 448 if (pri < 0) { 449 errno = EINVAL; 450 goto fail; 451 } 452 continue; 453 } 454 if (room) { 455 *xf++ = pri; 456 room--; 457 } 458 want++; 459 need = want; 460 } 461 } 462 } 463 464 end: 465 if (tr) 466 free(tr); 467 return (need); 468 469 fail: 470 if (tr) 471 free(tr); 472 return ((size_t)(-1)); 473 } 474 475 /* 476 * In the non-POSIX case, we transform each character into a string of 477 * characters representing the character's priority. Since char is usually 478 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 479 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 480 * bits per byte. 481 * 482 * It turns out that we sometimes have real priorities that are 483 * 31-bits wide. (But: be careful using priorities where the high 484 * order bit is set -- i.e. the priority is negative. The sort order 485 * may be surprising!) 486 * 487 * TODO: This would be a good area to optimize somewhat. It turns out 488 * that real prioririties *except for the last UNDEFINED pass* are generally 489 * very small. We need the localedef code to precalculate the max 490 * priority for us, and ideally also give us a mask, and then we could 491 * severely limit what we expand to. 492 */ 493 #define XFRM_BYTES 6 494 #define XFRM_OFFSET ('0') /* make all printable characters */ 495 #define XFRM_SHIFT 6 496 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 497 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 498 499 static int 500 xfrm(unsigned char *p, int pri, int pass) 501 { 502 /* we use unsigned to ensure zero fill on right shift */ 503 uint32_t val = (uint32_t)_collate_info->pri_count[pass]; 504 int nc = 0; 505 506 while (val) { 507 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 508 pri >>= XFRM_SHIFT; 509 val >>= XFRM_SHIFT; 510 p++; 511 nc++; 512 } 513 return (nc); 514 } 515 516 size_t 517 _collate_sxfrm(const wchar_t *src, char *xf, size_t room) 518 { 519 int pri; 520 int len; 521 const wchar_t *t; 522 wchar_t *tr = NULL; 523 int direc; 524 int pass; 525 int32_t *state; 526 size_t want = 0; 527 size_t need = 0; 528 int b; 529 uint8_t buf[XFRM_BYTES]; 530 531 assert(src); 532 533 for (pass = 0; pass <= _collate_info->directive_count; pass++) { 534 535 state = NULL; 536 537 if (pass != 0) { 538 /* insert level separator from the previous pass */ 539 if (room) { 540 *xf++ = XFRM_SEP; 541 room--; 542 } 543 want++; 544 } 545 546 /* special pass for undefined */ 547 if (pass == _collate_info->directive_count) { 548 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 549 } else { 550 direc = _collate_info->directive[pass]; 551 } 552 553 t = src; 554 555 if (direc & DIRECTIVE_BACKWARD) { 556 wchar_t *bp, *fp, c; 557 if (tr) 558 free(tr); 559 if ((tr = wcsdup(t)) == NULL) { 560 errno = ENOMEM; 561 goto fail; 562 } 563 bp = tr; 564 fp = tr + wcslen(tr) - 1; 565 while (bp < fp) { 566 c = *bp; 567 *bp++ = *fp; 568 *fp-- = c; 569 } 570 t = (const wchar_t *)tr; 571 } 572 573 if (direc & DIRECTIVE_POSITION) { 574 while (*t || state) { 575 576 _collate_lookup(t, &len, &pri, pass, &state); 577 t += len; 578 if (pri <= 0) { 579 if (pri < 0) { 580 errno = EINVAL; 581 goto fail; 582 } 583 pri = COLLATE_MAX_PRIORITY; 584 } 585 586 b = xfrm(buf, pri, pass); 587 want += b; 588 if (room) { 589 while (b) { 590 b--; 591 if (room) { 592 *xf++ = buf[b]; 593 room--; 594 } 595 } 596 } 597 need = want; 598 } 599 } else { 600 while (*t || state) { 601 _collate_lookup(t, &len, &pri, pass, &state); 602 t += len; 603 if (pri <= 0) { 604 if (pri < 0) { 605 errno = EINVAL; 606 goto fail; 607 } 608 continue; 609 } 610 611 b = xfrm(buf, pri, pass); 612 want += b; 613 if (room) { 614 615 while (b) { 616 b--; 617 if (room) { 618 *xf++ = buf[b]; 619 room--; 620 } 621 } 622 } 623 need = want; 624 } 625 } 626 } 627 628 end: 629 if (tr) 630 free(tr); 631 return (need); 632 633 fail: 634 if (tr) 635 free(tr); 636 return ((size_t)(-1)); 637 }