1 /* 2 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 5 * at Electronni Visti IA, Kiev, Ukraine. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include "lint.h" 31 #include "file64.h" 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <stddef.h> 35 #include <string.h> 36 #include <wchar.h> 37 #include <errno.h> 38 #include <unistd.h> 39 #include <ctype.h> 40 #include <unistd.h> 41 #include <fcntl.h> 42 #include <assert.h> 43 #include <sys/stat.h> 44 #include <sys/mman.h> 45 46 #include "collate.h" 47 #include "setlocale.h" 48 49 /* 50 * See the comments in usr/src/cmd/localedef/collate.c for further 51 * information. It would also be very helpful to have a copy of the 52 * POSIX standard for collation (in the locale format manual page) 53 * handy (www.opengroup.org). 54 */ 55 56 /* 57 * POSIX uses empty tables and falls down to strcmp. 58 */ 59 struct lc_collate lc_collate_posix = { 60 .lc_is_posix = 1, 61 }; 62 63 struct locdata __posix_collate_locdata = { 64 .l_lname = "C", 65 .l_data = { &lc_collate_posix } 66 }; 67 68 69 struct locdata * 70 __lc_collate_load(const char *locname) 71 { 72 int i, chains, z; 73 char buf[PATH_MAX]; 74 char *TMP; 75 char *map; 76 collate_info_t *info; 77 struct stat sbuf; 78 int fd; 79 struct locdata *ldata; 80 struct lc_collate *lcc; 81 82 /* 83 * Slurp the locale file into the cache. 84 */ 85 86 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA", 87 _PathLocale, locname); 88 89 if ((fd = open(buf, O_RDONLY)) < 0) { 90 errno = EINVAL; 91 return (NULL); 92 } 93 if (fstat(fd, &sbuf) < 0) { 94 (void) close(fd); 95 errno = EINVAL; 96 return (NULL); 97 } 98 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 99 (void) close(fd); 100 errno = EINVAL; 101 return (NULL); 102 } 103 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 104 (void) close(fd); 105 if ((TMP = map) == NULL) { 106 errno = EINVAL; 107 return (NULL); 108 } 109 110 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 111 (void) munmap(map, sbuf.st_size); 112 errno = EINVAL; 113 return (NULL); 114 } 115 TMP += COLLATE_STR_LEN; 116 117 info = (void *)TMP; 118 TMP += sizeof (*info); 119 120 if ((info->directive_count < 1) || 121 (info->directive_count >= COLL_WEIGHTS_MAX) || 122 ((chains = info->chain_count) < 0)) { 123 (void) munmap(map, sbuf.st_size); 124 errno = EINVAL; 125 return (NULL); 126 } 127 128 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 129 (sizeof (collate_chain_t) * chains) + 130 (sizeof (collate_large_t) * info->large_count); 131 for (z = 0; z < info->directive_count; z++) { 132 i += sizeof (collate_subst_t) * info->subst_count[z]; 133 } 134 if (i != (sbuf.st_size - (TMP - map))) { 135 (void) munmap(map, sbuf.st_size); 136 errno = EINVAL; 137 return (NULL); 138 } 139 140 141 if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) { 142 (void) munmap(map, sbuf.st_size); 143 return (NULL); 144 } 145 lcc = ldata->l_data[0]; 146 ldata->l_map = map; 147 ldata->l_map_len = sbuf.st_size; 148 149 lcc->lc_info = info; 150 lcc->lc_directive_count = info->directive_count; 151 lcc->lc_large_count = info->large_count; 152 153 for (z = 0; z < COLL_WEIGHTS_MAX; z++) { 154 lcc->lc_directive[z] = info->directive[z]; 155 lcc->lc_subst_count[z] = info->subst_count[z]; 156 lcc->lc_pri_count[z] = info->pri_count[z]; 157 lcc->lc_undef_pri[z] = info->undef_pri[z]; 158 } 159 160 lcc->lc_char_table = (void *)TMP; 161 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 162 163 for (z = 0; z < lcc->lc_directive_count; z++) { 164 int count; 165 if ((count = lcc->lc_subst_count[z]) > 0) { 166 lcc->lc_subst_table[z] = (void *)TMP; 167 TMP += count * sizeof (collate_subst_t); 168 } else { 169 lcc->lc_subst_table[z] = NULL; 170 } 171 } 172 173 if (chains > 0) { 174 lcc->lc_chain_table = (void *)TMP; 175 TMP += chains * sizeof (collate_chain_t); 176 } else 177 lcc->lc_chain_table = NULL; 178 lcc->lc_chain_count = chains; 179 if (lcc->lc_large_count > 0) 180 lcc->lc_large_table = (void *)TMP; 181 else 182 lcc->lc_large_table = NULL; 183 184 return (ldata); 185 } 186 187 static const int32_t * 188 substsearch(const struct lc_collate *lcc, const wchar_t key, int pass) 189 { 190 const collate_subst_t *p; 191 int n = lcc->lc_subst_count[pass]; 192 193 if (n == 0) 194 return (NULL); 195 196 if (pass >= lcc->lc_directive_count) 197 return (NULL); 198 199 if (!(key & COLLATE_SUBST_PRIORITY)) 200 return (NULL); 201 202 p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 203 assert(p->key == key); 204 return (p->pri); 205 } 206 207 /* 208 * Note: for performance reasons, we have expanded bsearch here. This avoids 209 * function call overhead with each comparison. 210 */ 211 212 static collate_chain_t * 213 chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len) 214 { 215 int low; 216 int high; 217 int next, compar, l; 218 collate_chain_t *p; 219 collate_chain_t *tab; 220 221 if (lcc->lc_info->chain_count == 0) 222 return (NULL); 223 224 low = 0; 225 high = lcc->lc_info->chain_count - 1; 226 tab = lcc->lc_chain_table; 227 228 while (low <= high) { 229 next = (low + high) / 2; 230 p = tab + next; 231 compar = *key - *p->str; 232 if (compar == 0) { 233 l = wcsnlen(p->str, COLLATE_STR_LEN); 234 compar = wcsncmp(key, p->str, l); 235 if (compar == 0) { 236 *len = l; 237 return (p); 238 } 239 } 240 if (compar > 0) 241 low = next + 1; 242 else 243 high = next - 1; 244 } 245 return (NULL); 246 } 247 248 static collate_large_t * 249 largesearch(const struct lc_collate *lcc, const wchar_t key) 250 { 251 int low = 0; 252 int high = lcc->lc_info->large_count - 1; 253 int next, compar; 254 collate_large_t *p; 255 collate_large_t *tab = lcc->lc_large_table; 256 257 if (lcc->lc_info->large_count == 0) 258 return (NULL); 259 260 while (low <= high) { 261 next = (low + high) / 2; 262 p = tab + next; 263 compar = key - p->val; 264 if (compar == 0) 265 return (p); 266 if (compar > 0) 267 low = next + 1; 268 else 269 high = next - 1; 270 } 271 return (NULL); 272 } 273 274 void 275 _collate_lookup(const struct lc_collate *lcc, const wchar_t *t, 276 int *len, int *pri, int which, const int **state) 277 { 278 collate_chain_t *p2; 279 collate_large_t *match; 280 int p, l; 281 const int *sptr; 282 283 /* 284 * If this is the "last" pass for the UNDEFINED, then 285 * we just return the priority itself. 286 */ 287 if (which >= lcc->lc_directive_count) { 288 *pri = *t; 289 *len = 1; 290 *state = NULL; 291 return; 292 } 293 294 /* 295 * If we have remaining substitution data from a previous 296 * call, consume it first. 297 */ 298 if ((sptr = *state) != NULL) { 299 *pri = *sptr; 300 sptr++; 301 *state = *sptr ? sptr : NULL; 302 *len = 0; 303 return; 304 } 305 306 /* No active substitutions */ 307 *len = 1; 308 309 /* 310 * Check for composites such as dipthongs that collate as a 311 * single element (aka chains or collating-elements). 312 */ 313 if (((p2 = chainsearch(lcc, t, &l)) != NULL) && 314 ((p = p2->pri[which]) >= 0)) { 315 316 *len = l; 317 *pri = p; 318 319 } else if (*t <= UCHAR_MAX) { 320 321 /* 322 * Character is a small (8-bit) character. 323 * We just look these up directly for speed. 324 */ 325 *pri = lcc->lc_char_table[*t].pri[which]; 326 327 } else if ((lcc->lc_info->large_count > 0) && 328 ((match = largesearch(lcc, *t)) != NULL)) { 329 330 /* 331 * Character was found in the extended table. 332 */ 333 *pri = match->pri.pri[which]; 334 335 } else { 336 /* 337 * Character lacks a specific definition. 338 */ 339 if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) { 340 /* Mask off sign bit to prevent ordering confusion. */ 341 *pri = (*t & COLLATE_MAX_PRIORITY); 342 } else { 343 *pri = lcc->lc_undef_pri[which]; 344 } 345 /* No substitutions for undefined characters! */ 346 return; 347 } 348 349 /* 350 * Try substituting (expanding) the character. We are 351 * currently doing this *after* the chain compression. I 352 * think it should not matter, but this way might be slightly 353 * faster. 354 * 355 * We do this after the priority search, as this will help us 356 * to identify a single key value. In order for this to work, 357 * its important that the priority assigned to a given element 358 * to be substituted be unique for that level. The localedef 359 * code ensures this for us. 360 */ 361 if ((sptr = substsearch(lcc, *pri, which)) != NULL) { 362 if ((*pri = *sptr) != 0) { 363 sptr++; 364 *state = *sptr ? sptr : NULL; 365 } 366 } 367 368 } 369 370 /* 371 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 372 * NOT NULL terminate. That is left to the caller. 373 */ 374 size_t 375 _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf, 376 size_t room) 377 { 378 int pri; 379 int len; 380 const wchar_t *t; 381 wchar_t *tr = NULL; 382 int direc; 383 int pass; 384 const int32_t *state; 385 size_t want = 0; 386 size_t need = 0; 387 int ndir = lcc->lc_directive_count; 388 389 assert(src); 390 391 for (pass = 0; pass <= ndir; pass++) { 392 393 state = NULL; 394 395 if (pass != 0) { 396 /* insert level separator from the previous pass */ 397 if (room) { 398 *xf++ = 1; 399 room--; 400 } 401 want++; 402 } 403 404 /* special pass for undefined */ 405 if (pass == ndir) { 406 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 407 } else { 408 direc = lcc->lc_directive[pass]; 409 } 410 411 t = src; 412 413 if (direc & DIRECTIVE_BACKWARD) { 414 wchar_t *bp, *fp, c; 415 if (tr) 416 free(tr); 417 if ((tr = wcsdup(t)) == NULL) { 418 errno = ENOMEM; 419 goto fail; 420 } 421 bp = tr; 422 fp = tr + wcslen(tr) - 1; 423 while (bp < fp) { 424 c = *bp; 425 *bp++ = *fp; 426 *fp-- = c; 427 } 428 t = (const wchar_t *)tr; 429 } 430 431 if (direc & DIRECTIVE_POSITION) { 432 while (*t || state) { 433 _collate_lookup(lcc, t, &len, &pri, pass, 434 &state); 435 t += len; 436 if (pri <= 0) { 437 if (pri < 0) { 438 errno = EINVAL; 439 goto fail; 440 } 441 pri = COLLATE_MAX_PRIORITY; 442 } 443 if (room) { 444 *xf++ = pri; 445 room--; 446 } 447 want++; 448 need = want; 449 } 450 } else { 451 while (*t || state) { 452 _collate_lookup(lcc, t, &len, &pri, pass, 453 &state); 454 t += len; 455 if (pri <= 0) { 456 if (pri < 0) { 457 errno = EINVAL; 458 goto fail; 459 } 460 continue; 461 } 462 if (room) { 463 *xf++ = pri; 464 room--; 465 } 466 want++; 467 need = want; 468 } 469 } 470 } 471 472 end: 473 if (tr) 474 free(tr); 475 return (need); 476 477 fail: 478 if (tr) 479 free(tr); 480 return ((size_t)(-1)); 481 } 482 483 /* 484 * In the non-POSIX case, we transform each character into a string of 485 * characters representing the character's priority. Since char is usually 486 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 487 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 488 * bits per byte. 489 * 490 * It turns out that we sometimes have real priorities that are 491 * 31-bits wide. (But: be careful using priorities where the high 492 * order bit is set -- i.e. the priority is negative. The sort order 493 * may be surprising!) 494 * 495 * TODO: This would be a good area to optimize somewhat. It turns out 496 * that real prioririties *except for the last UNDEFINED pass* are generally 497 * very small. We need the localedef code to precalculate the max 498 * priority for us, and ideally also give us a mask, and then we could 499 * severely limit what we expand to. 500 */ 501 #define XFRM_BYTES 6 502 #define XFRM_OFFSET ('0') /* make all printable characters */ 503 #define XFRM_SHIFT 6 504 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 505 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 506 507 static int 508 xfrm(locale_t loc, unsigned char *p, int pri, int pass) 509 { 510 /* we use unsigned to ensure zero fill on right shift */ 511 uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass]; 512 int nc = 0; 513 514 while (val) { 515 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 516 pri >>= XFRM_SHIFT; 517 val >>= XFRM_SHIFT; 518 p++; 519 nc++; 520 } 521 return (nc); 522 } 523 524 size_t 525 _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc) 526 { 527 int pri; 528 int len; 529 const wchar_t *t; 530 wchar_t *tr = NULL; 531 int direc; 532 int pass; 533 const int32_t *state; 534 size_t want = 0; 535 size_t need = 0; 536 int b; 537 uint8_t buf[XFRM_BYTES]; 538 const struct lc_collate *lcc = loc->collate; 539 int ndir = lcc->lc_directive_count; 540 541 assert(src); 542 543 for (pass = 0; pass <= ndir; pass++) { 544 545 state = NULL; 546 547 if (pass != 0) { 548 /* insert level separator from the previous pass */ 549 if (room) { 550 *xf++ = XFRM_SEP; 551 room--; 552 } 553 want++; 554 } 555 556 /* special pass for undefined */ 557 if (pass == ndir) { 558 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 559 } else { 560 direc = lcc->lc_directive[pass]; 561 } 562 563 t = src; 564 565 if (direc & DIRECTIVE_BACKWARD) { 566 wchar_t *bp, *fp, c; 567 if (tr) 568 free(tr); 569 if ((tr = wcsdup(t)) == NULL) { 570 errno = ENOMEM; 571 goto fail; 572 } 573 bp = tr; 574 fp = tr + wcslen(tr) - 1; 575 while (bp < fp) { 576 c = *bp; 577 *bp++ = *fp; 578 *fp-- = c; 579 } 580 t = (const wchar_t *)tr; 581 } 582 583 if (direc & DIRECTIVE_POSITION) { 584 while (*t || state) { 585 586 _collate_lookup(lcc, t, &len, &pri, pass, 587 &state); 588 t += len; 589 if (pri <= 0) { 590 if (pri < 0) { 591 errno = EINVAL; 592 goto fail; 593 } 594 pri = COLLATE_MAX_PRIORITY; 595 } 596 597 b = xfrm(loc, buf, pri, pass); 598 want += b; 599 if (room) { 600 while (b) { 601 b--; 602 if (room) { 603 *xf++ = buf[b]; 604 room--; 605 } 606 } 607 } 608 need = want; 609 } 610 } else { 611 while (*t || state) { 612 _collate_lookup(lcc, t, &len, &pri, pass, 613 &state); 614 t += len; 615 if (pri <= 0) { 616 if (pri < 0) { 617 errno = EINVAL; 618 goto fail; 619 } 620 continue; 621 } 622 623 b = xfrm(loc, buf, pri, pass); 624 want += b; 625 if (room) { 626 627 while (b) { 628 b--; 629 if (room) { 630 *xf++ = buf[b]; 631 room--; 632 } 633 } 634 } 635 need = want; 636 } 637 } 638 } 639 640 end: 641 if (tr) 642 free(tr); 643 return (need); 644 645 fail: 646 if (tr) 647 free(tr); 648 return ((size_t)(-1)); 649 }