1 /*
   2  * Copright 2010 Nexenta Systems, Inc.  All rights reserved.
   3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
   4  *              at Electronni Visti IA, Kiev, Ukraine.
   5  *                      All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28 
  29 #include "lint.h"
  30 #include "file64.h"
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <stddef.h>
  34 #include <string.h>
  35 #include <wchar.h>
  36 #include <errno.h>
  37 #include <unistd.h>
  38 #include <ctype.h>
  39 #include <unistd.h>
  40 #include <fcntl.h>
  41 #include <assert.h>
  42 #include <sys/stat.h>
  43 #include <sys/mman.h>
  44 
  45 #include "collate.h"
  46 #include "setlocale.h"
  47 #include "ldpart.h"
  48 
  49 /*
  50  * See the comments in usr/src/cmd/localedef/collate.c for further
  51  * information.  It would also be very helpful to have a copy of the
  52  * POSIX standard for collation (in the locale format manual page)
  53  * handy (www.opengroup.org).
  54  */
  55 
  56 static collate_subst_t          *subst_table[COLL_WEIGHTS_MAX];
  57 static collate_char_t           *char_pri_table;
  58 static collate_large_t          *large_pri_table;
  59 static collate_chain_t          *chain_pri_table;
  60 static char                     *cache = NULL;
  61 static size_t                   cachesz;
  62 static char                     collate_encoding[ENCODING_LEN + 1];
  63 
  64 /* Exposed externally to other parts of libc. */
  65 collate_info_t                  *_collate_info;
  66 int _collate_load_error = 1;
  67 
  68 struct xlocale_collate __xlocale_global_collate = {
  69         {{0}, "C"}, 1, 0
  70 };
  71 
  72 struct xlocale_collate __xlocale_C_collate = {
  73         {{0}, "C"}, 1, 0
  74 };
  75 
  76 static void
  77 destruct_collate(void *t)
  78 {
  79         struct xlocale_collate *table = t;
  80 
  81         /* XXX */;
  82 }
  83 
  84 void *
  85 __collate_load(const char *encoding, locale_t unused)
  86 {
  87         struct xlocale_collate *table;
  88 
  89         if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
  90                 return &__xlocale_C_collate;
  91         }
  92 
  93         table = calloc(sizeof(struct xlocale_collate), 1);
  94         if (table == NULL) {
  95                 /* XXX */
  96         }
  97 
  98         return (table);
  99 }
 100 
 101 int
 102 _collate_load_tables(const char *encoding)
 103 {
 104         int i, chains, z;
 105         char buf[PATH_MAX];
 106         char *TMP;
 107         char *map;
 108         collate_info_t *info;
 109         struct stat sbuf;
 110         int fd;
 111 
 112         /* 'encoding' must be already checked. */
 113         if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
 114                 _collate_load_error = 1;
 115                 return (_LDP_CACHE);
 116         }
 117 
 118         /*
 119          * If the locale name is the same as our cache, use the cache.
 120          */
 121         if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) {
 122                 _collate_load_error = 0;
 123                 return (_LDP_CACHE);
 124         }
 125 
 126         /*
 127          * Slurp the locale file into the cache.
 128          */
 129 
 130         (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
 131             _PathLocale, encoding);
 132 
 133         if ((fd = open(buf, O_RDONLY)) < 0)
 134                 return (_LDP_ERROR);
 135         if (fstat(fd, &sbuf) < 0) {
 136                 (void) close(fd);
 137                 return (_LDP_ERROR);
 138         }
 139         if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
 140                 (void) close(fd);
 141                 errno = EINVAL;
 142                 return (_LDP_ERROR);
 143         }
 144         map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
 145         (void) close(fd);
 146         if ((TMP = map) == NULL) {
 147                 return (_LDP_ERROR);
 148         }
 149 
 150         if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
 151                 (void) munmap(map, sbuf.st_size);
 152                 errno = EINVAL;
 153                 return (_LDP_ERROR);
 154         }
 155         TMP += COLLATE_STR_LEN;
 156 
 157         info = (void *)TMP;
 158         TMP += sizeof (*info);
 159 
 160         if ((info->directive_count < 1) ||
 161             (info->directive_count >= COLL_WEIGHTS_MAX) ||
 162             ((chains = info->chain_count) < 0)) {
 163                 (void) munmap(map, sbuf.st_size);
 164                 errno = EINVAL;
 165                 return (_LDP_ERROR);
 166         }
 167 
 168         i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
 169             (sizeof (collate_chain_t) * chains) +
 170             (sizeof (collate_large_t) * info->large_count);
 171         for (z = 0; z < (info->directive_count); z++) {
 172                 i += sizeof (collate_subst_t) * info->subst_count[z];
 173         }
 174         if (i != (sbuf.st_size - (TMP - map))) {
 175                 (void) munmap(map, sbuf.st_size);
 176                 errno = EINVAL;
 177                 return (_LDP_ERROR);
 178         }
 179 
 180         char_pri_table = (void *)TMP;
 181         TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
 182 
 183         for (z = 0; z < info->directive_count; z++) {
 184                 if (info->subst_count[z] > 0) {
 185                         subst_table[z] = (void *)TMP;
 186                         TMP += info->subst_count[z] * sizeof (collate_subst_t);
 187                 } else {
 188                         subst_table[z] = NULL;
 189                 }
 190         }
 191 
 192         if (chains > 0) {
 193                 chain_pri_table = (void *)TMP;
 194                 TMP += chains * sizeof (collate_chain_t);
 195         } else
 196                 chain_pri_table = NULL;
 197         if (info->large_count > 0)
 198                 large_pri_table = (void *)TMP;
 199         else
 200                 large_pri_table = NULL;
 201 
 202         (void) strlcpy(collate_encoding, encoding, ENCODING_LEN);
 203         _collate_info = info;
 204 
 205         if (cache)
 206                 (void) munmap(cache, cachesz);
 207 
 208         cache = map;
 209         cachesz = sbuf.st_size;
 210         _collate_load_error = 0;
 211 
 212         return (_LDP_LOADED);
 213 }
 214 
 215 static int32_t *
 216 substsearch(const wchar_t key, int pass)
 217 {
 218         collate_subst_t *p;
 219         int n = _collate_info->subst_count[pass];
 220 
 221         if (n == 0)
 222                 return (NULL);
 223 
 224         if (pass >= _collate_info->directive_count)
 225                 return (NULL);
 226 
 227         if (!(key & COLLATE_SUBST_PRIORITY))
 228                 return (NULL);
 229 
 230         p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
 231         assert(p->key == key);
 232         return (p->pri);
 233 }
 234 
 235 /*
 236  * Note: for performance reasons, we have expanded bsearch here.  This avoids
 237  * function call overhead with each comparison.
 238  */
 239 
 240 static collate_chain_t *
 241 chainsearch(const wchar_t *key, int *len)
 242 {
 243         int low;
 244         int high;
 245         int next, compar, l;
 246         collate_chain_t *p;
 247         collate_chain_t *tab;
 248 
 249         if (_collate_info->chain_count == 0)
 250                 return (NULL);
 251 
 252         low = 0;
 253         high = _collate_info->chain_count - 1;
 254         tab = chain_pri_table;
 255 
 256         while (low <= high) {
 257                 next = (low + high) / 2;
 258                 p = tab + next;
 259                 compar = *key - *p->str;
 260                 if (compar == 0) {
 261                         l = wcsnlen(p->str, COLLATE_STR_LEN);
 262                         compar = wcsncmp(key, p->str, l);
 263                         if (compar == 0) {
 264                                 *len = l;
 265                                 return (p);
 266                         }
 267                 }
 268                 if (compar > 0)
 269                         low = next + 1;
 270                 else
 271                         high = next - 1;
 272         }
 273         return (NULL);
 274 }
 275 
 276 static collate_large_t *
 277 largesearch(const wchar_t key)
 278 {
 279         int low = 0;
 280         int high = _collate_info->large_count - 1;
 281         int next, compar;
 282         collate_large_t *p;
 283         collate_large_t *tab = large_pri_table;
 284 
 285         if (_collate_info->large_count == 0)
 286                 return (NULL);
 287 
 288         while (low <= high) {
 289                 next = (low + high) / 2;
 290                 p = tab + next;
 291                 compar = key - p->val;
 292                 if (compar == 0)
 293                         return (p);
 294                 if (compar > 0)
 295                         low = next + 1;
 296                 else
 297                         high = next - 1;
 298         }
 299         return (NULL);
 300 }
 301 
 302 void
 303 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state)
 304 {
 305         collate_chain_t *p2;
 306         collate_large_t *match;
 307         collate_info_t *info = _collate_info;
 308         int p, l;
 309         int *sptr;
 310 
 311         /*
 312          * If this is the "last" pass for the UNDEFINED, then
 313          * we just return the priority itself.
 314          */
 315         if (which >= info->directive_count) {
 316                 *pri = *t;
 317                 *len = 1;
 318                 *state = NULL;
 319                 return;
 320         }
 321 
 322         /*
 323          * If we have remaining substitution data from a previous
 324          * call, consume it first.
 325          */
 326         if ((sptr = *state) != NULL) {
 327                 *pri = *sptr;
 328                 sptr++;
 329                 *state = *sptr ? sptr : NULL;
 330                 *len = 0;
 331                 return;
 332         }
 333 
 334         /* No active substitutions */
 335         *len = 1;
 336 
 337         /*
 338          * Check for composites such as dipthongs that collate as a
 339          * single element (aka chains or collating-elements).
 340          */
 341         if (((p2 = chainsearch(t, &l)) != NULL) &&
 342             ((p = p2->pri[which]) >= 0)) {
 343 
 344                 *len = l;
 345                 *pri = p;
 346 
 347         } else if (*t <= UCHAR_MAX) {
 348 
 349                 /*
 350                  * Character is a small (8-bit) character.
 351                  * We just look these up directly for speed.
 352                  */
 353                 *pri = char_pri_table[*t].pri[which];
 354 
 355         } else if ((info->large_count > 0) &&
 356             ((match = largesearch(*t)) != NULL)) {
 357 
 358                 /*
 359                  * Character was found in the extended table.
 360                  */
 361                 *pri = match->pri.pri[which];
 362 
 363         } else {
 364                 /*
 365                  * Character lacks a specific definition.
 366                  */
 367                 if (info->directive[which] & DIRECTIVE_UNDEFINED) {
 368                         /* Mask off sign bit to prevent ordering confusion. */
 369                         *pri = (*t & COLLATE_MAX_PRIORITY);
 370                 } else {
 371                         *pri = info->undef_pri[which];
 372                 }
 373                 /* No substitutions for undefined characters! */
 374                 return;
 375         }
 376 
 377         /*
 378          * Try substituting (expanding) the character.  We are
 379          * currently doing this *after* the chain compression.  I
 380          * think it should not matter, but this way might be slightly
 381          * faster.
 382          *
 383          * We do this after the priority search, as this will help us
 384          * to identify a single key value.  In order for this to work,
 385          * its important that the priority assigned to a given element
 386          * to be substituted be unique for that level.  The localedef
 387          * code ensures this for us.
 388          */
 389         if ((sptr = substsearch(*pri, which)) != NULL) {
 390                 if ((*pri = *sptr) != 0) {
 391                         sptr++;
 392                         *state = *sptr ? sptr : NULL;
 393                 }
 394         }
 395 
 396 }
 397 
 398 /*
 399  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
 400  * NOT NULL terminate.  That is left to the caller.
 401  */
 402 size_t
 403 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room)
 404 {
 405         int             pri;
 406         int             len;
 407         const wchar_t   *t;
 408         wchar_t         *tr = NULL;
 409         int             direc;
 410         int             pass;
 411         int32_t         *state;
 412         size_t          want = 0;
 413         size_t          need = 0;
 414 
 415         assert(src);
 416 
 417         for (pass = 0; pass <= _collate_info->directive_count; pass++) {
 418 
 419                 state = NULL;
 420 
 421                 if (pass != 0) {
 422                         /* insert level separator from the previous pass */
 423                         if (room) {
 424                                 *xf++ = 1;
 425                                 room--;
 426                         }
 427                         want++;
 428                 }
 429 
 430                 /* special pass for undefined */
 431                 if (pass == _collate_info->directive_count) {
 432                         direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
 433                 } else {
 434                         direc = _collate_info->directive[pass];
 435                 }
 436 
 437                 t = src;
 438 
 439                 if (direc & DIRECTIVE_BACKWARD) {
 440                         wchar_t *bp, *fp, c;
 441                         if (tr)
 442                                 free(tr);
 443                         if ((tr = wcsdup(t)) == NULL) {
 444                                 errno = ENOMEM;
 445                                 goto fail;
 446                         }
 447                         bp = tr;
 448                         fp = tr + wcslen(tr) - 1;
 449                         while (bp < fp) {
 450                                 c = *bp;
 451                                 *bp++ = *fp;
 452                                 *fp-- = c;
 453                         }
 454                         t = (const wchar_t *)tr;
 455                 }
 456 
 457                 if (direc & DIRECTIVE_POSITION) {
 458                         while (*t || state) {
 459                                 _collate_lookup(t, &len, &pri, pass, &state);
 460                                 t += len;
 461                                 if (pri <= 0) {
 462                                         if (pri < 0) {
 463                                                 errno = EINVAL;
 464                                                 goto fail;
 465                                         }
 466                                         pri = COLLATE_MAX_PRIORITY;
 467                                 }
 468                                 if (room) {
 469                                         *xf++ = pri;
 470                                         room--;
 471                                 }
 472                                 want++;
 473                                 need = want;
 474                         }
 475                 } else {
 476                         while (*t || state) {
 477                                 _collate_lookup(t, &len, &pri, pass, &state);
 478                                 t += len;
 479                                 if (pri <= 0) {
 480                                         if (pri < 0) {
 481                                                 errno = EINVAL;
 482                                                 goto fail;
 483                                         }
 484                                         continue;
 485                                 }
 486                                 if (room) {
 487                                         *xf++ = pri;
 488                                         room--;
 489                                 }
 490                                 want++;
 491                                 need = want;
 492                         }
 493                 }
 494         }
 495 
 496 end:
 497         if (tr)
 498                 free(tr);
 499         return (need);
 500 
 501 fail:
 502         if (tr)
 503                 free(tr);
 504         return ((size_t)(-1));
 505 }
 506 
 507 /*
 508  * In the non-POSIX case, we transform each character into a string of
 509  * characters representing the character's priority.  Since char is usually
 510  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
 511  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
 512  * bits per byte.
 513  *
 514  * It turns out that we sometimes have real priorities that are
 515  * 31-bits wide.  (But: be careful using priorities where the high
 516  * order bit is set -- i.e. the priority is negative.  The sort order
 517  * may be surprising!)
 518  *
 519  * TODO: This would be a good area to optimize somewhat.  It turns out
 520  * that real prioririties *except for the last UNDEFINED pass* are generally
 521  * very small.  We need the localedef code to precalculate the max
 522  * priority for us, and ideally also give us a mask, and then we could
 523  * severely limit what we expand to.
 524  */
 525 #define XFRM_BYTES      6
 526 #define XFRM_OFFSET     ('0')   /* make all printable characters */
 527 #define XFRM_SHIFT      6
 528 #define XFRM_MASK       ((1 << XFRM_SHIFT) - 1)
 529 #define XFRM_SEP        ('.')   /* chosen to be less than XFRM_OFFSET */
 530 
 531 static int
 532 xfrm(unsigned char *p, int pri, int pass)
 533 {
 534         /* we use unsigned to ensure zero fill on right shift */
 535         uint32_t val = (uint32_t)_collate_info->pri_count[pass];
 536         int nc = 0;
 537 
 538         while (val) {
 539                 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
 540                 pri >>= XFRM_SHIFT;
 541                 val >>= XFRM_SHIFT;
 542                 p++;
 543                 nc++;
 544         }
 545         return (nc);
 546 }
 547 
 548 size_t
 549 _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
 550 {
 551         int             pri;
 552         int             len;
 553         const wchar_t   *t;
 554         wchar_t         *tr = NULL;
 555         int             direc;
 556         int             pass;
 557         int32_t         *state;
 558         size_t          want = 0;
 559         size_t          need = 0;
 560         int             b;
 561         uint8_t         buf[XFRM_BYTES];
 562 
 563         assert(src);
 564 
 565         for (pass = 0; pass <= _collate_info->directive_count; pass++) {
 566 
 567                 state = NULL;
 568 
 569                 if (pass != 0) {
 570                         /* insert level separator from the previous pass */
 571                         if (room) {
 572                                 *xf++ = XFRM_SEP;
 573                                 room--;
 574                         }
 575                         want++;
 576                 }
 577 
 578                 /* special pass for undefined */
 579                 if (pass == _collate_info->directive_count) {
 580                         direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
 581                 } else {
 582                         direc = _collate_info->directive[pass];
 583                 }
 584 
 585                 t = src;
 586 
 587                 if (direc & DIRECTIVE_BACKWARD) {
 588                         wchar_t *bp, *fp, c;
 589                         if (tr)
 590                                 free(tr);
 591                         if ((tr = wcsdup(t)) == NULL) {
 592                                 errno = ENOMEM;
 593                                 goto fail;
 594                         }
 595                         bp = tr;
 596                         fp = tr + wcslen(tr) - 1;
 597                         while (bp < fp) {
 598                                 c = *bp;
 599                                 *bp++ = *fp;
 600                                 *fp-- = c;
 601                         }
 602                         t = (const wchar_t *)tr;
 603                 }
 604 
 605                 if (direc & DIRECTIVE_POSITION) {
 606                         while (*t || state) {
 607 
 608                                 _collate_lookup(t, &len, &pri, pass, &state);
 609                                 t += len;
 610                                 if (pri <= 0) {
 611                                         if (pri < 0) {
 612                                                 errno = EINVAL;
 613                                                 goto fail;
 614                                         }
 615                                         pri = COLLATE_MAX_PRIORITY;
 616                                 }
 617 
 618                                 b = xfrm(buf, pri, pass);
 619                                 want += b;
 620                                 if (room) {
 621                                         while (b) {
 622                                                 b--;
 623                                                 if (room) {
 624                                                         *xf++ = buf[b];
 625                                                         room--;
 626                                                 }
 627                                         }
 628                                 }
 629                                 need = want;
 630                         }
 631                 } else {
 632                         while (*t || state) {
 633                                 _collate_lookup(t, &len, &pri, pass, &state);
 634                                 t += len;
 635                                 if (pri <= 0) {
 636                                         if (pri < 0) {
 637                                                 errno = EINVAL;
 638                                                 goto fail;
 639                                         }
 640                                         continue;
 641                                 }
 642 
 643                                 b = xfrm(buf, pri, pass);
 644                                 want += b;
 645                                 if (room) {
 646 
 647                                         while (b) {
 648                                                 b--;
 649                                                 if (room) {
 650                                                         *xf++ = buf[b];
 651                                                         room--;
 652                                                 }
 653                                         }
 654                                 }
 655                                 need = want;
 656                         }
 657                 }
 658         }
 659 
 660 end:
 661         if (tr)
 662                 free(tr);
 663         return (need);
 664 
 665 fail:
 666         if (tr)
 667                 free(tr);
 668         return ((size_t)(-1));
 669 }