1 /*
2 * Copright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4 * at Electronni Visti IA, Kiev, Ukraine.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include "lint.h"
30 #include "file64.h"
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <stddef.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include <errno.h>
37 #include <unistd.h>
38 #include <ctype.h>
39 #include <unistd.h>
40 #include <fcntl.h>
41 #include <assert.h>
42 #include <sys/stat.h>
43 #include <sys/mman.h>
44
45 #include "collate.h"
46 #include "setlocale.h"
47 #include "ldpart.h"
48
49 /*
50 * See the comments in usr/src/cmd/localedef/collate.c for further
51 * information. It would also be very helpful to have a copy of the
52 * POSIX standard for collation (in the locale format manual page)
53 * handy (www.opengroup.org).
54 */
55
56 static collate_subst_t *subst_table[COLL_WEIGHTS_MAX];
57 static collate_char_t *char_pri_table;
58 static collate_large_t *large_pri_table;
59 static collate_chain_t *chain_pri_table;
60 static char *cache = NULL;
61 static size_t cachesz;
62 static char collate_encoding[ENCODING_LEN + 1];
63
64 /* Exposed externally to other parts of libc. */
65 collate_info_t *_collate_info;
66 int _collate_load_error = 1;
67
68 struct xlocale_collate __xlocale_global_collate = {
69 {{0}, "C"}, 1, 0
70 };
71
72 struct xlocale_collate __xlocale_C_collate = {
73 {{0}, "C"}, 1, 0
74 };
75
76 static void
77 destruct_collate(void *t)
78 {
79 struct xlocale_collate *table = t;
80
81 /* XXX */;
82 }
83
84 void *
85 __collate_load(const char *encoding, locale_t unused)
86 {
87 struct xlocale_collate *table;
88
89 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
90 return &__xlocale_C_collate;
91 }
92
93 table = calloc(sizeof(struct xlocale_collate), 1);
94 if (table == NULL) {
95 /* XXX */
96 }
97
98 return (table);
99 }
100
101 int
102 _collate_load_tables(const char *encoding)
103 {
104 int i, chains, z;
105 char buf[PATH_MAX];
106 char *TMP;
107 char *map;
108 collate_info_t *info;
109 struct stat sbuf;
110 int fd;
111
112 /* 'encoding' must be already checked. */
113 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
114 _collate_load_error = 1;
115 return (_LDP_CACHE);
116 }
117
118 /*
119 * If the locale name is the same as our cache, use the cache.
120 */
121 if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) {
122 _collate_load_error = 0;
123 return (_LDP_CACHE);
124 }
125
126 /*
127 * Slurp the locale file into the cache.
128 */
129
130 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
131 _PathLocale, encoding);
132
133 if ((fd = open(buf, O_RDONLY)) < 0)
134 return (_LDP_ERROR);
135 if (fstat(fd, &sbuf) < 0) {
136 (void) close(fd);
137 return (_LDP_ERROR);
138 }
139 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
140 (void) close(fd);
141 errno = EINVAL;
142 return (_LDP_ERROR);
143 }
144 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
145 (void) close(fd);
146 if ((TMP = map) == NULL) {
147 return (_LDP_ERROR);
148 }
149
150 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
151 (void) munmap(map, sbuf.st_size);
152 errno = EINVAL;
153 return (_LDP_ERROR);
154 }
155 TMP += COLLATE_STR_LEN;
156
157 info = (void *)TMP;
158 TMP += sizeof (*info);
159
160 if ((info->directive_count < 1) ||
161 (info->directive_count >= COLL_WEIGHTS_MAX) ||
162 ((chains = info->chain_count) < 0)) {
163 (void) munmap(map, sbuf.st_size);
164 errno = EINVAL;
165 return (_LDP_ERROR);
166 }
167
168 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
169 (sizeof (collate_chain_t) * chains) +
170 (sizeof (collate_large_t) * info->large_count);
171 for (z = 0; z < (info->directive_count); z++) {
172 i += sizeof (collate_subst_t) * info->subst_count[z];
173 }
174 if (i != (sbuf.st_size - (TMP - map))) {
175 (void) munmap(map, sbuf.st_size);
176 errno = EINVAL;
177 return (_LDP_ERROR);
178 }
179
180 char_pri_table = (void *)TMP;
181 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
182
183 for (z = 0; z < info->directive_count; z++) {
184 if (info->subst_count[z] > 0) {
185 subst_table[z] = (void *)TMP;
186 TMP += info->subst_count[z] * sizeof (collate_subst_t);
187 } else {
188 subst_table[z] = NULL;
189 }
190 }
191
192 if (chains > 0) {
193 chain_pri_table = (void *)TMP;
194 TMP += chains * sizeof (collate_chain_t);
195 } else
196 chain_pri_table = NULL;
197 if (info->large_count > 0)
198 large_pri_table = (void *)TMP;
199 else
200 large_pri_table = NULL;
201
202 (void) strlcpy(collate_encoding, encoding, ENCODING_LEN);
203 _collate_info = info;
204
205 if (cache)
206 (void) munmap(cache, cachesz);
207
208 cache = map;
209 cachesz = sbuf.st_size;
210 _collate_load_error = 0;
211
212 return (_LDP_LOADED);
213 }
214
215 static int32_t *
216 substsearch(const wchar_t key, int pass)
217 {
218 collate_subst_t *p;
219 int n = _collate_info->subst_count[pass];
220
221 if (n == 0)
222 return (NULL);
223
224 if (pass >= _collate_info->directive_count)
225 return (NULL);
226
227 if (!(key & COLLATE_SUBST_PRIORITY))
228 return (NULL);
229
230 p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
231 assert(p->key == key);
232 return (p->pri);
233 }
234
235 /*
236 * Note: for performance reasons, we have expanded bsearch here. This avoids
237 * function call overhead with each comparison.
238 */
239
240 static collate_chain_t *
241 chainsearch(const wchar_t *key, int *len)
242 {
243 int low;
244 int high;
245 int next, compar, l;
246 collate_chain_t *p;
247 collate_chain_t *tab;
248
249 if (_collate_info->chain_count == 0)
250 return (NULL);
251
252 low = 0;
253 high = _collate_info->chain_count - 1;
254 tab = chain_pri_table;
255
256 while (low <= high) {
257 next = (low + high) / 2;
258 p = tab + next;
259 compar = *key - *p->str;
260 if (compar == 0) {
261 l = wcsnlen(p->str, COLLATE_STR_LEN);
262 compar = wcsncmp(key, p->str, l);
263 if (compar == 0) {
264 *len = l;
265 return (p);
266 }
267 }
268 if (compar > 0)
269 low = next + 1;
270 else
271 high = next - 1;
272 }
273 return (NULL);
274 }
275
276 static collate_large_t *
277 largesearch(const wchar_t key)
278 {
279 int low = 0;
280 int high = _collate_info->large_count - 1;
281 int next, compar;
282 collate_large_t *p;
283 collate_large_t *tab = large_pri_table;
284
285 if (_collate_info->large_count == 0)
286 return (NULL);
287
288 while (low <= high) {
289 next = (low + high) / 2;
290 p = tab + next;
291 compar = key - p->val;
292 if (compar == 0)
293 return (p);
294 if (compar > 0)
295 low = next + 1;
296 else
297 high = next - 1;
298 }
299 return (NULL);
300 }
301
302 void
303 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state)
304 {
305 collate_chain_t *p2;
306 collate_large_t *match;
307 collate_info_t *info = _collate_info;
308 int p, l;
309 int *sptr;
310
311 /*
312 * If this is the "last" pass for the UNDEFINED, then
313 * we just return the priority itself.
314 */
315 if (which >= info->directive_count) {
316 *pri = *t;
317 *len = 1;
318 *state = NULL;
319 return;
320 }
321
322 /*
323 * If we have remaining substitution data from a previous
324 * call, consume it first.
325 */
326 if ((sptr = *state) != NULL) {
327 *pri = *sptr;
328 sptr++;
329 *state = *sptr ? sptr : NULL;
330 *len = 0;
331 return;
332 }
333
334 /* No active substitutions */
335 *len = 1;
336
337 /*
338 * Check for composites such as dipthongs that collate as a
339 * single element (aka chains or collating-elements).
340 */
341 if (((p2 = chainsearch(t, &l)) != NULL) &&
342 ((p = p2->pri[which]) >= 0)) {
343
344 *len = l;
345 *pri = p;
346
347 } else if (*t <= UCHAR_MAX) {
348
349 /*
350 * Character is a small (8-bit) character.
351 * We just look these up directly for speed.
352 */
353 *pri = char_pri_table[*t].pri[which];
354
355 } else if ((info->large_count > 0) &&
356 ((match = largesearch(*t)) != NULL)) {
357
358 /*
359 * Character was found in the extended table.
360 */
361 *pri = match->pri.pri[which];
362
363 } else {
364 /*
365 * Character lacks a specific definition.
366 */
367 if (info->directive[which] & DIRECTIVE_UNDEFINED) {
368 /* Mask off sign bit to prevent ordering confusion. */
369 *pri = (*t & COLLATE_MAX_PRIORITY);
370 } else {
371 *pri = info->undef_pri[which];
372 }
373 /* No substitutions for undefined characters! */
374 return;
375 }
376
377 /*
378 * Try substituting (expanding) the character. We are
379 * currently doing this *after* the chain compression. I
380 * think it should not matter, but this way might be slightly
381 * faster.
382 *
383 * We do this after the priority search, as this will help us
384 * to identify a single key value. In order for this to work,
385 * its important that the priority assigned to a given element
386 * to be substituted be unique for that level. The localedef
387 * code ensures this for us.
388 */
389 if ((sptr = substsearch(*pri, which)) != NULL) {
390 if ((*pri = *sptr) != 0) {
391 sptr++;
392 *state = *sptr ? sptr : NULL;
393 }
394 }
395
396 }
397
398 /*
399 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
400 * NOT NULL terminate. That is left to the caller.
401 */
402 size_t
403 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room)
404 {
405 int pri;
406 int len;
407 const wchar_t *t;
408 wchar_t *tr = NULL;
409 int direc;
410 int pass;
411 int32_t *state;
412 size_t want = 0;
413 size_t need = 0;
414
415 assert(src);
416
417 for (pass = 0; pass <= _collate_info->directive_count; pass++) {
418
419 state = NULL;
420
421 if (pass != 0) {
422 /* insert level separator from the previous pass */
423 if (room) {
424 *xf++ = 1;
425 room--;
426 }
427 want++;
428 }
429
430 /* special pass for undefined */
431 if (pass == _collate_info->directive_count) {
432 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
433 } else {
434 direc = _collate_info->directive[pass];
435 }
436
437 t = src;
438
439 if (direc & DIRECTIVE_BACKWARD) {
440 wchar_t *bp, *fp, c;
441 if (tr)
442 free(tr);
443 if ((tr = wcsdup(t)) == NULL) {
444 errno = ENOMEM;
445 goto fail;
446 }
447 bp = tr;
448 fp = tr + wcslen(tr) - 1;
449 while (bp < fp) {
450 c = *bp;
451 *bp++ = *fp;
452 *fp-- = c;
453 }
454 t = (const wchar_t *)tr;
455 }
456
457 if (direc & DIRECTIVE_POSITION) {
458 while (*t || state) {
459 _collate_lookup(t, &len, &pri, pass, &state);
460 t += len;
461 if (pri <= 0) {
462 if (pri < 0) {
463 errno = EINVAL;
464 goto fail;
465 }
466 pri = COLLATE_MAX_PRIORITY;
467 }
468 if (room) {
469 *xf++ = pri;
470 room--;
471 }
472 want++;
473 need = want;
474 }
475 } else {
476 while (*t || state) {
477 _collate_lookup(t, &len, &pri, pass, &state);
478 t += len;
479 if (pri <= 0) {
480 if (pri < 0) {
481 errno = EINVAL;
482 goto fail;
483 }
484 continue;
485 }
486 if (room) {
487 *xf++ = pri;
488 room--;
489 }
490 want++;
491 need = want;
492 }
493 }
494 }
495
496 end:
497 if (tr)
498 free(tr);
499 return (need);
500
501 fail:
502 if (tr)
503 free(tr);
504 return ((size_t)(-1));
505 }
506
507 /*
508 * In the non-POSIX case, we transform each character into a string of
509 * characters representing the character's priority. Since char is usually
510 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
511 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
512 * bits per byte.
513 *
514 * It turns out that we sometimes have real priorities that are
515 * 31-bits wide. (But: be careful using priorities where the high
516 * order bit is set -- i.e. the priority is negative. The sort order
517 * may be surprising!)
518 *
519 * TODO: This would be a good area to optimize somewhat. It turns out
520 * that real prioririties *except for the last UNDEFINED pass* are generally
521 * very small. We need the localedef code to precalculate the max
522 * priority for us, and ideally also give us a mask, and then we could
523 * severely limit what we expand to.
524 */
525 #define XFRM_BYTES 6
526 #define XFRM_OFFSET ('0') /* make all printable characters */
527 #define XFRM_SHIFT 6
528 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
529 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
530
531 static int
532 xfrm(unsigned char *p, int pri, int pass)
533 {
534 /* we use unsigned to ensure zero fill on right shift */
535 uint32_t val = (uint32_t)_collate_info->pri_count[pass];
536 int nc = 0;
537
538 while (val) {
539 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
540 pri >>= XFRM_SHIFT;
541 val >>= XFRM_SHIFT;
542 p++;
543 nc++;
544 }
545 return (nc);
546 }
547
548 size_t
549 _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
550 {
551 int pri;
552 int len;
553 const wchar_t *t;
554 wchar_t *tr = NULL;
555 int direc;
556 int pass;
557 int32_t *state;
558 size_t want = 0;
559 size_t need = 0;
560 int b;
561 uint8_t buf[XFRM_BYTES];
562
563 assert(src);
564
565 for (pass = 0; pass <= _collate_info->directive_count; pass++) {
566
567 state = NULL;
568
569 if (pass != 0) {
570 /* insert level separator from the previous pass */
571 if (room) {
572 *xf++ = XFRM_SEP;
573 room--;
574 }
575 want++;
576 }
577
578 /* special pass for undefined */
579 if (pass == _collate_info->directive_count) {
580 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
581 } else {
582 direc = _collate_info->directive[pass];
583 }
584
585 t = src;
586
587 if (direc & DIRECTIVE_BACKWARD) {
588 wchar_t *bp, *fp, c;
589 if (tr)
590 free(tr);
591 if ((tr = wcsdup(t)) == NULL) {
592 errno = ENOMEM;
593 goto fail;
594 }
595 bp = tr;
596 fp = tr + wcslen(tr) - 1;
597 while (bp < fp) {
598 c = *bp;
599 *bp++ = *fp;
600 *fp-- = c;
601 }
602 t = (const wchar_t *)tr;
603 }
604
605 if (direc & DIRECTIVE_POSITION) {
606 while (*t || state) {
607
608 _collate_lookup(t, &len, &pri, pass, &state);
609 t += len;
610 if (pri <= 0) {
611 if (pri < 0) {
612 errno = EINVAL;
613 goto fail;
614 }
615 pri = COLLATE_MAX_PRIORITY;
616 }
617
618 b = xfrm(buf, pri, pass);
619 want += b;
620 if (room) {
621 while (b) {
622 b--;
623 if (room) {
624 *xf++ = buf[b];
625 room--;
626 }
627 }
628 }
629 need = want;
630 }
631 } else {
632 while (*t || state) {
633 _collate_lookup(t, &len, &pri, pass, &state);
634 t += len;
635 if (pri <= 0) {
636 if (pri < 0) {
637 errno = EINVAL;
638 goto fail;
639 }
640 continue;
641 }
642
643 b = xfrm(buf, pri, pass);
644 want += b;
645 if (room) {
646
647 while (b) {
648 b--;
649 if (room) {
650 *xf++ = buf[b];
651 room--;
652 }
653 }
654 }
655 need = want;
656 }
657 }
658 }
659
660 end:
661 if (tr)
662 free(tr);
663 return (need);
664
665 fail:
666 if (tr)
667 free(tr);
668 return ((size_t)(-1));
669 }