1 /*
2 * Copright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4 * at Electronni Visti IA, Kiev, Ukraine.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 */
28
29 #include "lint.h"
30 #include "file64.h"
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <stddef.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include <errno.h>
37 #include <unistd.h>
38 #include <ctype.h>
39 #include <unistd.h>
40 #include <fcntl.h>
41 #include <assert.h>
42 #include <sys/stat.h>
43 #include <sys/mman.h>
44
45 #include "collate.h"
46 #include "setlocale.h"
47 #include "ldpart.h"
48
49 /*
50 * See the comments in usr/src/cmd/localedef/collate.c for further
51 * information. It would also be very helpful to have a copy of the
52 * POSIX standard for collation (in the locale format manual page)
53 * handy (www.opengroup.org).
54 */
55
56 static collate_subst_t *subst_table[COLL_WEIGHTS_MAX];
57 static collate_char_t *char_pri_table;
58 static collate_large_t *large_pri_table;
59 static collate_chain_t *chain_pri_table;
60 static char *cache = NULL;
61 static size_t cachesz;
62 static char collate_encoding[ENCODING_LEN + 1];
63
64 /* Exposed externally to other parts of libc. */
65 collate_info_t *_collate_info;
66 int _collate_load_error = 1;
67
68
69 int
70 _collate_load_tables(const char *encoding)
71 {
72 int i, chains, z;
73 char buf[PATH_MAX];
74 char *TMP;
75 char *map;
76 collate_info_t *info;
77 struct stat sbuf;
78 int fd;
79
80 /* 'encoding' must be already checked. */
81 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
82 _collate_load_error = 1;
83 return (_LDP_CACHE);
84 }
85
86 /*
87 * If the locale name is the same as our cache, use the cache.
88 */
89 if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) {
90 _collate_load_error = 0;
91 return (_LDP_CACHE);
92 }
93
94 /*
95 * Slurp the locale file into the cache.
96 */
97
98 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
99 _PathLocale, encoding);
100
101 if ((fd = open(buf, O_RDONLY)) < 0)
102 return (_LDP_ERROR);
103 if (fstat(fd, &sbuf) < 0) {
104 (void) close(fd);
105 return (_LDP_ERROR);
106 }
107 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
108 (void) close(fd);
109 errno = EINVAL;
110 return (_LDP_ERROR);
111 }
112 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
113 (void) close(fd);
114 if ((TMP = map) == NULL) {
115 return (_LDP_ERROR);
116 }
117
118 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
119 (void) munmap(map, sbuf.st_size);
120 errno = EINVAL;
121 return (_LDP_ERROR);
122 }
123 TMP += COLLATE_STR_LEN;
124
125 info = (void *)TMP;
126 TMP += sizeof (*info);
127
128 if ((info->directive_count < 1) ||
129 (info->directive_count >= COLL_WEIGHTS_MAX) ||
130 ((chains = info->chain_count) < 0)) {
131 (void) munmap(map, sbuf.st_size);
132 errno = EINVAL;
133 return (_LDP_ERROR);
134 }
135
136 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
137 (sizeof (collate_chain_t) * chains) +
138 (sizeof (collate_large_t) * info->large_count);
139 for (z = 0; z < (info->directive_count); z++) {
140 i += sizeof (collate_subst_t) * info->subst_count[z];
141 }
142 if (i != (sbuf.st_size - (TMP - map))) {
143 (void) munmap(map, sbuf.st_size);
144 errno = EINVAL;
145 return (_LDP_ERROR);
146 }
147
148 char_pri_table = (void *)TMP;
149 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
150
151 for (z = 0; z < info->directive_count; z++) {
152 if (info->subst_count[z] > 0) {
153 subst_table[z] = (void *)TMP;
154 TMP += info->subst_count[z] * sizeof (collate_subst_t);
155 } else {
156 subst_table[z] = NULL;
157 }
158 }
159
160 if (chains > 0) {
161 chain_pri_table = (void *)TMP;
162 TMP += chains * sizeof (collate_chain_t);
163 } else
164 chain_pri_table = NULL;
165 if (info->large_count > 0)
166 large_pri_table = (void *)TMP;
167 else
168 large_pri_table = NULL;
169
170 (void) strlcpy(collate_encoding, encoding, ENCODING_LEN);
171 _collate_info = info;
172
173 if (cache)
174 (void) munmap(cache, cachesz);
175
176 cache = map;
177 cachesz = sbuf.st_size;
178 _collate_load_error = 0;
179
180 return (_LDP_LOADED);
181 }
182
183 static int32_t *
184 substsearch(const wchar_t key, int pass)
185 {
186 collate_subst_t *p;
187 int n = _collate_info->subst_count[pass];
188
189 if (n == 0)
190 return (NULL);
191
192 if (pass >= _collate_info->directive_count)
193 return (NULL);
194
195 if (!(key & COLLATE_SUBST_PRIORITY))
196 return (NULL);
197
198 p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
199 assert(p->key == key);
200 return (p->pri);
201 }
202
203 /*
204 * Note: for performance reasons, we have expanded bsearch here. This avoids
205 * function call overhead with each comparison.
206 */
207
208 static collate_chain_t *
209 chainsearch(const wchar_t *key, int *len)
210 {
211 int low;
212 int high;
213 int next, compar, l;
214 collate_chain_t *p;
215 collate_chain_t *tab;
216
217 if (_collate_info->chain_count == 0)
218 return (NULL);
219
220 low = 0;
221 high = _collate_info->chain_count - 1;
222 tab = chain_pri_table;
223
224 while (low <= high) {
225 next = (low + high) / 2;
226 p = tab + next;
227 compar = *key - *p->str;
228 if (compar == 0) {
229 l = wcsnlen(p->str, COLLATE_STR_LEN);
230 compar = wcsncmp(key, p->str, l);
231 if (compar == 0) {
232 *len = l;
233 return (p);
234 }
235 }
236 if (compar > 0)
237 low = next + 1;
238 else
239 high = next - 1;
240 }
241 return (NULL);
242 }
243
244 static collate_large_t *
245 largesearch(const wchar_t key)
246 {
247 int low = 0;
248 int high = _collate_info->large_count - 1;
249 int next, compar;
250 collate_large_t *p;
251 collate_large_t *tab = large_pri_table;
252
253 if (_collate_info->large_count == 0)
254 return (NULL);
255
256 while (low <= high) {
257 next = (low + high) / 2;
258 p = tab + next;
259 compar = key - p->val;
260 if (compar == 0)
261 return (p);
262 if (compar > 0)
263 low = next + 1;
264 else
265 high = next - 1;
266 }
267 return (NULL);
268 }
269
270 void
271 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state)
272 {
273 collate_chain_t *p2;
274 collate_large_t *match;
275 collate_info_t *info = _collate_info;
276 int p, l;
277 int *sptr;
278
279 /*
280 * If this is the "last" pass for the UNDEFINED, then
281 * we just return the priority itself.
282 */
283 if (which >= info->directive_count) {
284 *pri = *t;
285 *len = 1;
286 *state = NULL;
287 return;
288 }
289
290 /*
291 * If we have remaining substitution data from a previous
292 * call, consume it first.
293 */
294 if ((sptr = *state) != NULL) {
295 *pri = *sptr;
296 sptr++;
297 *state = *sptr ? sptr : NULL;
298 *len = 0;
299 return;
300 }
301
302 /* No active substitutions */
303 *len = 1;
304
305 /*
306 * Check for composites such as dipthongs that collate as a
307 * single element (aka chains or collating-elements).
308 */
309 if (((p2 = chainsearch(t, &l)) != NULL) &&
310 ((p = p2->pri[which]) >= 0)) {
311
312 *len = l;
313 *pri = p;
314
315 } else if (*t <= UCHAR_MAX) {
316
317 /*
318 * Character is a small (8-bit) character.
319 * We just look these up directly for speed.
320 */
321 *pri = char_pri_table[*t].pri[which];
322
323 } else if ((info->large_count > 0) &&
324 ((match = largesearch(*t)) != NULL)) {
325
326 /*
327 * Character was found in the extended table.
328 */
329 *pri = match->pri.pri[which];
330
331 } else {
332 /*
333 * Character lacks a specific definition.
334 */
335 if (info->directive[which] & DIRECTIVE_UNDEFINED) {
336 /* Mask off sign bit to prevent ordering confusion. */
337 *pri = (*t & COLLATE_MAX_PRIORITY);
338 } else {
339 *pri = info->undef_pri[which];
340 }
341 /* No substitutions for undefined characters! */
342 return;
343 }
344
345 /*
346 * Try substituting (expanding) the character. We are
347 * currently doing this *after* the chain compression. I
348 * think it should not matter, but this way might be slightly
349 * faster.
350 *
351 * We do this after the priority search, as this will help us
352 * to identify a single key value. In order for this to work,
353 * its important that the priority assigned to a given element
354 * to be substituted be unique for that level. The localedef
355 * code ensures this for us.
356 */
357 if ((sptr = substsearch(*pri, which)) != NULL) {
358 if ((*pri = *sptr) != 0) {
359 sptr++;
360 *state = *sptr ? sptr : NULL;
361 }
362 }
363
364 }
365
366 /*
367 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
368 * NOT NULL terminate. That is left to the caller.
369 */
370 size_t
371 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room)
372 {
373 int pri;
374 int len;
375 const wchar_t *t;
376 wchar_t *tr = NULL;
377 int direc;
378 int pass;
379 int32_t *state;
380 size_t want = 0;
381 size_t need = 0;
382
383 assert(src);
384
385 for (pass = 0; pass <= _collate_info->directive_count; pass++) {
386
387 state = NULL;
388
389 if (pass != 0) {
390 /* insert level separator from the previous pass */
391 if (room) {
392 *xf++ = 1;
393 room--;
394 }
395 want++;
396 }
397
398 /* special pass for undefined */
399 if (pass == _collate_info->directive_count) {
400 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
401 } else {
402 direc = _collate_info->directive[pass];
403 }
404
405 t = src;
406
407 if (direc & DIRECTIVE_BACKWARD) {
408 wchar_t *bp, *fp, c;
409 if (tr)
410 free(tr);
411 if ((tr = wcsdup(t)) == NULL) {
412 errno = ENOMEM;
413 goto fail;
414 }
415 bp = tr;
416 fp = tr + wcslen(tr) - 1;
417 while (bp < fp) {
418 c = *bp;
419 *bp++ = *fp;
420 *fp-- = c;
421 }
422 t = (const wchar_t *)tr;
423 }
424
425 if (direc & DIRECTIVE_POSITION) {
426 while (*t || state) {
427 _collate_lookup(t, &len, &pri, pass, &state);
428 t += len;
429 if (pri <= 0) {
430 if (pri < 0) {
431 errno = EINVAL;
432 goto fail;
433 }
434 pri = COLLATE_MAX_PRIORITY;
435 }
436 if (room) {
437 *xf++ = pri;
438 room--;
439 }
440 want++;
441 need = want;
442 }
443 } else {
444 while (*t || state) {
445 _collate_lookup(t, &len, &pri, pass, &state);
446 t += len;
447 if (pri <= 0) {
448 if (pri < 0) {
449 errno = EINVAL;
450 goto fail;
451 }
452 continue;
453 }
454 if (room) {
455 *xf++ = pri;
456 room--;
457 }
458 want++;
459 need = want;
460 }
461 }
462 }
463
464 end:
465 if (tr)
480 * bits per byte.
481 *
482 * It turns out that we sometimes have real priorities that are
483 * 31-bits wide. (But: be careful using priorities where the high
484 * order bit is set -- i.e. the priority is negative. The sort order
485 * may be surprising!)
486 *
487 * TODO: This would be a good area to optimize somewhat. It turns out
488 * that real prioririties *except for the last UNDEFINED pass* are generally
489 * very small. We need the localedef code to precalculate the max
490 * priority for us, and ideally also give us a mask, and then we could
491 * severely limit what we expand to.
492 */
493 #define XFRM_BYTES 6
494 #define XFRM_OFFSET ('0') /* make all printable characters */
495 #define XFRM_SHIFT 6
496 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
497 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
498
499 static int
500 xfrm(unsigned char *p, int pri, int pass)
501 {
502 /* we use unsigned to ensure zero fill on right shift */
503 uint32_t val = (uint32_t)_collate_info->pri_count[pass];
504 int nc = 0;
505
506 while (val) {
507 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
508 pri >>= XFRM_SHIFT;
509 val >>= XFRM_SHIFT;
510 p++;
511 nc++;
512 }
513 return (nc);
514 }
515
516 size_t
517 _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
518 {
519 int pri;
520 int len;
521 const wchar_t *t;
522 wchar_t *tr = NULL;
523 int direc;
524 int pass;
525 int32_t *state;
526 size_t want = 0;
527 size_t need = 0;
528 int b;
529 uint8_t buf[XFRM_BYTES];
530
531 assert(src);
532
533 for (pass = 0; pass <= _collate_info->directive_count; pass++) {
534
535 state = NULL;
536
537 if (pass != 0) {
538 /* insert level separator from the previous pass */
539 if (room) {
540 *xf++ = XFRM_SEP;
541 room--;
542 }
543 want++;
544 }
545
546 /* special pass for undefined */
547 if (pass == _collate_info->directive_count) {
548 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
549 } else {
550 direc = _collate_info->directive[pass];
551 }
552
553 t = src;
554
555 if (direc & DIRECTIVE_BACKWARD) {
556 wchar_t *bp, *fp, c;
557 if (tr)
558 free(tr);
559 if ((tr = wcsdup(t)) == NULL) {
560 errno = ENOMEM;
561 goto fail;
562 }
563 bp = tr;
564 fp = tr + wcslen(tr) - 1;
565 while (bp < fp) {
566 c = *bp;
567 *bp++ = *fp;
568 *fp-- = c;
569 }
570 t = (const wchar_t *)tr;
571 }
572
573 if (direc & DIRECTIVE_POSITION) {
574 while (*t || state) {
575
576 _collate_lookup(t, &len, &pri, pass, &state);
577 t += len;
578 if (pri <= 0) {
579 if (pri < 0) {
580 errno = EINVAL;
581 goto fail;
582 }
583 pri = COLLATE_MAX_PRIORITY;
584 }
585
586 b = xfrm(buf, pri, pass);
587 want += b;
588 if (room) {
589 while (b) {
590 b--;
591 if (room) {
592 *xf++ = buf[b];
593 room--;
594 }
595 }
596 }
597 need = want;
598 }
599 } else {
600 while (*t || state) {
601 _collate_lookup(t, &len, &pri, pass, &state);
602 t += len;
603 if (pri <= 0) {
604 if (pri < 0) {
605 errno = EINVAL;
606 goto fail;
607 }
608 continue;
609 }
610
611 b = xfrm(buf, pri, pass);
612 want += b;
613 if (room) {
614
615 while (b) {
616 b--;
617 if (room) {
618 *xf++ = buf[b];
619 room--;
620 }
621 }
622 }
623 need = want;
624 }
625 }
626 }
627
628 end:
629 if (tr)
630 free(tr);
631 return (need);
|
1 /*
2 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5 * at Electronni Visti IA, Kiev, Ukraine.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 */
29
30 #include "lint.h"
31 #include "file64.h"
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <wchar.h>
37 #include <errno.h>
38 #include <unistd.h>
39 #include <ctype.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <assert.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45
46 #include "collate.h"
47 #include "setlocale.h"
48
49 /*
50 * See the comments in usr/src/cmd/localedef/collate.c for further
51 * information. It would also be very helpful to have a copy of the
52 * POSIX standard for collation (in the locale format manual page)
53 * handy (www.opengroup.org).
54 */
55
56 /*
57 * POSIX uses empty tables and falls down to strcmp.
58 */
59 struct lc_collate lc_collate_posix = {
60 .lc_is_posix = 1,
61 };
62
63 struct locdata __posix_collate_locdata = {
64 .l_lname = "C",
65 .l_refcnt = (uint32_t)-1,
66 .l_data = { &lc_collate_posix }
67 };
68
69
70 struct locdata *
71 __lc_collate_load(const char *locname)
72 {
73 int i, chains, z;
74 char buf[PATH_MAX];
75 char *TMP;
76 char *map;
77 collate_info_t *info;
78 struct stat sbuf;
79 int fd;
80 struct locdata *ldata;
81 struct lc_collate *lcc;
82
83 /*
84 * Slurp the locale file into the cache.
85 */
86
87 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
88 _PathLocale, locname);
89
90 if ((fd = open(buf, O_RDONLY)) < 0) {
91 errno = EINVAL;
92 return (NULL);
93 }
94 if (fstat(fd, &sbuf) < 0) {
95 (void) close(fd);
96 errno = EINVAL;
97 return (NULL);
98 }
99 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
100 (void) close(fd);
101 errno = EINVAL;
102 return (NULL);
103 }
104 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
105 (void) close(fd);
106 if ((TMP = map) == NULL) {
107 errno = EINVAL;
108 return (NULL);
109 }
110
111 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
112 (void) munmap(map, sbuf.st_size);
113 errno = EINVAL;
114 return (NULL);
115 }
116 TMP += COLLATE_STR_LEN;
117
118 info = (void *)TMP;
119 TMP += sizeof (*info);
120
121 if ((info->directive_count < 1) ||
122 (info->directive_count >= COLL_WEIGHTS_MAX) ||
123 ((chains = info->chain_count) < 0)) {
124 (void) munmap(map, sbuf.st_size);
125 errno = EINVAL;
126 return (NULL);
127 }
128
129 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
130 (sizeof (collate_chain_t) * chains) +
131 (sizeof (collate_large_t) * info->large_count);
132 for (z = 0; z < info->directive_count; z++) {
133 i += sizeof (collate_subst_t) * info->subst_count[z];
134 }
135 if (i != (sbuf.st_size - (TMP - map))) {
136 (void) munmap(map, sbuf.st_size);
137 errno = EINVAL;
138 return (NULL);
139 }
140
141
142 if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) {
143 (void) munmap(map, sbuf.st_size);
144 return (NULL);
145 }
146 lcc = ldata->l_data[0];
147 ldata->l_map = map;
148 ldata->l_map_len = sbuf.st_size;
149
150 lcc->lc_info = info;
151 lcc->lc_directive_count = info->directive_count;
152 lcc->lc_large_count = info->large_count;
153
154 for (z = 0; z < COLL_WEIGHTS_MAX; z++) {
155 lcc->lc_directive[z] = info->directive[z];
156 lcc->lc_subst_count[z] = info->subst_count[z];
157 lcc->lc_pri_count[z] = info->pri_count[z];
158 lcc->lc_undef_pri[z] = info->undef_pri[z];
159 }
160
161 lcc->lc_char_table = (void *)TMP;
162 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
163
164 for (z = 0; z < lcc->lc_directive_count; z++) {
165 int count;
166 if ((count = lcc->lc_subst_count[z]) > 0) {
167 lcc->lc_subst_table[z] = (void *)TMP;
168 TMP += count * sizeof (collate_subst_t);
169 } else {
170 lcc->lc_subst_table[z] = NULL;
171 }
172 }
173
174 if (chains > 0) {
175 lcc->lc_chain_table = (void *)TMP;
176 TMP += chains * sizeof (collate_chain_t);
177 } else
178 lcc->lc_chain_table = NULL;
179 lcc->lc_chain_count = chains;
180 if (lcc->lc_large_count > 0)
181 lcc->lc_large_table = (void *)TMP;
182 else
183 lcc->lc_large_table = NULL;
184
185 return (ldata);
186 }
187
188 static const int32_t *
189 substsearch(const struct lc_collate *lcc, const wchar_t key, int pass)
190 {
191 const collate_subst_t *p;
192 int n = lcc->lc_subst_count[pass];
193
194 if (n == 0)
195 return (NULL);
196
197 if (pass >= lcc->lc_directive_count)
198 return (NULL);
199
200 if (!(key & COLLATE_SUBST_PRIORITY))
201 return (NULL);
202
203 p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
204 assert(p->key == key);
205 return (p->pri);
206 }
207
208 /*
209 * Note: for performance reasons, we have expanded bsearch here. This avoids
210 * function call overhead with each comparison.
211 */
212
213 static collate_chain_t *
214 chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len)
215 {
216 int low;
217 int high;
218 int next, compar, l;
219 collate_chain_t *p;
220 collate_chain_t *tab;
221
222 if (lcc->lc_info->chain_count == 0)
223 return (NULL);
224
225 low = 0;
226 high = lcc->lc_info->chain_count - 1;
227 tab = lcc->lc_chain_table;
228
229 while (low <= high) {
230 next = (low + high) / 2;
231 p = tab + next;
232 compar = *key - *p->str;
233 if (compar == 0) {
234 l = wcsnlen(p->str, COLLATE_STR_LEN);
235 compar = wcsncmp(key, p->str, l);
236 if (compar == 0) {
237 *len = l;
238 return (p);
239 }
240 }
241 if (compar > 0)
242 low = next + 1;
243 else
244 high = next - 1;
245 }
246 return (NULL);
247 }
248
249 static collate_large_t *
250 largesearch(const struct lc_collate *lcc, const wchar_t key)
251 {
252 int low = 0;
253 int high = lcc->lc_info->large_count - 1;
254 int next, compar;
255 collate_large_t *p;
256 collate_large_t *tab = lcc->lc_large_table;
257
258 if (lcc->lc_info->large_count == 0)
259 return (NULL);
260
261 while (low <= high) {
262 next = (low + high) / 2;
263 p = tab + next;
264 compar = key - p->val;
265 if (compar == 0)
266 return (p);
267 if (compar > 0)
268 low = next + 1;
269 else
270 high = next - 1;
271 }
272 return (NULL);
273 }
274
275 void
276 _collate_lookup(const struct lc_collate *lcc, const wchar_t *t,
277 int *len, int *pri, int which, const int **state)
278 {
279 collate_chain_t *p2;
280 collate_large_t *match;
281 int p, l;
282 const int *sptr;
283
284 /*
285 * If this is the "last" pass for the UNDEFINED, then
286 * we just return the priority itself.
287 */
288 if (which >= lcc->lc_directive_count) {
289 *pri = *t;
290 *len = 1;
291 *state = NULL;
292 return;
293 }
294
295 /*
296 * If we have remaining substitution data from a previous
297 * call, consume it first.
298 */
299 if ((sptr = *state) != NULL) {
300 *pri = *sptr;
301 sptr++;
302 *state = *sptr ? sptr : NULL;
303 *len = 0;
304 return;
305 }
306
307 /* No active substitutions */
308 *len = 1;
309
310 /*
311 * Check for composites such as dipthongs that collate as a
312 * single element (aka chains or collating-elements).
313 */
314 if (((p2 = chainsearch(lcc, t, &l)) != NULL) &&
315 ((p = p2->pri[which]) >= 0)) {
316
317 *len = l;
318 *pri = p;
319
320 } else if (*t <= UCHAR_MAX) {
321
322 /*
323 * Character is a small (8-bit) character.
324 * We just look these up directly for speed.
325 */
326 *pri = lcc->lc_char_table[*t].pri[which];
327
328 } else if ((lcc->lc_info->large_count > 0) &&
329 ((match = largesearch(lcc, *t)) != NULL)) {
330
331 /*
332 * Character was found in the extended table.
333 */
334 *pri = match->pri.pri[which];
335
336 } else {
337 /*
338 * Character lacks a specific definition.
339 */
340 if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) {
341 /* Mask off sign bit to prevent ordering confusion. */
342 *pri = (*t & COLLATE_MAX_PRIORITY);
343 } else {
344 *pri = lcc->lc_undef_pri[which];
345 }
346 /* No substitutions for undefined characters! */
347 return;
348 }
349
350 /*
351 * Try substituting (expanding) the character. We are
352 * currently doing this *after* the chain compression. I
353 * think it should not matter, but this way might be slightly
354 * faster.
355 *
356 * We do this after the priority search, as this will help us
357 * to identify a single key value. In order for this to work,
358 * its important that the priority assigned to a given element
359 * to be substituted be unique for that level. The localedef
360 * code ensures this for us.
361 */
362 if ((sptr = substsearch(lcc, *pri, which)) != NULL) {
363 if ((*pri = *sptr) != 0) {
364 sptr++;
365 *state = *sptr ? sptr : NULL;
366 }
367 }
368
369 }
370
371 /*
372 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
373 * NOT NULL terminate. That is left to the caller.
374 */
375 size_t
376 _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf,
377 size_t room)
378 {
379 int pri;
380 int len;
381 const wchar_t *t;
382 wchar_t *tr = NULL;
383 int direc;
384 int pass;
385 const int32_t *state;
386 size_t want = 0;
387 size_t need = 0;
388 int ndir = lcc->lc_directive_count;
389
390 assert(src);
391
392 for (pass = 0; pass <= ndir; pass++) {
393
394 state = NULL;
395
396 if (pass != 0) {
397 /* insert level separator from the previous pass */
398 if (room) {
399 *xf++ = 1;
400 room--;
401 }
402 want++;
403 }
404
405 /* special pass for undefined */
406 if (pass == ndir) {
407 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
408 } else {
409 direc = lcc->lc_directive[pass];
410 }
411
412 t = src;
413
414 if (direc & DIRECTIVE_BACKWARD) {
415 wchar_t *bp, *fp, c;
416 if (tr)
417 free(tr);
418 if ((tr = wcsdup(t)) == NULL) {
419 errno = ENOMEM;
420 goto fail;
421 }
422 bp = tr;
423 fp = tr + wcslen(tr) - 1;
424 while (bp < fp) {
425 c = *bp;
426 *bp++ = *fp;
427 *fp-- = c;
428 }
429 t = (const wchar_t *)tr;
430 }
431
432 if (direc & DIRECTIVE_POSITION) {
433 while (*t || state) {
434 _collate_lookup(lcc, t, &len, &pri, pass,
435 &state);
436 t += len;
437 if (pri <= 0) {
438 if (pri < 0) {
439 errno = EINVAL;
440 goto fail;
441 }
442 pri = COLLATE_MAX_PRIORITY;
443 }
444 if (room) {
445 *xf++ = pri;
446 room--;
447 }
448 want++;
449 need = want;
450 }
451 } else {
452 while (*t || state) {
453 _collate_lookup(lcc, t, &len, &pri, pass,
454 &state);
455 t += len;
456 if (pri <= 0) {
457 if (pri < 0) {
458 errno = EINVAL;
459 goto fail;
460 }
461 continue;
462 }
463 if (room) {
464 *xf++ = pri;
465 room--;
466 }
467 want++;
468 need = want;
469 }
470 }
471 }
472
473 end:
474 if (tr)
489 * bits per byte.
490 *
491 * It turns out that we sometimes have real priorities that are
492 * 31-bits wide. (But: be careful using priorities where the high
493 * order bit is set -- i.e. the priority is negative. The sort order
494 * may be surprising!)
495 *
496 * TODO: This would be a good area to optimize somewhat. It turns out
497 * that real prioririties *except for the last UNDEFINED pass* are generally
498 * very small. We need the localedef code to precalculate the max
499 * priority for us, and ideally also give us a mask, and then we could
500 * severely limit what we expand to.
501 */
502 #define XFRM_BYTES 6
503 #define XFRM_OFFSET ('0') /* make all printable characters */
504 #define XFRM_SHIFT 6
505 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
506 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
507
508 static int
509 xfrm(locale_t loc, unsigned char *p, int pri, int pass)
510 {
511 /* we use unsigned to ensure zero fill on right shift */
512 uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass];
513 int nc = 0;
514
515 while (val) {
516 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
517 pri >>= XFRM_SHIFT;
518 val >>= XFRM_SHIFT;
519 p++;
520 nc++;
521 }
522 return (nc);
523 }
524
525 size_t
526 _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc)
527 {
528 int pri;
529 int len;
530 const wchar_t *t;
531 wchar_t *tr = NULL;
532 int direc;
533 int pass;
534 const int32_t *state;
535 size_t want = 0;
536 size_t need = 0;
537 int b;
538 uint8_t buf[XFRM_BYTES];
539 const struct lc_collate *lcc = loc->collate;
540 int ndir = lcc->lc_directive_count;
541
542 assert(src);
543
544 for (pass = 0; pass <= ndir; pass++) {
545
546 state = NULL;
547
548 if (pass != 0) {
549 /* insert level separator from the previous pass */
550 if (room) {
551 *xf++ = XFRM_SEP;
552 room--;
553 }
554 want++;
555 }
556
557 /* special pass for undefined */
558 if (pass == ndir) {
559 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
560 } else {
561 direc = lcc->lc_directive[pass];
562 }
563
564 t = src;
565
566 if (direc & DIRECTIVE_BACKWARD) {
567 wchar_t *bp, *fp, c;
568 if (tr)
569 free(tr);
570 if ((tr = wcsdup(t)) == NULL) {
571 errno = ENOMEM;
572 goto fail;
573 }
574 bp = tr;
575 fp = tr + wcslen(tr) - 1;
576 while (bp < fp) {
577 c = *bp;
578 *bp++ = *fp;
579 *fp-- = c;
580 }
581 t = (const wchar_t *)tr;
582 }
583
584 if (direc & DIRECTIVE_POSITION) {
585 while (*t || state) {
586
587 _collate_lookup(lcc, t, &len, &pri, pass,
588 &state);
589 t += len;
590 if (pri <= 0) {
591 if (pri < 0) {
592 errno = EINVAL;
593 goto fail;
594 }
595 pri = COLLATE_MAX_PRIORITY;
596 }
597
598 b = xfrm(loc, buf, pri, pass);
599 want += b;
600 if (room) {
601 while (b) {
602 b--;
603 if (room) {
604 *xf++ = buf[b];
605 room--;
606 }
607 }
608 }
609 need = want;
610 }
611 } else {
612 while (*t || state) {
613 _collate_lookup(lcc, t, &len, &pri, pass,
614 &state);
615 t += len;
616 if (pri <= 0) {
617 if (pri < 0) {
618 errno = EINVAL;
619 goto fail;
620 }
621 continue;
622 }
623
624 b = xfrm(loc, buf, pri, pass);
625 want += b;
626 if (room) {
627
628 while (b) {
629 b--;
630 if (room) {
631 *xf++ = buf[b];
632 room--;
633 }
634 }
635 }
636 need = want;
637 }
638 }
639 }
640
641 end:
642 if (tr)
643 free(tr);
644 return (need);
|