Print this page
2964 need POSIX 2008 locale object support
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libc/port/locale/collate.c
+++ new/usr/src/lib/libc/port/locale/collate.c
1 1 /*
2 2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4 4 * at Electronni Visti IA, Kiev, Ukraine.
5 5 * All rights reserved.
6 6 *
7 7 * Redistribution and use in source and binary forms, with or without
8 8 * modification, are permitted provided that the following conditions
9 9 * are met:
10 10 * 1. Redistributions of source code must retain the above copyright
11 11 * notice, this list of conditions and the following disclaimer.
12 12 * 2. Redistributions in binary form must reproduce the above copyright
13 13 * notice, this list of conditions and the following disclaimer in the
14 14 * documentation and/or other materials provided with the distribution.
15 15 *
16 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
20 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 26 * SUCH DAMAGE.
27 27 */
28 28
29 29 #include "lint.h"
30 30 #include "file64.h"
31 31 #include <stdio.h>
32 32 #include <stdlib.h>
33 33 #include <stddef.h>
34 34 #include <string.h>
35 35 #include <wchar.h>
36 36 #include <errno.h>
37 37 #include <unistd.h>
38 38 #include <ctype.h>
39 39 #include <unistd.h>
40 40 #include <fcntl.h>
41 41 #include <assert.h>
42 42 #include <sys/stat.h>
43 43 #include <sys/mman.h>
44 44
45 45 #include "collate.h"
46 46 #include "setlocale.h"
47 47 #include "ldpart.h"
48 48
49 49 /*
50 50 * See the comments in usr/src/cmd/localedef/collate.c for further
51 51 * information. It would also be very helpful to have a copy of the
52 52 * POSIX standard for collation (in the locale format manual page)
53 53 * handy (www.opengroup.org).
54 54 */
55 55
56 56 static collate_subst_t *subst_table[COLL_WEIGHTS_MAX];
57 57 static collate_char_t *char_pri_table;
↓ open down ↓ |
57 lines elided |
↑ open up ↑ |
58 58 static collate_large_t *large_pri_table;
59 59 static collate_chain_t *chain_pri_table;
60 60 static char *cache = NULL;
61 61 static size_t cachesz;
62 62 static char collate_encoding[ENCODING_LEN + 1];
63 63
64 64 /* Exposed externally to other parts of libc. */
65 65 collate_info_t *_collate_info;
66 66 int _collate_load_error = 1;
67 67
68 +struct xlocale_collate __xlocale_global_collate = {
69 + {{0}, "C"}, 1, 0
70 +};
71 +
72 +struct xlocale_collate __xlocale_C_collate = {
73 + {{0}, "C"}, 1, 0
74 +};
75 +
76 +static void
77 +destruct_collate(void *t)
78 +{
79 + struct xlocale_collate *table = t;
80 +
81 + /* XXX */;
82 +}
83 +
84 +void *
85 +__collate_load(const char *encoding, locale_t unused)
86 +{
87 + struct xlocale_collate *table;
88 +
89 + if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
90 + return &__xlocale_C_collate;
91 + }
92 +
93 + table = calloc(sizeof(struct xlocale_collate), 1);
94 + if (table == NULL) {
95 + /* XXX */
96 + }
97 +
98 + return (table);
99 +}
68 100
69 101 int
70 102 _collate_load_tables(const char *encoding)
71 103 {
72 104 int i, chains, z;
73 105 char buf[PATH_MAX];
74 106 char *TMP;
75 107 char *map;
76 108 collate_info_t *info;
77 109 struct stat sbuf;
78 110 int fd;
79 111
80 112 /* 'encoding' must be already checked. */
81 113 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
82 114 _collate_load_error = 1;
83 115 return (_LDP_CACHE);
84 116 }
85 117
86 118 /*
87 119 * If the locale name is the same as our cache, use the cache.
88 120 */
89 121 if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) {
90 122 _collate_load_error = 0;
91 123 return (_LDP_CACHE);
92 124 }
93 125
94 126 /*
95 127 * Slurp the locale file into the cache.
96 128 */
97 129
98 130 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
99 131 _PathLocale, encoding);
100 132
101 133 if ((fd = open(buf, O_RDONLY)) < 0)
102 134 return (_LDP_ERROR);
103 135 if (fstat(fd, &sbuf) < 0) {
104 136 (void) close(fd);
105 137 return (_LDP_ERROR);
106 138 }
107 139 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
108 140 (void) close(fd);
109 141 errno = EINVAL;
110 142 return (_LDP_ERROR);
111 143 }
112 144 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
113 145 (void) close(fd);
114 146 if ((TMP = map) == NULL) {
115 147 return (_LDP_ERROR);
116 148 }
117 149
118 150 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
119 151 (void) munmap(map, sbuf.st_size);
120 152 errno = EINVAL;
121 153 return (_LDP_ERROR);
122 154 }
123 155 TMP += COLLATE_STR_LEN;
124 156
125 157 info = (void *)TMP;
126 158 TMP += sizeof (*info);
127 159
128 160 if ((info->directive_count < 1) ||
129 161 (info->directive_count >= COLL_WEIGHTS_MAX) ||
130 162 ((chains = info->chain_count) < 0)) {
131 163 (void) munmap(map, sbuf.st_size);
132 164 errno = EINVAL;
133 165 return (_LDP_ERROR);
134 166 }
135 167
136 168 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
137 169 (sizeof (collate_chain_t) * chains) +
138 170 (sizeof (collate_large_t) * info->large_count);
139 171 for (z = 0; z < (info->directive_count); z++) {
140 172 i += sizeof (collate_subst_t) * info->subst_count[z];
141 173 }
142 174 if (i != (sbuf.st_size - (TMP - map))) {
143 175 (void) munmap(map, sbuf.st_size);
144 176 errno = EINVAL;
145 177 return (_LDP_ERROR);
146 178 }
147 179
148 180 char_pri_table = (void *)TMP;
149 181 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
150 182
151 183 for (z = 0; z < info->directive_count; z++) {
152 184 if (info->subst_count[z] > 0) {
153 185 subst_table[z] = (void *)TMP;
154 186 TMP += info->subst_count[z] * sizeof (collate_subst_t);
155 187 } else {
156 188 subst_table[z] = NULL;
157 189 }
158 190 }
159 191
160 192 if (chains > 0) {
161 193 chain_pri_table = (void *)TMP;
162 194 TMP += chains * sizeof (collate_chain_t);
163 195 } else
164 196 chain_pri_table = NULL;
165 197 if (info->large_count > 0)
166 198 large_pri_table = (void *)TMP;
167 199 else
168 200 large_pri_table = NULL;
169 201
170 202 (void) strlcpy(collate_encoding, encoding, ENCODING_LEN);
171 203 _collate_info = info;
172 204
173 205 if (cache)
174 206 (void) munmap(cache, cachesz);
175 207
176 208 cache = map;
177 209 cachesz = sbuf.st_size;
178 210 _collate_load_error = 0;
179 211
180 212 return (_LDP_LOADED);
181 213 }
182 214
183 215 static int32_t *
184 216 substsearch(const wchar_t key, int pass)
185 217 {
186 218 collate_subst_t *p;
187 219 int n = _collate_info->subst_count[pass];
188 220
189 221 if (n == 0)
190 222 return (NULL);
191 223
192 224 if (pass >= _collate_info->directive_count)
193 225 return (NULL);
194 226
195 227 if (!(key & COLLATE_SUBST_PRIORITY))
196 228 return (NULL);
197 229
198 230 p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
199 231 assert(p->key == key);
200 232 return (p->pri);
201 233 }
202 234
203 235 /*
204 236 * Note: for performance reasons, we have expanded bsearch here. This avoids
205 237 * function call overhead with each comparison.
206 238 */
207 239
208 240 static collate_chain_t *
209 241 chainsearch(const wchar_t *key, int *len)
210 242 {
211 243 int low;
212 244 int high;
213 245 int next, compar, l;
214 246 collate_chain_t *p;
215 247 collate_chain_t *tab;
216 248
217 249 if (_collate_info->chain_count == 0)
218 250 return (NULL);
219 251
220 252 low = 0;
221 253 high = _collate_info->chain_count - 1;
222 254 tab = chain_pri_table;
223 255
224 256 while (low <= high) {
225 257 next = (low + high) / 2;
226 258 p = tab + next;
227 259 compar = *key - *p->str;
228 260 if (compar == 0) {
229 261 l = wcsnlen(p->str, COLLATE_STR_LEN);
230 262 compar = wcsncmp(key, p->str, l);
231 263 if (compar == 0) {
232 264 *len = l;
233 265 return (p);
234 266 }
235 267 }
236 268 if (compar > 0)
237 269 low = next + 1;
238 270 else
239 271 high = next - 1;
240 272 }
241 273 return (NULL);
242 274 }
243 275
244 276 static collate_large_t *
245 277 largesearch(const wchar_t key)
246 278 {
247 279 int low = 0;
248 280 int high = _collate_info->large_count - 1;
249 281 int next, compar;
250 282 collate_large_t *p;
251 283 collate_large_t *tab = large_pri_table;
252 284
253 285 if (_collate_info->large_count == 0)
254 286 return (NULL);
255 287
256 288 while (low <= high) {
257 289 next = (low + high) / 2;
258 290 p = tab + next;
259 291 compar = key - p->val;
260 292 if (compar == 0)
261 293 return (p);
262 294 if (compar > 0)
263 295 low = next + 1;
264 296 else
265 297 high = next - 1;
266 298 }
267 299 return (NULL);
268 300 }
269 301
270 302 void
271 303 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state)
272 304 {
273 305 collate_chain_t *p2;
274 306 collate_large_t *match;
275 307 collate_info_t *info = _collate_info;
276 308 int p, l;
277 309 int *sptr;
278 310
279 311 /*
280 312 * If this is the "last" pass for the UNDEFINED, then
281 313 * we just return the priority itself.
282 314 */
283 315 if (which >= info->directive_count) {
284 316 *pri = *t;
285 317 *len = 1;
286 318 *state = NULL;
287 319 return;
288 320 }
289 321
290 322 /*
291 323 * If we have remaining substitution data from a previous
292 324 * call, consume it first.
293 325 */
294 326 if ((sptr = *state) != NULL) {
295 327 *pri = *sptr;
296 328 sptr++;
297 329 *state = *sptr ? sptr : NULL;
298 330 *len = 0;
299 331 return;
300 332 }
301 333
302 334 /* No active substitutions */
303 335 *len = 1;
304 336
305 337 /*
306 338 * Check for composites such as diphthongs that collate as a
307 339 * single element (aka chains or collating-elements).
308 340 */
309 341 if (((p2 = chainsearch(t, &l)) != NULL) &&
310 342 ((p = p2->pri[which]) >= 0)) {
311 343
312 344 *len = l;
313 345 *pri = p;
314 346
315 347 } else if (*t <= UCHAR_MAX) {
316 348
317 349 /*
318 350 * Character is a small (8-bit) character.
319 351 * We just look these up directly for speed.
320 352 */
321 353 *pri = char_pri_table[*t].pri[which];
322 354
323 355 } else if ((info->large_count > 0) &&
324 356 ((match = largesearch(*t)) != NULL)) {
325 357
326 358 /*
327 359 * Character was found in the extended table.
328 360 */
329 361 *pri = match->pri.pri[which];
330 362
331 363 } else {
332 364 /*
333 365 * Character lacks a specific definition.
334 366 */
335 367 if (info->directive[which] & DIRECTIVE_UNDEFINED) {
336 368 /* Mask off sign bit to prevent ordering confusion. */
337 369 *pri = (*t & COLLATE_MAX_PRIORITY);
338 370 } else {
339 371 *pri = info->undef_pri[which];
340 372 }
341 373 /* No substitutions for undefined characters! */
342 374 return;
343 375 }
344 376
345 377 /*
346 378 * Try substituting (expanding) the character. We are
347 379 * currently doing this *after* the chain compression. I
348 380 * think it should not matter, but this way might be slightly
349 381 * faster.
350 382 *
351 383 * We do this after the priority search, as this will help us
352 384 * to identify a single key value. In order for this to work,
353 385 * it's important that the priority assigned to a given element
354 386 * to be substituted be unique for that level. The localedef
355 387 * code ensures this for us.
356 388 */
357 389 if ((sptr = substsearch(*pri, which)) != NULL) {
358 390 if ((*pri = *sptr) != 0) {
359 391 sptr++;
360 392 *state = *sptr ? sptr : NULL;
361 393 }
362 394 }
363 395
364 396 }
365 397
366 398 /*
367 399 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
368 400 * NOT NULL terminate. That is left to the caller.
369 401 */
370 402 size_t
371 403 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room)
372 404 {
373 405 int pri;
374 406 int len;
375 407 const wchar_t *t;
376 408 wchar_t *tr = NULL;
377 409 int direc;
378 410 int pass;
379 411 int32_t *state;
380 412 size_t want = 0;
381 413 size_t need = 0;
382 414
383 415 assert(src);
384 416
385 417 for (pass = 0; pass <= _collate_info->directive_count; pass++) {
386 418
387 419 state = NULL;
388 420
389 421 if (pass != 0) {
390 422 /* insert level separator from the previous pass */
391 423 if (room) {
392 424 *xf++ = 1;
393 425 room--;
394 426 }
395 427 want++;
396 428 }
397 429
398 430 /* special pass for undefined */
399 431 if (pass == _collate_info->directive_count) {
400 432 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
401 433 } else {
402 434 direc = _collate_info->directive[pass];
403 435 }
404 436
405 437 t = src;
406 438
407 439 if (direc & DIRECTIVE_BACKWARD) {
408 440 wchar_t *bp, *fp, c;
409 441 if (tr)
410 442 free(tr);
411 443 if ((tr = wcsdup(t)) == NULL) {
412 444 errno = ENOMEM;
413 445 goto fail;
414 446 }
415 447 bp = tr;
416 448 fp = tr + wcslen(tr) - 1;
417 449 while (bp < fp) {
418 450 c = *bp;
419 451 *bp++ = *fp;
420 452 *fp-- = c;
421 453 }
422 454 t = (const wchar_t *)tr;
423 455 }
424 456
425 457 if (direc & DIRECTIVE_POSITION) {
426 458 while (*t || state) {
427 459 _collate_lookup(t, &len, &pri, pass, &state);
428 460 t += len;
429 461 if (pri <= 0) {
430 462 if (pri < 0) {
431 463 errno = EINVAL;
432 464 goto fail;
433 465 }
434 466 pri = COLLATE_MAX_PRIORITY;
435 467 }
436 468 if (room) {
437 469 *xf++ = pri;
438 470 room--;
439 471 }
440 472 want++;
441 473 need = want;
442 474 }
443 475 } else {
444 476 while (*t || state) {
445 477 _collate_lookup(t, &len, &pri, pass, &state);
446 478 t += len;
447 479 if (pri <= 0) {
448 480 if (pri < 0) {
449 481 errno = EINVAL;
450 482 goto fail;
451 483 }
452 484 continue;
453 485 }
454 486 if (room) {
455 487 *xf++ = pri;
456 488 room--;
457 489 }
458 490 want++;
459 491 need = want;
460 492 }
461 493 }
462 494 }
463 495
464 496 end:
465 497 if (tr)
466 498 free(tr);
467 499 return (need);
468 500
469 501 fail:
470 502 if (tr)
471 503 free(tr);
472 504 return ((size_t)(-1));
473 505 }
474 506
475 507 /*
476 508 * In the non-POSIX case, we transform each character into a string of
477 509 * characters representing the character's priority. Since char is usually
478 510 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
479 511 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
480 512 * bits per byte.
481 513 *
482 514 * It turns out that we sometimes have real priorities that are
483 515 * 31-bits wide. (But: be careful using priorities where the high
484 516 * order bit is set -- i.e. the priority is negative. The sort order
485 517 * may be surprising!)
486 518 *
487 519 * TODO: This would be a good area to optimize somewhat. It turns out
488 520 * that real priorities *except for the last UNDEFINED pass* are generally
489 521 * very small. We need the localedef code to precalculate the max
490 522 * priority for us, and ideally also give us a mask, and then we could
491 523 * severely limit what we expand to.
492 524 */
493 525 #define XFRM_BYTES 6
494 526 #define XFRM_OFFSET ('0') /* make all printable characters */
495 527 #define XFRM_SHIFT 6
496 528 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
497 529 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
498 530
499 531 static int
500 532 xfrm(unsigned char *p, int pri, int pass)
501 533 {
502 534 /* we use unsigned to ensure zero fill on right shift */
503 535 uint32_t val = (uint32_t)_collate_info->pri_count[pass];
504 536 int nc = 0;
505 537
506 538 while (val) {
507 539 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
508 540 pri >>= XFRM_SHIFT;
509 541 val >>= XFRM_SHIFT;
510 542 p++;
511 543 nc++;
512 544 }
513 545 return (nc);
514 546 }
515 547
516 548 size_t
517 549 _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
518 550 {
519 551 int pri;
520 552 int len;
521 553 const wchar_t *t;
522 554 wchar_t *tr = NULL;
523 555 int direc;
524 556 int pass;
525 557 int32_t *state;
526 558 size_t want = 0;
527 559 size_t need = 0;
528 560 int b;
529 561 uint8_t buf[XFRM_BYTES];
530 562
531 563 assert(src);
532 564
533 565 for (pass = 0; pass <= _collate_info->directive_count; pass++) {
534 566
535 567 state = NULL;
536 568
537 569 if (pass != 0) {
538 570 /* insert level separator from the previous pass */
539 571 if (room) {
540 572 *xf++ = XFRM_SEP;
541 573 room--;
542 574 }
543 575 want++;
544 576 }
545 577
546 578 /* special pass for undefined */
547 579 if (pass == _collate_info->directive_count) {
548 580 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
549 581 } else {
550 582 direc = _collate_info->directive[pass];
551 583 }
552 584
553 585 t = src;
554 586
555 587 if (direc & DIRECTIVE_BACKWARD) {
556 588 wchar_t *bp, *fp, c;
557 589 if (tr)
558 590 free(tr);
559 591 if ((tr = wcsdup(t)) == NULL) {
560 592 errno = ENOMEM;
561 593 goto fail;
562 594 }
563 595 bp = tr;
564 596 fp = tr + wcslen(tr) - 1;
565 597 while (bp < fp) {
566 598 c = *bp;
567 599 *bp++ = *fp;
568 600 *fp-- = c;
569 601 }
570 602 t = (const wchar_t *)tr;
571 603 }
572 604
573 605 if (direc & DIRECTIVE_POSITION) {
574 606 while (*t || state) {
575 607
576 608 _collate_lookup(t, &len, &pri, pass, &state);
577 609 t += len;
578 610 if (pri <= 0) {
579 611 if (pri < 0) {
580 612 errno = EINVAL;
581 613 goto fail;
582 614 }
583 615 pri = COLLATE_MAX_PRIORITY;
584 616 }
585 617
586 618 b = xfrm(buf, pri, pass);
587 619 want += b;
588 620 if (room) {
589 621 while (b) {
590 622 b--;
591 623 if (room) {
592 624 *xf++ = buf[b];
593 625 room--;
594 626 }
595 627 }
596 628 }
597 629 need = want;
598 630 }
599 631 } else {
600 632 while (*t || state) {
601 633 _collate_lookup(t, &len, &pri, pass, &state);
602 634 t += len;
603 635 if (pri <= 0) {
604 636 if (pri < 0) {
605 637 errno = EINVAL;
606 638 goto fail;
607 639 }
608 640 continue;
609 641 }
610 642
611 643 b = xfrm(buf, pri, pass);
612 644 want += b;
613 645 if (room) {
614 646
615 647 while (b) {
616 648 b--;
617 649 if (room) {
618 650 *xf++ = buf[b];
619 651 room--;
620 652 }
621 653 }
622 654 }
623 655 need = want;
624 656 }
625 657 }
626 658 }
627 659
628 660 end:
629 661 if (tr)
630 662 free(tr);
631 663 return (need);
632 664
633 665 fail:
634 666 if (tr)
635 667 free(tr);
636 668 return ((size_t)(-1));
637 669 }
↓ open down ↓ |
560 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX