Print this page
3154 Nonconforming tolower and toupper with UTF-8 locales
Reviewed by: Garrett D'Amore <garrett.damore@gmail.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/localedef/ctype.c
+++ new/usr/src/cmd/localedef/ctype.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved.
14 14 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
15 15 */
16 16
17 17 /*
18 18 * LC_CTYPE database generation routines for localedef.
19 19 */
20 20
21 21 #include <stdio.h>
22 22 #include <stdlib.h>
23 23 #include <string.h>
24 24 #include <sys/types.h>
25 25 #include <sys/avl.h>
26 26 #include <wchar.h>
27 27 #include <ctype.h>
28 28 #include <wctype.h>
29 29 #include <unistd.h>
30 30 #include "localedef.h"
31 31 #include "parser.tab.h"
32 32 #include "runefile.h"
33 33
34 34 static avl_tree_t ctypes;
35 35
36 36 static wchar_t last_ctype;
37 37
38 38 typedef struct ctype_node {
39 39 wchar_t wc;
40 40 int32_t ctype;
41 41 int32_t toupper;
42 42 int32_t tolower;
43 43 avl_node_t avl;
44 44 } ctype_node_t;
45 45
46 46 static int
47 47 ctype_compare(const void *n1, const void *n2)
48 48 {
49 49 const ctype_node_t *c1 = n1;
50 50 const ctype_node_t *c2 = n2;
51 51
52 52 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
53 53 }
54 54
55 55 void
56 56 init_ctype(void)
57 57 {
58 58 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
59 59 offsetof(ctype_node_t, avl));
60 60 }
61 61
62 62
63 63 static void
64 64 add_ctype_impl(ctype_node_t *ctn)
65 65 {
66 66 switch (last_kw) {
67 67 case T_ISUPPER:
68 68 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
69 69 break;
70 70 case T_ISLOWER:
71 71 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
72 72 break;
73 73 case T_ISALPHA:
74 74 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
75 75 break;
76 76 case T_ISDIGIT:
77 77 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
78 78 break;
79 79 case T_ISSPACE:
80 80 ctn->ctype |= _ISSPACE;
81 81 break;
82 82 case T_ISCNTRL:
83 83 ctn->ctype |= _ISCNTRL;
84 84 break;
85 85 case T_ISGRAPH:
86 86 ctn->ctype |= (_ISGRAPH | _ISPRINT);
87 87 break;
88 88 case T_ISPRINT:
89 89 ctn->ctype |= _ISPRINT;
90 90 break;
91 91 case T_ISPUNCT:
92 92 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
93 93 break;
94 94 case T_ISXDIGIT:
95 95 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
96 96 break;
97 97 case T_ISBLANK:
98 98 ctn->ctype |= (_ISBLANK | _ISSPACE);
99 99 break;
100 100 case T_ISPHONOGRAM:
101 101 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
102 102 break;
103 103 case T_ISIDEOGRAM:
104 104 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
105 105 break;
106 106 case T_ISENGLISH:
107 107 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
108 108 break;
109 109 case T_ISNUMBER:
110 110 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
111 111 break;
112 112 case T_ISSPECIAL:
113 113 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
114 114 break;
115 115 case T_ISALNUM:
116 116 /*
117 117 * We can't do anything with this. The character
118 118 * should already be specified as a digit or alpha.
119 119 */
120 120 break;
121 121 default:
122 122 errf(_("not a valid character class"));
123 123 }
124 124 }
125 125
126 126 static ctype_node_t *
127 127 get_ctype(wchar_t wc)
128 128 {
129 129 ctype_node_t srch;
130 130 ctype_node_t *ctn;
131 131 avl_index_t where;
132 132
133 133 srch.wc = wc;
134 134 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
135 135 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
136 136 errf(_("out of memory"));
137 137 return (NULL);
138 138 }
139 139 ctn->wc = wc;
140 140
141 141 avl_insert(&ctypes, ctn, where);
142 142 }
143 143 return (ctn);
144 144 }
145 145
146 146 void
147 147 add_ctype(int val)
148 148 {
149 149 ctype_node_t *ctn;
150 150
151 151 if ((ctn = get_ctype(val)) == NULL) {
152 152 INTERR;
153 153 return;
154 154 }
155 155 add_ctype_impl(ctn);
156 156 last_ctype = ctn->wc;
157 157 }
158 158
159 159 void
160 160 add_ctype_range(int end)
161 161 {
162 162 ctype_node_t *ctn;
163 163 wchar_t cur;
164 164
165 165 if (end < last_ctype) {
166 166 errf(_("malformed character range (%u ... %u))"),
167 167 last_ctype, end);
168 168 return;
169 169 }
170 170 for (cur = last_ctype + 1; cur <= end; cur++) {
171 171 if ((ctn = get_ctype(cur)) == NULL) {
172 172 INTERR;
173 173 return;
174 174 }
175 175 add_ctype_impl(ctn);
176 176 }
177 177 last_ctype = end;
178 178
179 179 }
180 180
181 181 void
182 182 add_caseconv(int val, int wc)
183 183 {
184 184 ctype_node_t *ctn;
185 185
186 186 ctn = get_ctype(val);
187 187 if (ctn == NULL) {
188 188 INTERR;
189 189 return;
190 190 }
191 191
192 192 switch (last_kw) {
193 193 case T_TOUPPER:
194 194 ctn->toupper = wc;
195 195 break;
196 196 case T_TOLOWER:
197 197 ctn->tolower = wc;
198 198 break;
199 199 default:
200 200 INTERR;
201 201 break;
202 202 }
203 203 }
↓ open down ↓ |
203 lines elided |
↑ open up ↑ |
204 204
205 205 void
206 206 dump_ctype(void)
207 207 {
208 208 FILE *f;
209 209 _FileRuneLocale rl;
210 210 ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
211 211 _FileRuneEntry *ct = NULL;
212 212 _FileRuneEntry *lo = NULL;
213 213 _FileRuneEntry *up = NULL;
214 + wchar_t wc;
214 215
215 216 (void) memset(&rl, 0, sizeof (rl));
216 217 last_ct = NULL;
217 218 last_lo = NULL;
218 219 last_up = NULL;
219 220
220 221 if ((f = open_category()) == NULL)
221 222 return;
222 223
223 224 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
224 225 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
225 226
226 - for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
227 + /*
228 + * Initialize the identity map.
229 + */
230 + for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
231 + rl.maplower[wc] = wc;
232 + rl.mapupper[wc] = wc;
233 + }
227 234
228 - wchar_t wc = ctn->wc;
235 + for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
229 236 int conflict = 0;
230 237
238 + wc = ctn->wc;
239 +
231 240 /*
232 241 * POSIX requires certain portable characters have
233 242 * certain types. Add them if they are missing.
234 243 */
235 244 if ((wc >= 1) && (wc <= 127)) {
236 245 if ((wc >= 'A') && (wc <= 'Z'))
237 246 ctn->ctype |= _ISUPPER;
238 247 if ((wc >= 'a') && (wc <= 'z'))
239 248 ctn->ctype |= _ISLOWER;
240 249 if ((wc >= '0') && (wc <= '9'))
241 250 ctn->ctype |= _ISDIGIT;
242 251 if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
243 252 ctn->ctype |= _ISSPACE;
244 253 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
245 254 ctn->ctype |= _ISXDIGIT;
246 255 if (strchr(" \t", (char)wc))
247 256 ctn->ctype |= _ISBLANK;
248 257
249 258 /*
250 259 * Technically these settings are only
251 260 * required for the C locale. However, it
252 261 * turns out that because of the historical
253 262 * version of isprint(), we need them for all
254 263 * locales as well. Note that these are not
255 264 * necessarily valid punctuation characters in
256 265 * the current language, but ispunct() needs
257 266 * to return TRUE for them.
258 267 */
259 268 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
260 269 (char)wc))
261 270 ctn->ctype |= _ISPUNCT;
262 271 }
263 272
264 273 /*
265 274 * POSIX also requires that certain types imply
266 275 * others. Add any inferred types here.
267 276 */
268 277 if (ctn->ctype & (_ISUPPER |_ISLOWER))
269 278 ctn->ctype |= _ISALPHA;
270 279 if (ctn->ctype & _ISDIGIT)
271 280 ctn->ctype |= _ISXDIGIT;
272 281 if (ctn->ctype & _ISBLANK)
273 282 ctn->ctype |= _ISSPACE;
274 283 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
275 284 ctn->ctype |= _ISGRAPH;
276 285 if (ctn->ctype & _ISGRAPH)
277 286 ctn->ctype |= _ISPRINT;
278 287
279 288 /*
280 289 * Finally, POSIX requires that certain combinations
281 290 * are invalid. We don't flag this as a fatal error,
282 291 * but we will warn about it.
283 292 */
284 293 if ((ctn->ctype & _ISALPHA) &&
285 294 (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
286 295 conflict++;
287 296 if ((ctn->ctype & _ISPUNCT) &
288 297 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
289 298 conflict++;
290 299 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
291 300 conflict++;
292 301 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
293 302 conflict++;
294 303 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
295 304 conflict++;
296 305
297 306 if (conflict) {
↓ open down ↓ |
57 lines elided |
↑ open up ↑ |
298 307 warn("conflicting classes for character 0x%x (%x)",
299 308 wc, ctn->ctype);
300 309 }
301 310 /*
302 311 * Handle the lower 256 characters using the simple
303 312 * optimization. Note that if we have not defined the
304 313 * upper/lower case, then we identity map it.
305 314 */
306 315 if ((unsigned)wc < _CACHED_RUNES) {
307 316 rl.runetype[wc] = ctn->ctype;
308 - rl.maplower[wc] = ctn->tolower ? ctn->tolower : wc;
309 - rl.mapupper[wc] = ctn->toupper ? ctn->toupper : wc;
317 + if (ctn->tolower)
318 + rl.maplower[wc] = ctn->tolower;
319 + if (ctn->toupper)
320 + rl.mapupper[wc] = ctn->toupper;
310 321 continue;
311 322 }
312 323
313 324 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
314 325 ct[rl.runetype_ext_nranges-1].max = wc;
315 326 last_ct = ctn;
316 327 } else {
317 328 rl.runetype_ext_nranges++;
318 329 ct = realloc(ct,
319 330 sizeof (*ct) * rl.runetype_ext_nranges);
320 331 ct[rl.runetype_ext_nranges - 1].min = wc;
321 332 ct[rl.runetype_ext_nranges - 1].max = wc;
322 333 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
323 334 last_ct = ctn;
324 335 }
325 336 if (ctn->tolower == 0) {
326 337 last_lo = NULL;
327 338 } else if ((last_lo != NULL) &&
328 339 (last_lo->tolower + 1 == ctn->tolower)) {
329 340 lo[rl.maplower_ext_nranges-1].max = wc;
330 341 last_lo = ctn;
331 342 } else {
332 343 rl.maplower_ext_nranges++;
333 344 lo = realloc(lo,
334 345 sizeof (*lo) * rl.maplower_ext_nranges);
335 346 lo[rl.maplower_ext_nranges - 1].min = wc;
336 347 lo[rl.maplower_ext_nranges - 1].max = wc;
337 348 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
338 349 last_lo = ctn;
339 350 }
340 351
341 352 if (ctn->toupper == 0) {
342 353 last_up = NULL;
343 354 } else if ((last_up != NULL) &&
344 355 (last_up->toupper + 1 == ctn->toupper)) {
345 356 up[rl.mapupper_ext_nranges-1].max = wc;
346 357 last_up = ctn;
347 358 } else {
348 359 rl.mapupper_ext_nranges++;
349 360 up = realloc(up,
350 361 sizeof (*up) * rl.mapupper_ext_nranges);
351 362 up[rl.mapupper_ext_nranges - 1].min = wc;
352 363 up[rl.mapupper_ext_nranges - 1].max = wc;
353 364 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
354 365 last_up = ctn;
355 366 }
356 367 }
357 368
358 369 if ((wr_category(&rl, sizeof (rl), f) < 0) ||
359 370 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
360 371 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
361 372 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
362 373 return;
363 374 }
364 375
365 376 close_category(f);
366 377 }
↓ open down ↓ |
47 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX