1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
15 */
16
17 /*
18 * LC_CTYPE database generation routines for localedef.
19 */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <wchar.h>
27 #include <ctype.h>
28 #include <wctype.h>
29 #include <unistd.h>
30 #include "localedef.h"
31 #include "parser.tab.h"
32 #include "runefile.h"
33
34 static avl_tree_t ctypes;
35
36 static wchar_t last_ctype;
37
38 typedef struct ctype_node {
39 wchar_t wc;
40 int32_t ctype;
41 int32_t toupper;
42 int32_t tolower;
43 avl_node_t avl;
44 } ctype_node_t;
45
46 static int
47 ctype_compare(const void *n1, const void *n2)
48 {
49 const ctype_node_t *c1 = n1;
50 const ctype_node_t *c2 = n2;
51
52 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
53 }
54
55 void
56 init_ctype(void)
57 {
58 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
59 offsetof(ctype_node_t, avl));
60 }
61
62
63 static void
64 add_ctype_impl(ctype_node_t *ctn)
65 {
66 switch (last_kw) {
67 case T_ISUPPER:
68 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
69 break;
70 case T_ISLOWER:
71 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
72 break;
73 case T_ISALPHA:
74 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
75 break;
76 case T_ISDIGIT:
77 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
78 break;
79 case T_ISSPACE:
80 ctn->ctype |= _ISSPACE;
81 break;
82 case T_ISCNTRL:
83 ctn->ctype |= _ISCNTRL;
84 break;
85 case T_ISGRAPH:
86 ctn->ctype |= (_ISGRAPH | _ISPRINT);
87 break;
88 case T_ISPRINT:
89 ctn->ctype |= _ISPRINT;
90 break;
91 case T_ISPUNCT:
92 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
93 break;
94 case T_ISXDIGIT:
95 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
96 break;
97 case T_ISBLANK:
98 ctn->ctype |= (_ISBLANK | _ISSPACE);
99 break;
100 case T_ISPHONOGRAM:
101 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
102 break;
103 case T_ISIDEOGRAM:
104 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
105 break;
106 case T_ISENGLISH:
107 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
108 break;
109 case T_ISNUMBER:
110 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
111 break;
112 case T_ISSPECIAL:
113 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
114 break;
115 case T_ISALNUM:
116 /*
117 * We can't do anything with this. The character
118 * should already be specified as a digit or alpha.
119 */
120 break;
121 default:
122 errf(_("not a valid character class"));
123 }
124 }
125
126 static ctype_node_t *
127 get_ctype(wchar_t wc)
128 {
129 ctype_node_t srch;
130 ctype_node_t *ctn;
131 avl_index_t where;
132
133 srch.wc = wc;
134 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
135 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
136 errf(_("out of memory"));
137 return (NULL);
138 }
139 ctn->wc = wc;
140
141 avl_insert(&ctypes, ctn, where);
142 }
143 return (ctn);
144 }
145
146 void
147 add_ctype(int val)
148 {
149 ctype_node_t *ctn;
150
151 if ((ctn = get_ctype(val)) == NULL) {
152 INTERR;
153 return;
154 }
155 add_ctype_impl(ctn);
156 last_ctype = ctn->wc;
157 }
158
159 void
160 add_ctype_range(int end)
161 {
162 ctype_node_t *ctn;
163 wchar_t cur;
164
165 if (end < last_ctype) {
166 errf(_("malformed character range (%u ... %u))"),
167 last_ctype, end);
168 return;
169 }
170 for (cur = last_ctype + 1; cur <= end; cur++) {
171 if ((ctn = get_ctype(cur)) == NULL) {
172 INTERR;
173 return;
174 }
175 add_ctype_impl(ctn);
176 }
177 last_ctype = end;
178
179 }
180
181 void
182 add_caseconv(int val, int wc)
183 {
184 ctype_node_t *ctn;
185
186 ctn = get_ctype(val);
187 if (ctn == NULL) {
188 INTERR;
189 return;
190 }
191
192 switch (last_kw) {
193 case T_TOUPPER:
194 ctn->toupper = wc;
195 break;
196 case T_TOLOWER:
197 ctn->tolower = wc;
198 break;
199 default:
200 INTERR;
201 break;
202 }
203 }
204
205 void
206 dump_ctype(void)
207 {
208 FILE *f;
209 _FileRuneLocale rl;
210 ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
211 _FileRuneEntry *ct = NULL;
212 _FileRuneEntry *lo = NULL;
213 _FileRuneEntry *up = NULL;
214 wchar_t wc;
215
216 (void) memset(&rl, 0, sizeof (rl));
217 last_ct = NULL;
218 last_lo = NULL;
219 last_up = NULL;
220
221 if ((f = open_category()) == NULL)
222 return;
223
224 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
225 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
226
227 /*
228 * Initialize the identity map.
229 */
230 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
231 rl.maplower[wc] = wc;
232 rl.mapupper[wc] = wc;
233 }
234
235 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
236 int conflict = 0;
237
238 wc = ctn->wc;
239
240 /*
241 * POSIX requires certain portable characters have
242 * certain types. Add them if they are missing.
243 */
244 if ((wc >= 1) && (wc <= 127)) {
245 if ((wc >= 'A') && (wc <= 'Z'))
246 ctn->ctype |= _ISUPPER;
247 if ((wc >= 'a') && (wc <= 'z'))
248 ctn->ctype |= _ISLOWER;
249 if ((wc >= '0') && (wc <= '9'))
250 ctn->ctype |= _ISDIGIT;
251 if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
252 ctn->ctype |= _ISSPACE;
253 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
254 ctn->ctype |= _ISXDIGIT;
255 if (strchr(" \t", (char)wc))
256 ctn->ctype |= _ISBLANK;
257
258 /*
259 * Technically these settings are only
260 * required for the C locale. However, it
261 * turns out that because of the historical
262 * version of isprint(), we need them for all
263 * locales as well. Note that these are not
264 * necessarily valid punctation characters in
265 * the current language, but ispunct() needs
266 * to return TRUE for them.
267 */
268 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
269 (char)wc))
270 ctn->ctype |= _ISPUNCT;
271 }
272
273 /*
274 * POSIX also requires that certain types imply
275 * others. Add any inferred types here.
276 */
277 if (ctn->ctype & (_ISUPPER |_ISLOWER))
278 ctn->ctype |= _ISALPHA;
279 if (ctn->ctype & _ISDIGIT)
280 ctn->ctype |= _ISXDIGIT;
281 if (ctn->ctype & _ISBLANK)
282 ctn->ctype |= _ISSPACE;
283 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
284 ctn->ctype |= _ISGRAPH;
285 if (ctn->ctype & _ISGRAPH)
286 ctn->ctype |= _ISPRINT;
287
288 /*
289 * Finally, POSIX requires that certain combinations
290 * are invalid. We don't flag this as a fatal error,
291 * but we will warn about.
292 */
293 if ((ctn->ctype & _ISALPHA) &&
294 (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
295 conflict++;
296 if ((ctn->ctype & _ISPUNCT) &
297 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
298 conflict++;
299 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
300 conflict++;
301 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
302 conflict++;
303 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
304 conflict++;
305
306 if (conflict) {
307 warn("conflicting classes for character 0x%x (%x)",
308 wc, ctn->ctype);
309 }
310 /*
311 * Handle the lower 256 characters using the simple
312 * optimization. Note that if we have not defined the
313 * upper/lower case, then we identity map it.
314 */
315 if ((unsigned)wc < _CACHED_RUNES) {
316 rl.runetype[wc] = ctn->ctype;
317 if (ctn->tolower)
318 rl.maplower[wc] = ctn->tolower;
319 if (ctn->toupper)
320 rl.mapupper[wc] = ctn->toupper;
321 continue;
322 }
323
324 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
325 ct[rl.runetype_ext_nranges-1].max = wc;
326 last_ct = ctn;
327 } else {
328 rl.runetype_ext_nranges++;
329 ct = realloc(ct,
330 sizeof (*ct) * rl.runetype_ext_nranges);
331 ct[rl.runetype_ext_nranges - 1].min = wc;
332 ct[rl.runetype_ext_nranges - 1].max = wc;
333 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
334 last_ct = ctn;
335 }
336 if (ctn->tolower == 0) {
337 last_lo = NULL;
338 } else if ((last_lo != NULL) &&
339 (last_lo->tolower + 1 == ctn->tolower)) {
340 lo[rl.maplower_ext_nranges-1].max = wc;
341 last_lo = ctn;
342 } else {
343 rl.maplower_ext_nranges++;
344 lo = realloc(lo,
345 sizeof (*lo) * rl.maplower_ext_nranges);
346 lo[rl.maplower_ext_nranges - 1].min = wc;
347 lo[rl.maplower_ext_nranges - 1].max = wc;
348 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
349 last_lo = ctn;
350 }
351
352 if (ctn->toupper == 0) {
353 last_up = NULL;
354 } else if ((last_up != NULL) &&
355 (last_up->toupper + 1 == ctn->toupper)) {
356 up[rl.mapupper_ext_nranges-1].max = wc;
357 last_up = ctn;
358 } else {
359 rl.mapupper_ext_nranges++;
360 up = realloc(up,
361 sizeof (*up) * rl.mapupper_ext_nranges);
362 up[rl.mapupper_ext_nranges - 1].min = wc;
363 up[rl.mapupper_ext_nranges - 1].max = wc;
364 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
365 last_up = ctn;
366 }
367 }
368
369 if ((wr_category(&rl, sizeof (rl), f) < 0) ||
370 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
371 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
372 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
373 return;
374 }
375
376 close_category(f);
377 }