1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
15 */
16
17 /*
18 * LC_CTYPE database generation routines for localedef.
19 */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <wchar.h>
27 #include <ctype.h>
28 #include <wctype.h>
29 #include <unistd.h>
30 #include "localedef.h"
31 #include "parser.tab.h"
32 #include "runefile.h"
33
34 static avl_tree_t ctypes;
35
36 static wchar_t last_ctype;
37
38 typedef struct ctype_node {
39 wchar_t wc;
40 int32_t ctype;
41 int32_t toupper;
42 int32_t tolower;
43 avl_node_t avl;
44 } ctype_node_t;
45
46 static int
47 ctype_compare(const void *n1, const void *n2)
48 {
49 const ctype_node_t *c1 = n1;
50 const ctype_node_t *c2 = n2;
51
52 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
53 }
54
55 void
56 init_ctype(void)
57 {
58 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
59 offsetof(ctype_node_t, avl));
60 }
61
62
63 static void
64 add_ctype_impl(ctype_node_t *ctn)
65 {
66 switch (last_kw) {
67 case T_ISUPPER:
68 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
69 break;
70 case T_ISLOWER:
71 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
72 break;
73 case T_ISALPHA:
74 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
75 break;
76 case T_ISDIGIT:
77 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
78 break;
79 case T_ISSPACE:
80 ctn->ctype |= _ISSPACE;
81 break;
82 case T_ISCNTRL:
83 ctn->ctype |= _ISCNTRL;
84 break;
85 case T_ISGRAPH:
86 ctn->ctype |= (_ISGRAPH | _ISPRINT);
87 break;
88 case T_ISPRINT:
89 ctn->ctype |= _ISPRINT;
90 break;
91 case T_ISPUNCT:
92 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
93 break;
94 case T_ISXDIGIT:
95 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
96 break;
97 case T_ISBLANK:
98 ctn->ctype |= (_ISBLANK | _ISSPACE);
99 break;
100 case T_ISPHONOGRAM:
101 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
102 break;
103 case T_ISIDEOGRAM:
104 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
105 break;
106 case T_ISENGLISH:
107 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
108 break;
109 case T_ISNUMBER:
110 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
111 break;
112 case T_ISSPECIAL:
113 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
114 break;
115 case T_ISALNUM:
116 /*
117 * We can't do anything with this. The character
118 * should already be specified as a digit or alpha.
119 */
120 break;
121 default:
122 errf(_("not a valid character class"));
123 }
124 }
125
126 static ctype_node_t *
127 get_ctype(wchar_t wc)
128 {
129 ctype_node_t srch;
130 ctype_node_t *ctn;
131 avl_index_t where;
132
133 srch.wc = wc;
134 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
135 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
136 errf(_("out of memory"));
137 return (NULL);
138 }
139 ctn->wc = wc;
140
141 avl_insert(&ctypes, ctn, where);
142 }
143 return (ctn);
144 }
145
146 void
147 add_ctype(int val)
148 {
149 ctype_node_t *ctn;
150
151 if ((ctn = get_ctype(val)) == NULL) {
152 INTERR;
153 return;
154 }
155 add_ctype_impl(ctn);
156 last_ctype = ctn->wc;
157 }
158
159 void
160 add_ctype_range(int end)
161 {
162 ctype_node_t *ctn;
163 wchar_t cur;
164
165 if (end < last_ctype) {
166 errf(_("malformed character range (%u ... %u))"),
167 last_ctype, end);
168 return;
169 }
170 for (cur = last_ctype + 1; cur <= end; cur++) {
171 if ((ctn = get_ctype(cur)) == NULL) {
172 INTERR;
173 return;
174 }
175 add_ctype_impl(ctn);
176 }
177 last_ctype = end;
178
179 }
180
181 void
182 add_caseconv(int val, int wc)
183 {
184 ctype_node_t *ctn;
185
186 ctn = get_ctype(val);
187 if (ctn == NULL) {
188 INTERR;
189 return;
190 }
191
192 switch (last_kw) {
193 case T_TOUPPER:
194 ctn->toupper = wc;
195 break;
196 case T_TOLOWER:
197 ctn->tolower = wc;
198 break;
199 default:
200 INTERR;
201 break;
202 }
203 }
204
205 void
206 dump_ctype(void)
207 {
208 FILE *f;
209 _FileRuneLocale rl;
210 ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
211 _FileRuneEntry *ct = NULL;
212 _FileRuneEntry *lo = NULL;
213 _FileRuneEntry *up = NULL;
214
215 (void) memset(&rl, 0, sizeof (rl));
216 last_ct = NULL;
217 last_lo = NULL;
218 last_up = NULL;
219
220 if ((f = open_category()) == NULL)
221 return;
222
223 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
224 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
225
226 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
227
228 wchar_t wc = ctn->wc;
229 int conflict = 0;
230
231 /*
232 * POSIX requires certain portable characters have
233 * certain types. Add them if they are missing.
234 */
235 if ((wc >= 1) && (wc <= 127)) {
236 if ((wc >= 'A') && (wc <= 'Z'))
237 ctn->ctype |= _ISUPPER;
238 if ((wc >= 'a') && (wc <= 'z'))
239 ctn->ctype |= _ISLOWER;
240 if ((wc >= '0') && (wc <= '9'))
241 ctn->ctype |= _ISDIGIT;
242 if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
243 ctn->ctype |= _ISSPACE;
244 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
245 ctn->ctype |= _ISXDIGIT;
246 if (strchr(" \t", (char)wc))
247 ctn->ctype |= _ISBLANK;
248
249 /*
250 * Technically these settings are only
251 * required for the C locale. However, it
252 * turns out that because of the historical
253 * version of isprint(), we need them for all
254 * locales as well. Note that these are not
255 * necessarily valid punctation characters in
256 * the current language, but ispunct() needs
257 * to return TRUE for them.
258 */
259 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
260 (char)wc))
261 ctn->ctype |= _ISPUNCT;
262 }
263
264 /*
265 * POSIX also requires that certain types imply
266 * others. Add any inferred types here.
267 */
268 if (ctn->ctype & (_ISUPPER |_ISLOWER))
269 ctn->ctype |= _ISALPHA;
270 if (ctn->ctype & _ISDIGIT)
271 ctn->ctype |= _ISXDIGIT;
272 if (ctn->ctype & _ISBLANK)
273 ctn->ctype |= _ISSPACE;
274 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
275 ctn->ctype |= _ISGRAPH;
276 if (ctn->ctype & _ISGRAPH)
277 ctn->ctype |= _ISPRINT;
278
279 /*
280 * Finally, POSIX requires that certain combinations
281 * are invalid. We don't flag this as a fatal error,
282 * but we will warn about.
283 */
284 if ((ctn->ctype & _ISALPHA) &&
285 (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
286 conflict++;
287 if ((ctn->ctype & _ISPUNCT) &
288 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
289 conflict++;
290 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
291 conflict++;
292 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
293 conflict++;
294 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
295 conflict++;
296
297 if (conflict) {
298 warn("conflicting classes for character 0x%x (%x)",
299 wc, ctn->ctype);
300 }
301 /*
302 * Handle the lower 256 characters using the simple
303 * optimization. Note that if we have not defined the
304 * upper/lower case, then we identity map it.
305 */
306 if ((unsigned)wc < _CACHED_RUNES) {
307 rl.runetype[wc] = ctn->ctype;
308 rl.maplower[wc] = ctn->tolower ? ctn->tolower : wc;
309 rl.mapupper[wc] = ctn->toupper ? ctn->toupper : wc;
310 continue;
311 }
312
313 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
314 ct[rl.runetype_ext_nranges-1].max = wc;
315 last_ct = ctn;
316 } else {
317 rl.runetype_ext_nranges++;
318 ct = realloc(ct,
319 sizeof (*ct) * rl.runetype_ext_nranges);
320 ct[rl.runetype_ext_nranges - 1].min = wc;
321 ct[rl.runetype_ext_nranges - 1].max = wc;
322 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
323 last_ct = ctn;
324 }
325 if (ctn->tolower == 0) {
326 last_lo = NULL;
327 } else if ((last_lo != NULL) &&
328 (last_lo->tolower + 1 == ctn->tolower)) {
329 lo[rl.maplower_ext_nranges-1].max = wc;
330 last_lo = ctn;
331 } else {
332 rl.maplower_ext_nranges++;
333 lo = realloc(lo,
334 sizeof (*lo) * rl.maplower_ext_nranges);
335 lo[rl.maplower_ext_nranges - 1].min = wc;
336 lo[rl.maplower_ext_nranges - 1].max = wc;
337 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
338 last_lo = ctn;
339 }
340
341 if (ctn->toupper == 0) {
342 last_up = NULL;
343 } else if ((last_up != NULL) &&
344 (last_up->toupper + 1 == ctn->toupper)) {
345 up[rl.mapupper_ext_nranges-1].max = wc;
346 last_up = ctn;
347 } else {
348 rl.mapupper_ext_nranges++;
349 up = realloc(up,
350 sizeof (*up) * rl.mapupper_ext_nranges);
351 up[rl.mapupper_ext_nranges - 1].min = wc;
352 up[rl.mapupper_ext_nranges - 1].max = wc;
353 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
354 last_up = ctn;
355 }
356 }
357
358 if ((wr_category(&rl, sizeof (rl), f) < 0) ||
359 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
360 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
361 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
362 return;
363 }
364
365 close_category(f);
366 }