1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved. 14 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 15 */ 16 17 /* 18 * LC_CTYPE database generation routines for localedef. 19 */ 20 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <string.h> 24 #include <sys/types.h> 25 #include <sys/avl.h> 26 #include <wchar.h> 27 #include <ctype.h> 28 #include <wctype.h> 29 #include <unistd.h> 30 #include "localedef.h" 31 #include "parser.tab.h" 32 #include "runefile.h" 33 34 static avl_tree_t ctypes; 35 36 static wchar_t last_ctype; 37 38 typedef struct ctype_node { 39 wchar_t wc; 40 int32_t ctype; 41 int32_t toupper; 42 int32_t tolower; 43 avl_node_t avl; 44 } ctype_node_t; 45 46 static int 47 ctype_compare(const void *n1, const void *n2) 48 { 49 const ctype_node_t *c1 = n1; 50 const ctype_node_t *c2 = n2; 51 52 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 53 } 54 55 void 56 init_ctype(void) 57 { 58 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), 59 offsetof(ctype_node_t, avl)); 60 } 61 62 63 static void 64 add_ctype_impl(ctype_node_t *ctn) 65 { 66 switch (last_kw) { 67 case T_ISUPPER: 68 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 69 break; 70 case T_ISLOWER: 71 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 72 break; 73 case T_ISALPHA: 74 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 75 break; 76 case T_ISDIGIT: 77 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); 78 break; 79 case T_ISSPACE: 80 ctn->ctype |= _ISSPACE; 81 break; 82 case T_ISCNTRL: 83 ctn->ctype |= _ISCNTRL; 84 break; 85 case T_ISGRAPH: 86 ctn->ctype |= (_ISGRAPH | _ISPRINT); 87 break; 88 case T_ISPRINT: 89 ctn->ctype |= _ISPRINT; 90 break; 91 case T_ISPUNCT: 92 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 93 break; 94 case T_ISXDIGIT: 95 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 96 break; 97 case T_ISBLANK: 98 ctn->ctype |= (_ISBLANK | _ISSPACE); 99 break; 100 case T_ISPHONOGRAM: 101 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 102 break; 103 case T_ISIDEOGRAM: 104 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 105 break; 106 case T_ISENGLISH: 107 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 108 break; 109 case T_ISNUMBER: 110 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 111 break; 112 case T_ISSPECIAL: 113 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 114 break; 115 case T_ISALNUM: 116 /* 117 * We can't do anything with this. The character 118 * should already be specified as a digit or alpha. 119 */ 120 break; 121 default: 122 errf(_("not a valid character class")); 123 } 124 } 125 126 static ctype_node_t * 127 get_ctype(wchar_t wc) 128 { 129 ctype_node_t srch; 130 ctype_node_t *ctn; 131 avl_index_t where; 132 133 srch.wc = wc; 134 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { 135 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 136 errf(_("out of memory")); 137 return (NULL); 138 } 139 ctn->wc = wc; 140 141 avl_insert(&ctypes, ctn, where); 142 } 143 return (ctn); 144 } 145 146 void 147 add_ctype(int val) 148 { 149 ctype_node_t *ctn; 150 151 if ((ctn = get_ctype(val)) == NULL) { 152 INTERR; 153 return; 154 } 155 add_ctype_impl(ctn); 156 last_ctype = ctn->wc; 157 } 158 159 void 160 add_ctype_range(int end) 161 { 162 ctype_node_t *ctn; 163 wchar_t cur; 164 165 if (end < last_ctype) { 166 errf(_("malformed character range (%u ... %u))"), 167 last_ctype, end); 168 return; 169 } 170 for (cur = last_ctype + 1; cur <= end; cur++) { 171 if ((ctn = get_ctype(cur)) == NULL) { 172 INTERR; 173 return; 174 } 175 add_ctype_impl(ctn); 176 } 177 last_ctype = end; 178 179 } 180 181 void 182 add_caseconv(int val, int wc) 183 { 184 ctype_node_t *ctn; 185 186 ctn = get_ctype(val); 187 if (ctn == NULL) { 188 INTERR; 189 return; 190 } 191 192 switch (last_kw) { 193 case T_TOUPPER: 194 ctn->toupper = wc; 195 break; 196 case T_TOLOWER: 197 ctn->tolower = wc; 198 break; 199 default: 200 INTERR; 201 break; 202 } 203 } 204 205 void 206 dump_ctype(void) 207 { 208 FILE *f; 209 _FileRuneLocale rl; 210 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 211 _FileRuneEntry *ct = NULL; 212 _FileRuneEntry *lo = NULL; 213 _FileRuneEntry *up = NULL; 214 wchar_t wc; 215 216 (void) memset(&rl, 0, sizeof (rl)); 217 last_ct = NULL; 218 last_lo = NULL; 219 last_up = NULL; 220 221 if ((f = open_category()) == NULL) 222 return; 223 224 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 225 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 226 227 /* 228 * Preinit the identity map. 229 */ 230 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 231 rl.maplower[wc] = wc; 232 rl.mapupper[wc] = wc; 233 } 234 235 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { 236 int conflict = 0; 237 238 wc = ctn->wc; 239 240 /* 241 * POSIX requires certain portable characters have 242 * certain types. Add them if they are missing. 243 */ 244 if ((wc >= 1) && (wc <= 127)) { 245 if ((wc >= 'A') && (wc <= 'Z')) 246 ctn->ctype |= _ISUPPER; 247 if ((wc >= 'a') && (wc <= 'z')) 248 ctn->ctype |= _ISLOWER; 249 if ((wc >= '0') && (wc <= '9')) 250 ctn->ctype |= _ISDIGIT; 251 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 252 ctn->ctype |= _ISSPACE; 253 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 254 ctn->ctype |= _ISXDIGIT; 255 if (strchr(" \t", (char)wc)) 256 ctn->ctype |= _ISBLANK; 257 258 /* 259 * Technically these settings are only 260 * required for the C locale. However, it 261 * turns out that because of the historical 262 * version of isprint(), we need them for all 263 * locales as well. Note that these are not 264 * necessarily valid punctation characters in 265 * the current language, but ispunct() needs 266 * to return TRUE for them. 267 */ 268 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 269 (char)wc)) 270 ctn->ctype |= _ISPUNCT; 271 } 272 273 /* 274 * POSIX also requires that certain types imply 275 * others. Add any inferred types here. 276 */ 277 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 278 ctn->ctype |= _ISALPHA; 279 if (ctn->ctype & _ISDIGIT) 280 ctn->ctype |= _ISXDIGIT; 281 if (ctn->ctype & _ISBLANK) 282 ctn->ctype |= _ISSPACE; 283 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 284 ctn->ctype |= _ISGRAPH; 285 if (ctn->ctype & _ISGRAPH) 286 ctn->ctype |= _ISPRINT; 287 288 /* 289 * Finally, POSIX requires that certain combinations 290 * are invalid. We don't flag this as a fatal error, 291 * but we will warn about. 292 */ 293 if ((ctn->ctype & _ISALPHA) && 294 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 295 conflict++; 296 if ((ctn->ctype & _ISPUNCT) & 297 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 298 conflict++; 299 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 300 conflict++; 301 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 302 conflict++; 303 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 304 conflict++; 305 306 if (conflict) { 307 warn("conflicting classes for character 0x%x (%x)", 308 wc, ctn->ctype); 309 } 310 /* 311 * Handle the lower 256 characters using the simple 312 * optimization. Note that if we have not defined the 313 * upper/lower case, then we identity map it. 314 */ 315 if ((unsigned)wc < _CACHED_RUNES) { 316 rl.runetype[wc] = ctn->ctype; 317 if (ctn->tolower) 318 rl.maplower[wc] = ctn->tolower; 319 if (ctn->toupper) 320 rl.mapupper[wc] = ctn->toupper; 321 continue; 322 } 323 324 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 325 ct[rl.runetype_ext_nranges-1].max = wc; 326 last_ct = ctn; 327 } else { 328 rl.runetype_ext_nranges++; 329 ct = realloc(ct, 330 sizeof (*ct) * rl.runetype_ext_nranges); 331 ct[rl.runetype_ext_nranges - 1].min = wc; 332 ct[rl.runetype_ext_nranges - 1].max = wc; 333 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 334 last_ct = ctn; 335 } 336 if (ctn->tolower == 0) { 337 last_lo = NULL; 338 } else if ((last_lo != NULL) && 339 (last_lo->tolower + 1 == ctn->tolower)) { 340 lo[rl.maplower_ext_nranges-1].max = wc; 341 last_lo = ctn; 342 } else { 343 rl.maplower_ext_nranges++; 344 lo = realloc(lo, 345 sizeof (*lo) * rl.maplower_ext_nranges); 346 lo[rl.maplower_ext_nranges - 1].min = wc; 347 lo[rl.maplower_ext_nranges - 1].max = wc; 348 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 349 last_lo = ctn; 350 } 351 352 if (ctn->toupper == 0) { 353 last_up = NULL; 354 } else if ((last_up != NULL) && 355 (last_up->toupper + 1 == ctn->toupper)) { 356 up[rl.mapupper_ext_nranges-1].max = wc; 357 last_up = ctn; 358 } else { 359 rl.mapupper_ext_nranges++; 360 up = realloc(up, 361 sizeof (*up) * rl.mapupper_ext_nranges); 362 up[rl.mapupper_ext_nranges - 1].min = wc; 363 up[rl.mapupper_ext_nranges - 1].max = wc; 364 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 365 last_up = ctn; 366 } 367 } 368 369 if ((wr_category(&rl, sizeof (rl), f) < 0) || 370 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 371 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 372 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 373 return; 374 } 375 376 close_category(f); 377 }