1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved. 14 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 15 */ 16 17 /* 18 * LC_CTYPE database generation routines for localedef. 19 */ 20 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <string.h> 24 #include <sys/types.h> 25 #include <sys/avl.h> 26 #include <wchar.h> 27 #include <ctype.h> 28 #include <wctype.h> 29 #include <unistd.h> 30 #include "localedef.h" 31 #include "parser.tab.h" 32 #include "runefile.h" 33 34 static avl_tree_t ctypes; 35 36 static wchar_t last_ctype; 37 38 typedef struct ctype_node { 39 wchar_t wc; 40 int32_t ctype; 41 int32_t toupper; 42 int32_t tolower; 43 avl_node_t avl; 44 } ctype_node_t; 45 46 static int 47 ctype_compare(const void *n1, const void *n2) 48 { 49 const ctype_node_t *c1 = n1; 50 const ctype_node_t *c2 = n2; 51 52 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 53 } 54 55 void 56 init_ctype(void) 57 { 58 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), 59 offsetof(ctype_node_t, avl)); 60 } 61 62 63 static void 64 add_ctype_impl(ctype_node_t *ctn) 65 { 66 switch (last_kw) { 67 case T_ISUPPER: 68 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 69 break; 70 case T_ISLOWER: 71 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 72 break; 73 case T_ISALPHA: 74 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 75 break; 76 case T_ISDIGIT: 77 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); 78 break; 79 case T_ISSPACE: 80 ctn->ctype |= _ISSPACE; 81 break; 82 case T_ISCNTRL: 83 ctn->ctype |= _ISCNTRL; 84 break; 85 case T_ISGRAPH: 86 ctn->ctype |= (_ISGRAPH | _ISPRINT); 87 break; 88 case T_ISPRINT: 89 ctn->ctype |= _ISPRINT; 90 break; 91 case T_ISPUNCT: 92 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 93 break; 94 case T_ISXDIGIT: 95 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 96 break; 97 case T_ISBLANK: 98 ctn->ctype |= (_ISBLANK | _ISSPACE); 99 break; 100 case T_ISPHONOGRAM: 101 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 102 break; 103 case T_ISIDEOGRAM: 104 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 105 break; 106 case T_ISENGLISH: 107 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 108 break; 109 case T_ISNUMBER: 110 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 111 break; 112 case T_ISSPECIAL: 113 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 114 break; 115 case T_ISALNUM: 116 /* 117 * We can't do anything with this. The character 118 * should already be specified as a digit or alpha. 119 */ 120 break; 121 default: 122 errf(_("not a valid character class")); 123 } 124 } 125 126 static ctype_node_t * 127 get_ctype(wchar_t wc) 128 { 129 ctype_node_t srch; 130 ctype_node_t *ctn; 131 avl_index_t where; 132 133 srch.wc = wc; 134 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { 135 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 136 errf(_("out of memory")); 137 return (NULL); 138 } 139 ctn->wc = wc; 140 141 avl_insert(&ctypes, ctn, where); 142 } 143 return (ctn); 144 } 145 146 void 147 add_ctype(int val) 148 { 149 ctype_node_t *ctn; 150 151 if ((ctn = get_ctype(val)) == NULL) { 152 INTERR; 153 return; 154 } 155 add_ctype_impl(ctn); 156 last_ctype = ctn->wc; 157 } 158 159 void 160 add_ctype_range(int end) 161 { 162 ctype_node_t *ctn; 163 wchar_t cur; 164 165 if (end < last_ctype) { 166 errf(_("malformed character range (%u ... %u))"), 167 last_ctype, end); 168 return; 169 } 170 for (cur = last_ctype + 1; cur <= end; cur++) { 171 if ((ctn = get_ctype(cur)) == NULL) { 172 INTERR; 173 return; 174 } 175 add_ctype_impl(ctn); 176 } 177 last_ctype = end; 178 179 } 180 181 void 182 add_caseconv(int val, int wc) 183 { 184 ctype_node_t *ctn; 185 186 ctn = get_ctype(val); 187 if (ctn == NULL) { 188 INTERR; 189 return; 190 } 191 192 switch (last_kw) { 193 case T_TOUPPER: 194 ctn->toupper = wc; 195 break; 196 case T_TOLOWER: 197 ctn->tolower = wc; 198 break; 199 default: 200 INTERR; 201 break; 202 } 203 } 204 205 void 206 dump_ctype(void) 207 { 208 FILE *f; 209 _FileRuneLocale rl; 210 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 211 _FileRuneEntry *ct = NULL; 212 _FileRuneEntry *lo = NULL; 213 _FileRuneEntry *up = NULL; 214 215 (void) memset(&rl, 0, sizeof (rl)); 216 last_ct = NULL; 217 last_lo = NULL; 218 last_up = NULL; 219 220 if ((f = open_category()) == NULL) 221 return; 222 223 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 224 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 225 226 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { 227 228 wchar_t wc = ctn->wc; 229 int conflict = 0; 230 231 /* 232 * POSIX requires certain portable characters have 233 * certain types. Add them if they are missing. 234 */ 235 if ((wc >= 1) && (wc <= 127)) { 236 if ((wc >= 'A') && (wc <= 'Z')) 237 ctn->ctype |= _ISUPPER; 238 if ((wc >= 'a') && (wc <= 'z')) 239 ctn->ctype |= _ISLOWER; 240 if ((wc >= '0') && (wc <= '9')) 241 ctn->ctype |= _ISDIGIT; 242 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 243 ctn->ctype |= _ISSPACE; 244 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 245 ctn->ctype |= _ISXDIGIT; 246 if (strchr(" \t", (char)wc)) 247 ctn->ctype |= _ISBLANK; 248 249 /* 250 * Technically these settings are only 251 * required for the C locale. However, it 252 * turns out that because of the historical 253 * version of isprint(), we need them for all 254 * locales as well. Note that these are not 255 * necessarily valid punctation characters in 256 * the current language, but ispunct() needs 257 * to return TRUE for them. 258 */ 259 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 260 (char)wc)) 261 ctn->ctype |= _ISPUNCT; 262 } 263 264 /* 265 * POSIX also requires that certain types imply 266 * others. Add any inferred types here. 267 */ 268 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 269 ctn->ctype |= _ISALPHA; 270 if (ctn->ctype & _ISDIGIT) 271 ctn->ctype |= _ISXDIGIT; 272 if (ctn->ctype & _ISBLANK) 273 ctn->ctype |= _ISSPACE; 274 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 275 ctn->ctype |= _ISGRAPH; 276 if (ctn->ctype & _ISGRAPH) 277 ctn->ctype |= _ISPRINT; 278 279 /* 280 * Finally, POSIX requires that certain combinations 281 * are invalid. We don't flag this as a fatal error, 282 * but we will warn about. 283 */ 284 if ((ctn->ctype & _ISALPHA) && 285 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 286 conflict++; 287 if ((ctn->ctype & _ISPUNCT) & 288 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 289 conflict++; 290 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 291 conflict++; 292 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 293 conflict++; 294 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 295 conflict++; 296 297 if (conflict) { 298 warn("conflicting classes for character 0x%x (%x)", 299 wc, ctn->ctype); 300 } 301 /* 302 * Handle the lower 256 characters using the simple 303 * optimization. Note that if we have not defined the 304 * upper/lower case, then we identity map it. 305 */ 306 if ((unsigned)wc < _CACHED_RUNES) { 307 rl.runetype[wc] = ctn->ctype; 308 rl.maplower[wc] = ctn->tolower ? ctn->tolower : wc; 309 rl.mapupper[wc] = ctn->toupper ? ctn->toupper : wc; 310 continue; 311 } 312 313 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 314 ct[rl.runetype_ext_nranges-1].max = wc; 315 last_ct = ctn; 316 } else { 317 rl.runetype_ext_nranges++; 318 ct = realloc(ct, 319 sizeof (*ct) * rl.runetype_ext_nranges); 320 ct[rl.runetype_ext_nranges - 1].min = wc; 321 ct[rl.runetype_ext_nranges - 1].max = wc; 322 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 323 last_ct = ctn; 324 } 325 if (ctn->tolower == 0) { 326 last_lo = NULL; 327 } else if ((last_lo != NULL) && 328 (last_lo->tolower + 1 == ctn->tolower)) { 329 lo[rl.maplower_ext_nranges-1].max = wc; 330 last_lo = ctn; 331 } else { 332 rl.maplower_ext_nranges++; 333 lo = realloc(lo, 334 sizeof (*lo) * rl.maplower_ext_nranges); 335 lo[rl.maplower_ext_nranges - 1].min = wc; 336 lo[rl.maplower_ext_nranges - 1].max = wc; 337 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 338 last_lo = ctn; 339 } 340 341 if (ctn->toupper == 0) { 342 last_up = NULL; 343 } else if ((last_up != NULL) && 344 (last_up->toupper + 1 == ctn->toupper)) { 345 up[rl.mapupper_ext_nranges-1].max = wc; 346 last_up = ctn; 347 } else { 348 rl.mapupper_ext_nranges++; 349 up = realloc(up, 350 sizeof (*up) * rl.mapupper_ext_nranges); 351 up[rl.mapupper_ext_nranges - 1].min = wc; 352 up[rl.mapupper_ext_nranges - 1].max = wc; 353 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 354 last_up = ctn; 355 } 356 } 357 358 if ((wr_category(&rl, sizeof (rl), f) < 0) || 359 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 360 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 361 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 362 return; 363 } 364 365 close_category(f); 366 }