1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2010,2011 Nexenta Systems, Inc.  All rights reserved.
  14  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
  15  */
  16 
  17 /*
  18  * LC_CTYPE database generation routines for localedef.
  19  */
  20 
  21 #include <stdio.h>
  22 #include <stdlib.h>
  23 #include <string.h>
  24 #include <sys/types.h>
  25 #include <sys/avl.h>
  26 #include <wchar.h>
  27 #include <ctype.h>
  28 #include <wctype.h>
  29 #include <unistd.h>
  30 #include "localedef.h"
  31 #include "parser.tab.h"
  32 #include "runefile.h"
  33 
  34 static avl_tree_t       ctypes;
  35 
  36 static wchar_t          last_ctype;
  37 
  38 typedef struct ctype_node {
  39         wchar_t wc;
  40         int32_t ctype;
  41         int32_t toupper;
  42         int32_t tolower;
  43         avl_node_t avl;
  44 } ctype_node_t;
  45 
  46 static int
  47 ctype_compare(const void *n1, const void *n2)
  48 {
  49         const ctype_node_t *c1 = n1;
  50         const ctype_node_t *c2 = n2;
  51 
  52         return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
  53 }
  54 
  55 void
  56 init_ctype(void)
  57 {
  58         avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
  59             offsetof(ctype_node_t, avl));
  60 }
  61 
  62 
  63 static void
  64 add_ctype_impl(ctype_node_t *ctn)
  65 {
  66         switch (last_kw) {
  67         case T_ISUPPER:
  68                 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
  69                 break;
  70         case T_ISLOWER:
  71                 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
  72                 break;
  73         case T_ISALPHA:
  74                 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
  75                 break;
  76         case T_ISDIGIT:
  77                 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
  78                 break;
  79         case T_ISSPACE:
  80                 ctn->ctype |= _ISSPACE;
  81                 break;
  82         case T_ISCNTRL:
  83                 ctn->ctype |= _ISCNTRL;
  84                 break;
  85         case T_ISGRAPH:
  86                 ctn->ctype |= (_ISGRAPH | _ISPRINT);
  87                 break;
  88         case T_ISPRINT:
  89                 ctn->ctype |= _ISPRINT;
  90                 break;
  91         case T_ISPUNCT:
  92                 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
  93                 break;
  94         case T_ISXDIGIT:
  95                 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
  96                 break;
  97         case T_ISBLANK:
  98                 ctn->ctype |= (_ISBLANK | _ISSPACE);
  99                 break;
 100         case T_ISPHONOGRAM:
 101                 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
 102                 break;
 103         case T_ISIDEOGRAM:
 104                 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
 105                 break;
 106         case T_ISENGLISH:
 107                 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
 108                 break;
 109         case T_ISNUMBER:
 110                 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
 111                 break;
 112         case T_ISSPECIAL:
 113                 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
 114                 break;
 115         case T_ISALNUM:
 116                 /*
 117                  * We can't do anything with this.  The character
 118                  * should already be specified as a digit or alpha.
 119                  */
 120                 break;
 121         default:
 122                 errf(_("not a valid character class"));
 123         }
 124 }
 125 
 126 static ctype_node_t *
 127 get_ctype(wchar_t wc)
 128 {
 129         ctype_node_t    srch;
 130         ctype_node_t    *ctn;
 131         avl_index_t     where;
 132 
 133         srch.wc = wc;
 134         if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
 135                 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
 136                         errf(_("out of memory"));
 137                         return (NULL);
 138                 }
 139                 ctn->wc = wc;
 140 
 141                 avl_insert(&ctypes, ctn, where);
 142         }
 143         return (ctn);
 144 }
 145 
 146 void
 147 add_ctype(int val)
 148 {
 149         ctype_node_t    *ctn;
 150 
 151         if ((ctn = get_ctype(val)) == NULL) {
 152                 INTERR;
 153                 return;
 154         }
 155         add_ctype_impl(ctn);
 156         last_ctype = ctn->wc;
 157 }
 158 
 159 void
 160 add_ctype_range(int end)
 161 {
 162         ctype_node_t    *ctn;
 163         wchar_t         cur;
 164 
 165         if (end < last_ctype) {
 166                 errf(_("malformed character range (%u ... %u))"),
 167                     last_ctype, end);
 168                 return;
 169         }
 170         for (cur = last_ctype + 1; cur <= end; cur++) {
 171                 if ((ctn = get_ctype(cur)) == NULL) {
 172                         INTERR;
 173                         return;
 174                 }
 175                 add_ctype_impl(ctn);
 176         }
 177         last_ctype = end;
 178 
 179 }
 180 
 181 void
 182 add_caseconv(int val, int wc)
 183 {
 184         ctype_node_t    *ctn;
 185 
 186         ctn = get_ctype(val);
 187         if (ctn == NULL) {
 188                 INTERR;
 189                 return;
 190         }
 191 
 192         switch (last_kw) {
 193         case T_TOUPPER:
 194                 ctn->toupper = wc;
 195                 break;
 196         case T_TOLOWER:
 197                 ctn->tolower = wc;
 198                 break;
 199         default:
 200                 INTERR;
 201                 break;
 202         }
 203 }
 204 
 205 void
 206 dump_ctype(void)
 207 {
 208         FILE            *f;
 209         _FileRuneLocale rl;
 210         ctype_node_t    *ctn, *last_ct, *last_lo, *last_up;
 211         _FileRuneEntry  *ct = NULL;
 212         _FileRuneEntry  *lo = NULL;
 213         _FileRuneEntry  *up = NULL;
 214         wchar_t         wc;
 215 
 216         (void) memset(&rl, 0, sizeof (rl));
 217         last_ct = NULL;
 218         last_lo = NULL;
 219         last_up = NULL;
 220 
 221         if ((f = open_category()) == NULL)
 222                 return;
 223 
 224         (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
 225         (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
 226 
 227         /*
 228          * Initialize the identity map.
 229          */
 230         for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
 231                 rl.maplower[wc] = wc;
 232                 rl.mapupper[wc] = wc;
 233         }
 234 
 235         for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
 236                 int conflict = 0;
 237 
 238                 wc = ctn->wc;
 239 
 240                 /*
 241                  * POSIX requires certain portable characters have
 242                  * certain types.  Add them if they are missing.
 243                  */
 244                 if ((wc >= 1) && (wc <= 127)) {
 245                         if ((wc >= 'A') && (wc <= 'Z'))
 246                                 ctn->ctype |= _ISUPPER;
 247                         if ((wc >= 'a') && (wc <= 'z'))
 248                                 ctn->ctype |= _ISLOWER;
 249                         if ((wc >= '0') && (wc <= '9'))
 250                                 ctn->ctype |= _ISDIGIT;
 251                         if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
 252                                 ctn->ctype |= _ISSPACE;
 253                         if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
 254                                 ctn->ctype |= _ISXDIGIT;
 255                         if (strchr(" \t", (char)wc))
 256                                 ctn->ctype |= _ISBLANK;
 257 
 258                         /*
 259                          * Technically these settings are only
 260                          * required for the C locale.  However, it
 261                          * turns out that because of the historical
 262                          * version of isprint(), we need them for all
 263                          * locales as well.  Note that these are not
 264                          * necessarily valid punctation characters in
 265                          * the current language, but ispunct() needs
 266                          * to return TRUE for them.
 267                          */
 268                         if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
 269                             (char)wc))
 270                                 ctn->ctype |= _ISPUNCT;
 271                 }
 272 
 273                 /*
 274                  * POSIX also requires that certain types imply
 275                  * others.  Add any inferred types here.
 276                  */
 277                 if (ctn->ctype & (_ISUPPER |_ISLOWER))
 278                         ctn->ctype |= _ISALPHA;
 279                 if (ctn->ctype & _ISDIGIT)
 280                         ctn->ctype |= _ISXDIGIT;
 281                 if (ctn->ctype & _ISBLANK)
 282                         ctn->ctype |= _ISSPACE;
 283                 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
 284                         ctn->ctype |= _ISGRAPH;
 285                 if (ctn->ctype & _ISGRAPH)
 286                         ctn->ctype |= _ISPRINT;
 287 
 288                 /*
 289                  * Finally, POSIX requires that certain combinations
 290                  * are invalid.  We don't flag this as a fatal error,
 291                  * but we will warn about.
 292                  */
 293                 if ((ctn->ctype & _ISALPHA) &&
 294                     (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
 295                         conflict++;
 296                 if ((ctn->ctype & _ISPUNCT) &
 297                     (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
 298                         conflict++;
 299                 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
 300                         conflict++;
 301                 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
 302                         conflict++;
 303                 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
 304                         conflict++;
 305 
 306                 if (conflict) {
 307                         warn("conflicting classes for character 0x%x (%x)",
 308                             wc, ctn->ctype);
 309                 }
 310                 /*
 311                  * Handle the lower 256 characters using the simple
 312                  * optimization.  Note that if we have not defined the
 313                  * upper/lower case, then we identity map it.
 314                  */
 315                 if ((unsigned)wc < _CACHED_RUNES) {
 316                         rl.runetype[wc] = ctn->ctype;
 317                         if (ctn->tolower)
 318                                 rl.maplower[wc] = ctn->tolower;
 319                         if (ctn->toupper)
 320                                 rl.mapupper[wc] = ctn->toupper;
 321                         continue;
 322                 }
 323 
 324                 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
 325                         ct[rl.runetype_ext_nranges-1].max = wc;
 326                         last_ct = ctn;
 327                 } else {
 328                         rl.runetype_ext_nranges++;
 329                         ct = realloc(ct,
 330                             sizeof (*ct) * rl.runetype_ext_nranges);
 331                         ct[rl.runetype_ext_nranges - 1].min = wc;
 332                         ct[rl.runetype_ext_nranges - 1].max = wc;
 333                         ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
 334                         last_ct = ctn;
 335                 }
 336                 if (ctn->tolower == 0) {
 337                         last_lo = NULL;
 338                 } else if ((last_lo != NULL) &&
 339                     (last_lo->tolower + 1 == ctn->tolower)) {
 340                         lo[rl.maplower_ext_nranges-1].max = wc;
 341                         last_lo = ctn;
 342                 } else {
 343                         rl.maplower_ext_nranges++;
 344                         lo = realloc(lo,
 345                             sizeof (*lo) * rl.maplower_ext_nranges);
 346                         lo[rl.maplower_ext_nranges - 1].min = wc;
 347                         lo[rl.maplower_ext_nranges - 1].max = wc;
 348                         lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
 349                         last_lo = ctn;
 350                 }
 351 
 352                 if (ctn->toupper == 0) {
 353                         last_up = NULL;
 354                 } else if ((last_up != NULL) &&
 355                     (last_up->toupper + 1 == ctn->toupper)) {
 356                         up[rl.mapupper_ext_nranges-1].max = wc;
 357                         last_up = ctn;
 358                 } else {
 359                         rl.mapupper_ext_nranges++;
 360                         up = realloc(up,
 361                             sizeof (*up) * rl.mapupper_ext_nranges);
 362                         up[rl.mapupper_ext_nranges - 1].min = wc;
 363                         up[rl.mapupper_ext_nranges - 1].max = wc;
 364                         up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
 365                         last_up = ctn;
 366                 }
 367         }
 368 
 369         if ((wr_category(&rl, sizeof (rl), f) < 0) ||
 370             (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
 371             (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
 372             (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
 373                 return;
 374         }
 375 
 376         close_category(f);
 377 }