1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  14  */
  15 
  16 /*
  17  * This file contains the "scanner", which tokenizes charmap files
  18  * for iconv for processing by the higher level grammar processor.
  19  */
  20 
  21 #include <stdio.h>
  22 #include <stdlib.h>
  23 #include <ctype.h>
  24 #include <limits.h>
  25 #include <string.h>
  26 #include <widec.h>
  27 #include <sys/types.h>
  28 #include <assert.h>
  29 #include "charmap.h"
  30 #include "parser.tab.h"
  31 
/* Character that starts a comment; yylex() discards from it to end of line. */
int                     com_char = '#';
/* Character that introduces an escape sequence. */
int                     esc_char = '\\';
/*
 * Not read anywhere in the visible part of this file; presumably set by
 * the grammar from the <mb_cur_min> declaration -- confirm.
 */
int                     mb_cur_min = 1;
/* Largest number of bytes get_wide() accepts for a single character. */
int                     mb_cur_max = 1;
/* Line number reported in diagnostics; scanc() refreshes it on each read. */
int                     lineno = 1;
/* Number of times warn() has been called. */
int                     warnings = 0;
/* Line number of the next character to be read (see scanc()/unscanc()). */
static int              nextline;
/*
 * Stream the scanner reads from.
 * NOTE(review): ISO C does not require stdin to be a constant expression,
 * so this initializer depends on the platform's definition of stdin.
 */
static FILE             *input = stdin;
/* Name of the input, used as the prefix of diagnostics. */
static const char       *filename = "<stdin>";
/* Nonzero while yylex() is between an opening and a closing double quote. */
static int              instring = 0;
/* Nonzero when the previous character read was esc_char. */
static int              escaped = 0;
  43 
/*
 * Token space ... grows on demand.
 *
 * add_tok() appends to this buffer, enlarging it in 64-byte steps and
 * keeping it NUL-terminated.
 */
/* Buffer holding the token currently being assembled (NULL if none). */
static char *token = NULL;
/* Number of bytes stored in token so far; zero means no token in progress. */
static int tokidx;
/* Allocated size of token, in bytes. */
static int toksz = 0;
/* Nonzero once the current input line has produced at least one token. */
static int hadtok = 0;
  51 
/*
 * The last keyword seen.  This is useful to trigger the special lexer rules
 * for "copy" and also collating symbols and elements.
 *
 * NOTE(review): no "copy" or collation rules are visible in this file; the
 * sentence above may be inherited from a related scanner -- confirm.
 */
int     last_kw = 0;
/*
 * The top-level section currently open (one of categories[]), or T_END when
 * none is.  consume_token() updates it and get_symbol() consults it.
 */
static int      category = T_END;
  58 
  59 static struct token {
  60         int id;
  61         const char *name;
  62 } keywords[] = {
  63         { T_COM_CHAR,           "comment_char" },
  64         { T_ESC_CHAR,           "escape_char" },
  65         { T_END,                "END" },
  66 
  67         /*
  68          * These are keywords used in the charmap file.  Note that
  69          * Solaris orginally used angle brackets to wrap some of them,
  70          * but we removed that to simplify our parser.  The first of these
  71          * items are "global items."
  72          */
  73         { T_CHARMAP,            "CHARMAP" },
  74         { T_WIDTH,              "WIDTH" },
  75         { T_WIDTH_DEFAULT,      "WIDTH_DEFAULT" },
  76 
  77         { -1, NULL },
  78 };
  79 
  80 /*
  81  * These special words are only used in a charmap file, enclosed in <>.
  82  */
  83 static struct token symwords[] = {
  84         { T_COM_CHAR,           "comment_char" },
  85         { T_ESC_CHAR,           "escape_char" },
  86         { T_CODE_SET,           "code_set_name" },
  87         { T_MB_CUR_MAX,         "mb_cur_max" },
  88         { T_MB_CUR_MIN,         "mb_cur_min" },
  89         { -1, NULL },
  90 };
  91 
  92 static int categories[] = {
  93         T_CHARMAP,
  94         0
  95 };
  96 
  97 void
  98 reset_scanner(const char *fname)
  99 {
 100         if (fname == NULL) {
 101                 filename = "<stdin>";
 102                 input = stdin;
 103         } else {
 104                 if (input != stdin)
 105                         (void) fclose(input);
 106                 if ((input = fopen(fname, "r")) == NULL) {
 107                         perror(fname);
 108                         exit(1);
 109                 }
 110                 filename = fname;
 111         }
 112         com_char = '#';
 113         esc_char = '\\';
 114         instring = 0;
 115         escaped = 0;
 116         lineno = 1;
 117         nextline = 1;
 118         tokidx = 0;
 119         last_kw = 0;
 120         category = T_END;
 121 }
 122 
 123 #define hex(x)  \
 124         (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10))
 125 #define isodigit(x)     ((x >= '0') && (x <= '7'))
 126 
 127 static int
 128 scanc(void)
 129 {
 130         int     c;
 131 
 132         c = getc(input);
 133         lineno = nextline;
 134         if (c == '\n') {
 135                 nextline++;
 136         }
 137         return (c);
 138 }
 139 
 140 static void
 141 unscanc(int c)
 142 {
 143         if (c == '\n') {
 144                 nextline--;
 145         }
 146         if (ungetc(c, input) < 0) {
 147                 yyerror(_("ungetc failed"));
 148         }
 149 }
 150 
 151 static int
 152 scan_hex_byte(void)
 153 {
 154         int     c1, c2;
 155         int     v;
 156 
 157         c1 = scanc();
 158         if (!isxdigit(c1)) {
 159                 yyerror(_("malformed hex digit"));
 160                 return (0);
 161         }
 162         c2 = scanc();
 163         if (!isxdigit(c2)) {
 164                 yyerror(_("malformed hex digit"));
 165                 return (0);
 166         }
 167         v = ((hex(c1) << 4) | hex(c2));
 168         return (v);
 169 }
 170 
 171 static int
 172 scan_dec_byte(void)
 173 {
 174         int     c1, c2, c3;
 175         int     b;
 176 
 177         c1 = scanc();
 178         if (!isdigit(c1)) {
 179                 yyerror(_("malformed decimal digit"));
 180                 return (0);
 181         }
 182         b = c1 - '0';
 183         c2 = scanc();
 184         if (!isdigit(c2)) {
 185                 yyerror(_("malformed decimal digit"));
 186                 return (0);
 187         }
 188         b *= 10;
 189         b += (c2 - '0');
 190         c3 = scanc();
 191         if (!isdigit(c3)) {
 192                 unscanc(c3);
 193         } else {
 194                 b *= 10;
 195                 b += (c3 - '0');
 196         }
 197         return (b);
 198 }
 199 
 200 static int
 201 scan_oct_byte(void)
 202 {
 203         int c1, c2, c3;
 204         int     b;
 205 
 206         b = 0;
 207 
 208         c1 = scanc();
 209         if (!isodigit(c1)) {
 210                 yyerror(_("malformed octal digit"));
 211                 return (0);
 212         }
 213         b = c1 - '0';
 214         c2 = scanc();
 215         if (!isodigit(c2)) {
 216                 yyerror(_("malformed octal digit"));
 217                 return (0);
 218         }
 219         b *= 8;
 220         b += (c2 - '0');
 221         c3 = scanc();
 222         if (!isodigit(c3)) {
 223                 unscanc(c3);
 224         } else {
 225                 b *= 8;
 226                 b += (c3 - '0');
 227         }
 228         return (b);
 229 }
 230 
 231 void
 232 add_tok(int c)
 233 {
 234         if ((tokidx + 1) >= toksz) {
 235                 toksz += 64;
 236                 if ((token = realloc(token, toksz)) == NULL) {
 237                         yyerror(_("out of memory"));
 238                         tokidx = 0;
 239                         toksz = 0;
 240                         return;
 241                 }
 242         }
 243 
 244         token[tokidx++] = (char)c;
 245         token[tokidx] = 0;
 246 }
 247 
 248 static int
 249 get_byte(void)
 250 {
 251         int     c;
 252 
 253         if ((c = scanc()) != esc_char) {
 254                 unscanc(c);
 255                 return (EOF);
 256         }
 257         c = scanc();
 258 
 259         switch (c) {
 260         case 'd':
 261         case 'D':
 262                 return (scan_dec_byte());
 263         case 'x':
 264         case 'X':
 265                 return (scan_hex_byte());
 266         case '0':
 267         case '1':
 268         case '2':
 269         case '3':
 270         case '4':
 271         case '5':
 272         case '6':
 273         case '7':
 274                 /* put the character back so we can get it */
 275                 unscanc(c);
 276                 return (scan_oct_byte());
 277         default:
 278                 unscanc(c);
 279                 unscanc(esc_char);
 280                 return (EOF);
 281         }
 282 }
 283 
 284 int
 285 get_escaped(int c)
 286 {
 287         switch (c) {
 288         case 'n':
 289                 return ('\n');
 290         case 'r':
 291                 return ('\r');
 292         case 't':
 293                 return ('\t');
 294         case 'f':
 295                 return ('\f');
 296         case 'v':
 297                 return ('\v');
 298         case 'b':
 299                 return ('\b');
 300         case 'a':
 301                 return ('\a');
 302         default:
 303                 return (c);
 304         }
 305 }
 306 
 307 int
 308 get_wide(void)
 309 {
 310         /* NB: yylval.mbs[0] is the length */
 311         char *mbs = &yylval.mbs[1];
 312         int mbi = 0;
 313         int c;
 314 
 315         mbs[mbi] = 0;
 316         if (mb_cur_max > MB_LEN_MAX) {
 317                 yyerror(_("max multibyte character size too big"));
 318                 return (T_NULL);
 319         }
 320         for (;;) {
 321                 if ((c = get_byte()) == EOF)
 322                         break;
 323                 if (mbi == mb_cur_max) {
 324                         unscanc(c);
 325                         yyerror(_("length > mb_cur_max"));
 326                         return (T_NULL);
 327                 }
 328                 mbs[mbi++] = c;
 329                 mbs[mbi] = 0;
 330         }
 331 
 332         /* result in yylval.mbs */
 333         mbs[-1] = mbi;
 334         return (T_CHAR);
 335 }
 336 
 337 int
 338 get_symbol(void)
 339 {
 340         int     c;
 341 
 342         while ((c = scanc()) != EOF) {
 343                 if (escaped) {
 344                         escaped = 0;
 345                         if (c == '\n')
 346                                 continue;
 347                         add_tok(get_escaped(c));
 348                         continue;
 349                 }
 350                 if (c == esc_char) {
 351                         escaped = 1;
 352                         continue;
 353                 }
 354                 if (c == '\n') {        /* well that's strange! */
 355                         yyerror(_("unterminated symbolic name"));
 356                         continue;
 357                 }
 358                 if (c == '>') {              /* end of symbol */
 359 
 360                         /*
 361                          * This restarts the token from the beginning
 362                          * the next time we scan a character.  (This
 363                          * token is complete.)
 364                          */
 365 
 366                         if (token == NULL) {
 367                                 yyerror(_("missing symbolic name"));
 368                                 return (T_NULL);
 369                         }
 370                         tokidx = 0;
 371 
 372                         /*
 373                          * A few symbols are handled as keywords outside
 374                          * of the normal categories.
 375                          */
 376                         if (category == T_END) {
 377                                 int i;
 378                                 for (i = 0; symwords[i].name != 0; i++) {
 379                                         if (strcmp(token, symwords[i].name) ==
 380                                             0) {
 381                                                 last_kw = symwords[i].id;
 382                                                 return (last_kw);
 383                                         }
 384                                 }
 385                         }
 386                         /* its an undefined symbol */
 387                         yylval.token = strdup(token);
 388                         token = NULL;
 389                         toksz = 0;
 390                         tokidx = 0;
 391                         return (T_SYMBOL);
 392                 }
 393                 add_tok(c);
 394         }
 395 
 396         yyerror(_("unterminated symbolic name"));
 397         return (EOF);
 398 }
 399 
 400 
 401 static int
 402 consume_token(void)
 403 {
 404         int     len = tokidx;
 405         int     i;
 406 
 407         tokidx = 0;
 408         if (token == NULL)
 409                 return (T_NULL);
 410 
 411         /*
 412          * this one is special, because we don't want it to alter the
 413          * last_kw field.
 414          */
 415         if (strcmp(token, "...") == 0) {
 416                 return (T_ELLIPSIS);
 417         }
 418 
 419         /* search for reserved words first */
 420         for (i = 0; keywords[i].name; i++) {
 421                 int j;
 422                 if (strcmp(keywords[i].name, token) != 0) {
 423                         continue;
 424                 }
 425 
 426                 last_kw = keywords[i].id;
 427 
 428                 /* clear the top level category if we're done with it */
 429                 if (last_kw == T_END) {
 430                         category = T_END;
 431                 }
 432 
 433                 /* set the top level category if we're changing */
 434                 for (j = 0; categories[j]; j++) {
 435                         if (categories[j] != last_kw)
 436                                 continue;
 437                         category = last_kw;
 438                 }
 439 
 440                 return (keywords[i].id);
 441         }
 442 
 443         /* maybe its a numeric constant? */
 444         if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
 445                 char *eptr;
 446                 yylval.num = strtol(token, &eptr, 10);
 447                 if (*eptr != 0)
 448                         yyerror(_("malformed number"));
 449                 return (T_NUMBER);
 450         }
 451 
 452         /*
 453          * A single lone character is treated as a character literal.
 454          * To avoid duplication of effort, we stick in the charmap.
 455          */
 456         if (len == 1) {
 457                 yylval.mbs[0] = 1; /* length */
 458                 yylval.mbs[1] = token[0];
 459                 yylval.mbs[2] = '\0';
 460                 return (T_CHAR);
 461         }
 462 
 463         /* anything else is treated as a symbolic name */
 464         yylval.token = strdup(token);
 465         token = NULL;
 466         toksz = 0;
 467         tokidx = 0;
 468         return (T_NAME);
 469 }
 470 
 471 void
 472 scan_to_eol(void)
 473 {
 474         int     c;
 475         while ((c = scanc()) != '\n') {
 476                 if (c == EOF) {
 477                         /* end of file without newline! */
 478                         errf(_("missing newline"));
 479                         return;
 480                 }
 481         }
 482         assert(c == '\n');
 483 }
 484 
 485 int
 486 yylex(void)
 487 {
 488         int             c;
 489 
 490         while ((c = scanc()) != EOF) {
 491 
 492                 /* special handling for quoted string */
 493                 if (instring) {
 494                         if (escaped) {
 495                                 escaped = 0;
 496 
 497                                 /* if newline, just eat and forget it */
 498                                 if (c == '\n')
 499                                         continue;
 500 
 501                                 if (strchr("xXd01234567", c)) {
 502                                         unscanc(c);
 503                                         unscanc(esc_char);
 504                                         return (get_wide());
 505                                 }
 506                                 yylval.mbs[0] = 1; /* length */
 507                                 yylval.mbs[1] = get_escaped(c);
 508                                 yylval.mbs[2] = '\0';
 509                                 return (T_CHAR);
 510                         }
 511                         if (c == esc_char) {
 512                                 escaped = 1;
 513                                 continue;
 514                         }
 515                         switch (c) {
 516                         case '<':
 517                                 return (get_symbol());
 518                         case '>':
 519                                 /* oops! should generate syntax error  */
 520                                 return (T_GT);
 521                         case '"':
 522                                 instring = 0;
 523                                 return (T_QUOTE);
 524                         default:
 525                                 yylval.mbs[0] = 1; /* length */
 526                                 yylval.mbs[1] = c;
 527                                 yylval.mbs[2] = '\0';
 528                                 return (T_CHAR);
 529                         }
 530                 }
 531 
 532                 /* escaped characters first */
 533                 if (escaped) {
 534                         escaped = 0;
 535                         if (c == '\n') {
 536                                 /* eat the newline */
 537                                 continue;
 538                         }
 539                         hadtok = 1;
 540                         if (tokidx) {
 541                                 /* an escape mid-token is nonsense */
 542                                 return (T_NULL);
 543                         }
 544 
 545                         /* numeric escapes are treated as wide characters */
 546                         if (strchr("xXd01234567", c)) {
 547                                 unscanc(c);
 548                                 unscanc(esc_char);
 549                                 return (get_wide());
 550                         }
 551 
 552                         add_tok(get_escaped(c));
 553                         continue;
 554                 }
 555 
 556                 /* if it is the escape charter itself note it */
 557                 if (c == esc_char) {
 558                         escaped = 1;
 559                         continue;
 560                 }
 561 
 562                 /* remove from the comment char to end of line */
 563                 if (c == com_char) {
 564                         while (c != '\n') {
 565                                 if ((c = scanc()) == EOF) {
 566                                         /* end of file without newline! */
 567                                         return (EOF);
 568                                 }
 569                         }
 570                         assert(c == '\n');
 571                         if (!hadtok) {
 572                                 /*
 573                                  * If there were no tokens on this line,
 574                                  * then just pretend it didn't exist at all.
 575                                  */
 576                                 continue;
 577                         }
 578                         hadtok = 0;
 579                         return (T_NL);
 580                 }
 581 
 582                 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
 583                         /*
 584                          * These are all token delimiters.  If there
 585                          * is a token already in progress, we need to
 586                          * process it.
 587                          */
 588                         unscanc(c);
 589                         return (consume_token());
 590                 }
 591 
 592                 switch (c) {
 593                 case '\n':
 594                         if (!hadtok) {
 595                                 /*
 596                                  * If the line was completely devoid of tokens,
 597                                  * then just ignore it.
 598                                  */
 599                                 continue;
 600                         }
 601                         /* we're starting a new line, reset the token state */
 602                         hadtok = 0;
 603                         return (T_NL);
 604                 case ',':
 605                         hadtok = 1;
 606                         return (T_COMMA);
 607                 case ';':
 608                         hadtok = 1;
 609                         return (T_SEMI);
 610                 case '(':
 611                         hadtok = 1;
 612                         return (T_LPAREN);
 613                 case ')':
 614                         hadtok = 1;
 615                         return (T_RPAREN);
 616                 case '>':
 617                         hadtok = 1;
 618                         return (T_GT);
 619                 case '<':
 620                         /* symbol start! */
 621                         hadtok = 1;
 622                         return (get_symbol());
 623                 case ' ':
 624                 case '\t':
 625                         /* whitespace, just ignore it */
 626                         continue;
 627                 case '"':
 628                         hadtok = 1;
 629                         instring = 1;
 630                         return (T_QUOTE);
 631                 default:
 632                         hadtok = 1;
 633                         add_tok(c);
 634                         continue;
 635                 }
 636         }
 637         return (EOF);
 638 }
 639 
 640 void
 641 yyerror(const char *msg)
 642 {
 643         (void) fprintf(stderr, _("%s: %d: error: %s\n"),
 644             filename, lineno, msg);
 645         exit(1);
 646 }
 647 
 648 void
 649 errf(const char *fmt, ...)
 650 {
 651         char    *msg;
 652 
 653         va_list va;
 654         va_start(va, fmt);
 655         (void) vasprintf(&msg, fmt, va);
 656         va_end(va);
 657 
 658         (void) fprintf(stderr, _("%s: %d: error: %s\n"),
 659             filename, lineno, msg);
 660         free(msg);
 661         exit(1);
 662 }
 663 
 664 void
 665 warn(const char *fmt, ...)
 666 {
 667         char    *msg;
 668 
 669         va_list va;
 670         va_start(va, fmt);
 671         (void) vasprintf(&msg, fmt, va);
 672         va_end(va);
 673 
 674         (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
 675             filename, lineno, msg);
 676         free(msg);
 677         warnings++;
 678 }