1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
  14  * Copyright 2013 David Hoeppner.  All rights reserved.
  15  */
  16 
/*
 * Scanner (lexical analysis) functions for charmap files.
 */
  20 
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <limits.h>
  24 #include <widec.h>
  25 
  26 #include "iconv.h"
  27 #include "parser.tab.h"
  28 
  29 /*
  30  * Helper macros.
  31  */
  32 #define hex(x)  \
  33         (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10))
  34 
  35 #define isodigit(x)     ((x >= '0') && (x <= '7'))
  36 
  37 /*
  38  * Charmap specific.
  39  */
  40 int             com_char = '#';
  41 int             esc_char = '\\';
  42 int             mb_cur_max = 1;
  43 int             mb_cur_min = 1;
  44 
  45 int             lineno = 1;
  46 static FILE     *input = stdin;
  47 static const char *filename = "<stdin>";
  48 static int      escaped = 0;
  49 static int      instring = 0;
  50 static int      nextline;
  51 
  52 /*
  53  * Tokens.
  54  */
  55 static char     *token = NULL;
  56 static int      tokidx;
  57 static int      toksz = 0;
  58 static int      hadtok = 0;
  59 
  60 /*
  61  * Wide strings.
  62  */
  63 static wchar_t  *widestr = NULL;
  64 static int      wideidx = 0;
  65 static int      widesz = 0;
  66 
  67 /*
  68  * Keywords related.
  69  */
  70 int             last_kw = 0;
  71 static int      category = T_END;
  72 
  73 static struct token {
  74         int     id;
  75         const char *name;
  76 } keywords[] = {
  77         { T_COM_CHAR,           "comment_char" },
  78         { T_ESC_CHAR,           "escape_char" },
  79         { T_END,                "END" },
  80         { T_CHARMAP,            "CHARMAP" },
  81         { T_WIDTH,              "WIDTH" },
  82         { T_WIDTH_DEFAULT,      "WIDTH_DEFAULT" },
  83         { -1, NULL },
  84 };
  85 
  86 /*
  87  * Charmap reserved keywords.
  88  */
  89 static struct token symwords[] = {
  90         { T_COM_CHAR,           "comment_char" },
  91         { T_ESC_CHAR,           "escape_char" },
  92         { T_CODE_SET,           "code_set_name" },
  93         { T_MB_CUR_MAX,         "mb_cur_max" },
  94         { T_MB_CUR_MIN,         "mb_cur_min" },
  95         { -1, NULL },
  96 };
  97 
  98 static int categories[] = {
  99         T_CHARMAP,
 100         T_WIDTH,
 101         0,
 102 };
 103 
 104 char *
 105 to_mb_string(const wchar_t *wcs)
 106 {
 107         return (NULL);
 108 }
 109 
 110 void
 111 set_wide_encoding(const char *encoding)
 112 {
 113 }
 114 
 115 /*
 116  * Reset the scanner variables and open the supplied charmap file.
 117  */
 118 void
 119 reset_scanner(const char *fname)
 120 {
 121         input = fopen(fname, "r");
 122         if (input == NULL) {
 123                 perror("fopen");
 124                 exit(4);
 125         }
 126 
 127         filename = fname;
 128         com_char = '#';
 129         esc_char = '\\';
 130         instring = 0;
 131         escaped = 0;
 132         lineno = 1;
 133         nextline = 1;
 134         tokidx = 0;
 135         wideidx = 0;
 136 }
 137 
 138 static int
 139 scanc(void)
 140 {
 141         int     c;
 142 
 143         c = getc(input);
 144         lineno = nextline;
 145         if (c == '\n') {
 146                 nextline++;
 147         }
 148 
 149         return (c);
 150 }
 151 
 152 static void
 153 unscanc(int c)
 154 {
 155         if (c == '\n') {
 156                 nextline--;
 157         }
 158 
 159         if (ungetc(c, input) < 0) {
 160                 yyerror(_("ungetc failed"));
 161         }
 162 }
 163 
 164 static int
 165 scan_hex_byte(void)
 166 {
 167         int     c1, c2;
 168         int     v;
 169 
 170         c1 = scanc();
 171         if (!isxdigit(c1)) {
 172                 yyerror(_("malformed hex digit"));
 173                 return (0);
 174         }
 175         c2 = scanc();
 176         if (!isxdigit(c2)) {
 177                 yyerror(_("malformed hex digit"));
 178                 return (0);
 179         }
 180         v = ((hex(c1) << 4) | hex(c2));
 181         return (v);
 182 }
 183 
 184 static int
 185 scan_dec_byte(void)
 186 {
 187         int     c1, c2, c3;
 188         int     b;
 189 
 190         c1 = scanc();
 191         if (!isdigit(c1)) {
 192                 yyerror(_("malformed decimal digit"));
 193                 return (0);
 194         }
 195         b = c1 - '0';
 196         c2 = scanc();
 197         if (!isdigit(c2)) {
 198                 yyerror(_("malformed decimal digit"));
 199                 return (0);
 200         }
 201         b *= 10;
 202         b += (c2 - '0');
 203         c3 = scanc();
 204         if (!isdigit(c3)) {
 205                 unscanc(c3);
 206         } else {
 207                 b *= 10;
 208                 b += (c3 - '0');
 209         }
 210         return (b);
 211 }
 212 
 213 static int
 214 scan_oct_byte(void)
 215 {
 216         int     c1, c2, c3;
 217         int     b;
 218 
 219         b = 0;
 220 
 221         c1 = scanc();
 222         if (!isodigit(c1)) {
 223                 yyerror(_("malformed octal digit"));
 224                 return (0);
 225         }
 226         b = c1 - '0';
 227         c2 = scanc();
 228         if (!isodigit(c2)) {
 229                 yyerror(_("malformed octal digit"));
 230                 return (0);
 231         }
 232         b *= 8;
 233         b += (c2 - '0');
 234         c3 = scanc();
 235         if (!isodigit(c3)) {
 236                 unscanc(c3);
 237         } else {
 238                 b *= 8;
 239                 b += (c3 - '0');
 240         }
 241         return (b);
 242 }
 243 
 244 void
 245 add_tok(int c)
 246 {
 247         if ((tokidx + 1) >= toksz) {
 248                 toksz += 64;
 249 
 250                 if ((token = realloc(token, toksz)) == NULL) {
 251                         yyerror(_("out of memory"));
 252                         tokidx = 0;
 253                         toksz = 0;
 254                         return;
 255                 }
 256         }
 257 
 258         token[tokidx++] = (char)c;
 259         token[tokidx] = 0;
 260 }
 261 
 262 void
 263 add_wcs(wchar_t c)
 264 {
 265         if ((wideidx + 1) >= widesz) {
 266                 widesz += 64;
 267                 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
 268                 if (widestr == NULL) {
 269                         yyerror(_("out of memory"));
 270                         wideidx = 0;
 271                         widesz = 0;
 272                         return;
 273                 }
 274         }
 275 
 276         widestr[wideidx++] = c;
 277         widestr[wideidx] = 0;
 278 }
 279 
 280 wchar_t *
 281 get_wcs(void)
 282 {
 283         wchar_t *ws = widestr;
 284 
 285         wideidx = 0;
 286         widestr = NULL;
 287         widesz = 0;
 288 
 289         if (ws == NULL) {
 290                 if ((ws = wsdup(L"")) == NULL) {
 291                         yyerror(_("out of memory"));
 292                 }
 293         }
 294 
 295         return (ws);
 296 }
 297 
 298 static int
 299 get_byte(void)
 300 {
 301         int     c;
 302 
 303         if ((c = scanc()) != esc_char) {
 304                 unscanc(c);
 305                 return (EOF);
 306         }
 307 
 308         c = scanc();
 309 
 310         switch (c) {
 311         case 'd':
 312         case 'D':
 313                 return (scan_dec_byte());
 314         case 'x':
 315         case 'X':
 316                 return (scan_hex_byte());
 317         case '0' ... '7':
 318                 /* Put the character back so we can get it */
 319                 unscanc(c);
 320                 return (scan_oct_byte());
 321         default:
 322                 unscanc(c);
 323                 unscanc(esc_char);
 324                 return (EOF);
 325         }
 326 }
 327 
 328 int
 329 get_escaped(int c)
 330 {
 331         switch (c) {
 332         case 'n':
 333                 return ('\n');
 334         case 'r':
 335                 return ('\r');
 336         case 't':
 337                 return ('\t');
 338         case 'f':
 339                 return ('\f');
 340         case 'v':
 341                 return ('\v');
 342         case 'b':
 343                 return ('\b');
 344         case 'a':
 345                 return ('\a');
 346         default:
 347                 return (c);
 348         }
 349 }
 350 
 351 int
 352 get_wide(void)
 353 {
 354         char    mbs[MB_LEN_MAX + 1] = "";
 355         int     mbi = 0;
 356         int     c;
 357         wchar_t wc;
 358 
 359         if (mb_cur_max >= sizeof (mbs)) {
 360                 yyerror(_("max multibyte character size too big"));
 361                 mbi = 0;
 362                 return (T_NULL);
 363         }
 364 
 365         for (;;) {
 366                 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
 367                         /*
 368                          * End of the byte sequence reached, but no
 369                          * valid wide decoding.  Fatal error.
 370                          */
 371                         mbi = 0;
 372                         yyerror(_("not a valid character encoding"));
 373                         return (T_NULL);
 374                 }
 375 
 376                 mbs[mbi++] = c;
 377                 mbs[mbi] = 0;
 378 
 379                 if (mbi == mb_cur_max) {
 380                         break;
 381                 }
 382         }
 383 
 384         mbi = 0;
 385         /* XXX */
 386         yylval.wc = (uint8_t)*mbs;
 387 
 388         return (T_CHAR);
 389 }
 390 
 391 int
 392 get_symbol(void)
 393 {
 394         int     c;
 395 
 396         while ((c = scanc()) != EOF) {
 397                 if (escaped == 1) {
 398                         escaped = 0;
 399                         if (c == '\n') {
 400                                 continue;
 401                         }
 402 
 403                         add_tok(get_escaped(c));
 404                         continue;
 405                 }
 406 
 407                 if (c == esc_char) {
 408                         escaped = 1;
 409                         continue;
 410                 }
 411 
 412                 if (c == '\n') {        /* Well that's strange! */
 413                         yyerror(_("unterminated symbolic name"));
 414                         continue;
 415                 }
 416 
 417                 if (c == '>') {              /* End of symbol */
 418                         /*
 419                          * This restarts the token from the beginning
 420                          * the next time we scan a character.  (This
 421                          * token is complete.)
 422                          */
 423                         if (token == NULL) {
 424                                 yyerror(_("missing symbolic name"));
 425                                 return (T_NULL);
 426                         }
 427 
 428                         tokidx = 0;
 429 
 430                         /*
 431                          * A few symbols are handled as keywords outside
 432                          * of the normal categories.
 433                          */
 434                         if (category == T_END) {
 435                                 int     i;
 436 
 437                                 for (i = 0; symwords[i].name != 0; i++) {
 438                                         if (strcmp(token, symwords[i].name) ==
 439                                             0) {
 440                                                 last_kw = symwords[i].id;
 441                                                 return (last_kw);
 442                                         }
 443                                 }
 444                         }
 445 
 446                         /* XXX */
 447 
 448                         /* Its an undefined symbol */
 449                         yylval.token = strdup(token);
 450                         token = NULL;
 451                         toksz = 0;
 452                         tokidx = 0;
 453 printf("returning SYMBOL %s\n", yylval.token);
 454                         return (T_SYMBOL);
 455                 }
 456 
 457                 add_tok(c);
 458         }
 459 
 460         yyerror(_("unterminated symbolic name"));
 461 
 462         return (EOF);
 463 }
 464 
 465 static int
 466 consume_token(void)
 467 {
 468         int     len = tokidx;
 469         int     i;
 470 
 471         tokidx = 0;
 472         if (token == NULL) {
 473                 return (T_NULL);
 474         }
 475 
 476         /*
 477          * This one is special, because we don't want it to alter the
 478          * last_kw field.
 479          */
 480         if (strcmp(token, "...") == 0) {
 481                 return (T_ELLIPSIS);
 482         }
 483 
 484         /* Search for reserved words first */
 485         for (i = 0; keywords[i].name; i++) {
 486                 int     j;
 487 
 488                 if (strcmp(keywords[i].name, token)) {
 489                         continue;
 490                 }
 491 
 492                 last_kw = keywords[i].id;
 493 
 494                 /* Clear the top level category if we're done with it */
 495                 if (last_kw == T_END) {
 496                         category = T_END;
 497                 }
 498 
 499                 /* Set the top level category if we're changing */
 500                 for (j = 0; categories[j]; j++) {
 501                         if (categories[j] != last_kw) {
 502                                 continue;
 503                         }
 504                         category = last_kw;
 505                 }
 506 
 507                 return (keywords[i].id);
 508         }
 509 
 510         /* Maybe its a numeric constant? */
 511         if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
 512                 char    *eptr;
 513 
 514                 yylval.num = strtol(token, &eptr, 10);
 515                 if (*eptr != 0) {
 516                         yyerror(_("malformed number"));
 517                 }
 518 
 519                 return (T_NUMBER);
 520         }
 521 
 522         /*
 523          * A single lone character is treated as a character literal.
 524          * To avoid duplication of effort, we stick in the charmap.
 525          */
 526         if (len == 1) {
 527                 yylval.wc = token[0];
 528                 return (T_CHAR);
 529         }
 530 
 531         /* Anything else is treated as a symbolic name */
 532         yylval.token = strdup(token);
 533         token = NULL;
 534         toksz = 0;
 535         tokidx = 0;
 536 
 537         return (T_NAME);
 538 }
 539 
 540 void
 541 scan_to_eol(void)
 542 {
 543         int     c;
 544 
 545         while ((c = scanc()) != '\n') {
 546                 if (c == EOF) {
 547                         /* end of file without newline! */
 548                         errf(_("missing newline"));
 549                         return;
 550                 }
 551         }
 552 
 553         assert(c == '\n');
 554 }
 555 
 556 int
 557 yylex(void)
 558 {
 559         int     c;
 560 
 561         while ((c = scanc()) != EOF) {
 562 printf("--- yylex --%c--\n", c);
 563 
 564                 /* Special handling for quoted strings */
 565                 if (instring == 1) {
 566                         if (escaped == 1) {
 567                                 escaped = 0;
 568 
 569                                 /* If newline, just eat and forget it */
 570                                 if (c == '\n') {
 571                                         continue;
 572                                 }
 573 
 574                                 if (strchr("xd01234567", c)) {
 575                                         unscanc(c);
 576                                         unscanc(esc_char);
 577                                         return (get_wide());
 578                                 }
 579 
 580                                 yylval.wc = get_escaped(c);
 581                                 return (T_CHAR);
 582                         }
 583 
 584                         if (c == esc_char) {
 585                                 escaped = 1;
 586                                 continue;
 587                         }
 588 
 589                         switch (c) {
 590                         case '<':
 591                                 return (get_symbol());
 592                         case '>':
 593                                 /* Opps! Should generate syntax error */
 594                                 return (T_GT);
 595                         case '"':
 596                                 instring = 0;
 597                                 return (T_QUOTE);
 598                         default:
 599                                 yylval.wc = c;
 600                                 return (T_CHAR);
 601                         }
 602                 }
 603 
 604                 /* Escaped characters first */
 605                 if (escaped == 1) {
 606                         escaped = 0;
 607                         if (c == '\n') {
 608                                 /* Eat the newline */
 609                                 continue;
 610                         }
 611                         hadtok = 1;
 612                         if (tokidx != 0) {
 613                                 /* An escape mid-token is nonsense */
 614                                 return (T_NULL);
 615                         }
 616 
 617                         /* Numeric escapes are treated as wide characters */
 618                         if (strchr("xXd01234567", c)) {
 619                                 unscanc(c);
 620                                 unscanc(esc_char);
 621                                 return (get_wide());
 622                         }
 623 
 624                         add_tok(get_escaped(c));
 625                         continue;
 626                 }
 627 
 628                 /* If it is the escape character itself note it */
 629                 if (c == esc_char) {
 630                         escaped = 1;
 631                         continue;
 632                 }
 633 
 634                 /* Remove from the comment character to end of line */
 635                 if (c == com_char) {
 636                         while (c != '\n') {
 637                                 if ((c = scanc()) == EOF) {
 638                                         /* End of file without newline */
 639                                         return (EOF);
 640                                 }
 641                         }
 642 
 643                         assert(c == '\n');
 644 
 645                         if (hadtok == 0) {
 646                                 /*
 647                                  * If there were no tokens on this line,
 648                                  * then just pretend it didn't exist at all.
 649                                  */
 650                                 continue;
 651                         }
 652 
 653                         hadtok = 0;
 654                         return (T_NL);
 655                 }
 656 
 657                 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
 658                         /*
 659                          * These are all token delimiters.  If there
 660                          * is a token already in progress, we need to
 661                          * process it.
 662                          */
 663                         unscanc(c);
 664                         return (consume_token());
 665                 }
 666 
 667                 switch (c) {
 668                 case '\n':
 669                         if (hadtok == 0) {
 670                                 /*
 671                                  * If the line was completely devoid of tokens,
 672                                  * then just ignore it.
 673                                  */
 674                                 continue;
 675                         }
 676 
 677                         /* We're starting a new line, reset the token state */
 678                         hadtok = 0;
 679                         return (T_NL);
 680                 case '>':
 681                         hadtok = 1;
 682                         return (T_GT);
 683                 case '<':
 684                         /* Symbol start! */
 685                         hadtok = 1;
 686                         return (get_symbol());
 687                 case ' ':
 688                 case '\t':
 689                         /* Whitespace, just ignore */
 690                         continue;
 691                 case '"':
 692                         hadtok = 1;
 693                         instring = 1;
 694                         return (T_QUOTE);
 695                 default:
 696 //printf("--- adding %c to token\n", c);
 697                         hadtok = 1;
 698                         add_tok(c);
 699                         continue;
 700                 }
 701         }
 702 
 703         return (EOF);
 704 }
 705 
 706 void
 707 yyerror(const char *msg)
 708 {
 709         (void) fprintf(stderr, _("%s: %d: error: %s\n"),
 710             filename, lineno, msg);
 711         exit(4);
 712 }
 713 
 714 void
 715 errf(const char *fmt, ...)
 716 {
 717         char    *msg;
 718         va_list va;
 719 
 720         va_start(va, fmt);
 721         (void) vasprintf(&msg, fmt, va);
 722         va_end(va);
 723 
 724         (void) fprintf(stderr, _("%s: %d: error: %s\n"),
 725             filename, lineno, msg);
 726         free(msg);
 727         exit(4);
 728 }