1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 * Copyright 2013 David Hoeppner. All rights reserved. 15 */ 16 17 /* 18 * Functions to charmap . 19 */ 20 21 #include <assert.h> 22 #include <ctype.h> 23 #include <limits.h> 24 #include <widec.h> 25 26 #include "iconv.h" 27 #include "parser.tab.h" 28 29 /* 30 * Helper macros. 31 */ 32 #define hex(x) \ 33 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) 34 35 #define isodigit(x) ((x >= '0') && (x <= '7')) 36 37 /* 38 * Charmap specific. 39 */ 40 int com_char = '#'; 41 int esc_char = '\\'; 42 int mb_cur_max = 1; 43 int mb_cur_min = 1; 44 45 int lineno = 1; 46 static FILE *input = stdin; 47 static const char *filename = "<stdin>"; 48 static int escaped = 0; 49 static int instring = 0; 50 static int nextline; 51 52 /* 53 * Tokens. 54 */ 55 static char *token = NULL; 56 static int tokidx; 57 static int toksz = 0; 58 static int hadtok = 0; 59 60 /* 61 * Wide strings. 62 */ 63 static wchar_t *widestr = NULL; 64 static int wideidx = 0; 65 static int widesz = 0; 66 67 /* 68 * Keywords related. 69 */ 70 int last_kw = 0; 71 static int category = T_END; 72 73 static struct token { 74 int id; 75 const char *name; 76 } keywords[] = { 77 { T_COM_CHAR, "comment_char" }, 78 { T_ESC_CHAR, "escape_char" }, 79 { T_END, "END" }, 80 { T_CHARMAP, "CHARMAP" }, 81 { T_WIDTH, "WIDTH" }, 82 { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" }, 83 { -1, NULL }, 84 }; 85 86 /* 87 * Charmap reserved keywords. 88 */ 89 static struct token symwords[] = { 90 { T_COM_CHAR, "comment_char" }, 91 { T_ESC_CHAR, "escape_char" }, 92 { T_CODE_SET, "code_set_name" }, 93 { T_MB_CUR_MAX, "mb_cur_max" }, 94 { T_MB_CUR_MIN, "mb_cur_min" }, 95 { -1, NULL }, 96 }; 97 98 static int categories[] = { 99 T_CHARMAP, 100 T_WIDTH, 101 0, 102 }; 103 104 char * 105 to_mb_string(const wchar_t *wcs) 106 { 107 return (NULL); 108 } 109 110 void 111 set_wide_encoding(const char *encoding) 112 { 113 } 114 115 /* 116 * Reset the scanner variables and open the supplied charmap file. 117 */ 118 void 119 reset_scanner(const char *fname) 120 { 121 input = fopen(fname, "r"); 122 if (input == NULL) { 123 perror("fopen"); 124 exit(4); 125 } 126 127 filename = fname; 128 com_char = '#'; 129 esc_char = '\\'; 130 instring = 0; 131 escaped = 0; 132 lineno = 1; 133 nextline = 1; 134 tokidx = 0; 135 wideidx = 0; 136 } 137 138 static int 139 scanc(void) 140 { 141 int c; 142 143 c = getc(input); 144 lineno = nextline; 145 if (c == '\n') { 146 nextline++; 147 } 148 149 return (c); 150 } 151 152 static void 153 unscanc(int c) 154 { 155 if (c == '\n') { 156 nextline--; 157 } 158 159 if (ungetc(c, input) < 0) { 160 yyerror(_("ungetc failed")); 161 } 162 } 163 164 static int 165 scan_hex_byte(void) 166 { 167 int c1, c2; 168 int v; 169 170 c1 = scanc(); 171 if (!isxdigit(c1)) { 172 yyerror(_("malformed hex digit")); 173 return (0); 174 } 175 c2 = scanc(); 176 if (!isxdigit(c2)) { 177 yyerror(_("malformed hex digit")); 178 return (0); 179 } 180 v = ((hex(c1) << 4) | hex(c2)); 181 return (v); 182 } 183 184 static int 185 scan_dec_byte(void) 186 { 187 int c1, c2, c3; 188 int b; 189 190 c1 = scanc(); 191 if (!isdigit(c1)) { 192 yyerror(_("malformed decimal digit")); 193 return (0); 194 } 195 b = c1 - '0'; 196 c2 = scanc(); 197 if (!isdigit(c2)) { 198 yyerror(_("malformed decimal digit")); 199 return (0); 200 } 201 b *= 10; 202 b += (c2 - '0'); 203 c3 = scanc(); 204 if (!isdigit(c3)) { 205 unscanc(c3); 206 } else { 207 b *= 10; 208 b += (c3 - '0'); 209 } 210 return (b); 211 } 212 213 static int 214 scan_oct_byte(void) 215 { 216 int c1, c2, c3; 217 int b; 218 219 b = 0; 220 221 c1 = scanc(); 222 if (!isodigit(c1)) { 223 yyerror(_("malformed octal digit")); 224 return (0); 225 } 226 b = c1 - '0'; 227 c2 = scanc(); 228 if (!isodigit(c2)) { 229 yyerror(_("malformed octal digit")); 230 return (0); 231 } 232 b *= 8; 233 b += (c2 - '0'); 234 c3 = scanc(); 235 if (!isodigit(c3)) { 236 unscanc(c3); 237 } else { 238 b *= 8; 239 b += (c3 - '0'); 240 } 241 return (b); 242 } 243 244 void 245 add_tok(int c) 246 { 247 if ((tokidx + 1) >= toksz) { 248 toksz += 64; 249 250 if ((token = realloc(token, toksz)) == NULL) { 251 yyerror(_("out of memory")); 252 tokidx = 0; 253 toksz = 0; 254 return; 255 } 256 } 257 258 token[tokidx++] = (char)c; 259 token[tokidx] = 0; 260 } 261 262 void 263 add_wcs(wchar_t c) 264 { 265 if ((wideidx + 1) >= widesz) { 266 widesz += 64; 267 widestr = realloc(widestr, (widesz * sizeof (wchar_t))); 268 if (widestr == NULL) { 269 yyerror(_("out of memory")); 270 wideidx = 0; 271 widesz = 0; 272 return; 273 } 274 } 275 276 widestr[wideidx++] = c; 277 widestr[wideidx] = 0; 278 } 279 280 wchar_t * 281 get_wcs(void) 282 { 283 wchar_t *ws = widestr; 284 285 wideidx = 0; 286 widestr = NULL; 287 widesz = 0; 288 289 if (ws == NULL) { 290 if ((ws = wsdup(L"")) == NULL) { 291 yyerror(_("out of memory")); 292 } 293 } 294 295 return (ws); 296 } 297 298 static int 299 get_byte(void) 300 { 301 int c; 302 303 if ((c = scanc()) != esc_char) { 304 unscanc(c); 305 return (EOF); 306 } 307 308 c = scanc(); 309 310 switch (c) { 311 case 'd': 312 case 'D': 313 return (scan_dec_byte()); 314 case 'x': 315 case 'X': 316 return (scan_hex_byte()); 317 case '0' ... '7': 318 /* Put the character back so we can get it */ 319 unscanc(c); 320 return (scan_oct_byte()); 321 default: 322 unscanc(c); 323 unscanc(esc_char); 324 return (EOF); 325 } 326 } 327 328 int 329 get_escaped(int c) 330 { 331 switch (c) { 332 case 'n': 333 return ('\n'); 334 case 'r': 335 return ('\r'); 336 case 't': 337 return ('\t'); 338 case 'f': 339 return ('\f'); 340 case 'v': 341 return ('\v'); 342 case 'b': 343 return ('\b'); 344 case 'a': 345 return ('\a'); 346 default: 347 return (c); 348 } 349 } 350 351 int 352 get_wide(void) 353 { 354 char mbs[MB_LEN_MAX + 1] = ""; 355 int mbi = 0; 356 int c; 357 wchar_t wc; 358 359 if (mb_cur_max >= sizeof (mbs)) { 360 yyerror(_("max multibyte character size too big")); 361 mbi = 0; 362 return (T_NULL); 363 } 364 365 for (;;) { 366 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) { 367 /* 368 * End of the byte sequence reached, but no 369 * valid wide decoding. Fatal error. 370 */ 371 mbi = 0; 372 yyerror(_("not a valid character encoding")); 373 return (T_NULL); 374 } 375 376 mbs[mbi++] = c; 377 mbs[mbi] = 0; 378 379 if (mbi == mb_cur_max) { 380 break; 381 } 382 } 383 384 mbi = 0; 385 /* XXX */ 386 yylval.wc = (uint8_t)*mbs; 387 388 return (T_CHAR); 389 } 390 391 int 392 get_symbol(void) 393 { 394 int c; 395 396 while ((c = scanc()) != EOF) { 397 if (escaped == 1) { 398 escaped = 0; 399 if (c == '\n') { 400 continue; 401 } 402 403 add_tok(get_escaped(c)); 404 continue; 405 } 406 407 if (c == esc_char) { 408 escaped = 1; 409 continue; 410 } 411 412 if (c == '\n') { /* Well that's strange! */ 413 yyerror(_("unterminated symbolic name")); 414 continue; 415 } 416 417 if (c == '>') { /* End of symbol */ 418 /* 419 * This restarts the token from the beginning 420 * the next time we scan a character. (This 421 * token is complete.) 422 */ 423 if (token == NULL) { 424 yyerror(_("missing symbolic name")); 425 return (T_NULL); 426 } 427 428 tokidx = 0; 429 430 /* 431 * A few symbols are handled as keywords outside 432 * of the normal categories. 433 */ 434 if (category == T_END) { 435 int i; 436 437 for (i = 0; symwords[i].name != 0; i++) { 438 if (strcmp(token, symwords[i].name) == 439 0) { 440 last_kw = symwords[i].id; 441 return (last_kw); 442 } 443 } 444 } 445 446 /* XXX */ 447 448 /* Its an undefined symbol */ 449 yylval.token = strdup(token); 450 token = NULL; 451 toksz = 0; 452 tokidx = 0; 453 printf("returning SYMBOL %s\n", yylval.token); 454 return (T_SYMBOL); 455 } 456 457 add_tok(c); 458 } 459 460 yyerror(_("unterminated symbolic name")); 461 462 return (EOF); 463 } 464 465 static int 466 consume_token(void) 467 { 468 int len = tokidx; 469 int i; 470 471 tokidx = 0; 472 if (token == NULL) { 473 return (T_NULL); 474 } 475 476 /* 477 * This one is special, because we don't want it to alter the 478 * last_kw field. 479 */ 480 if (strcmp(token, "...") == 0) { 481 return (T_ELLIPSIS); 482 } 483 484 /* Search for reserved words first */ 485 for (i = 0; keywords[i].name; i++) { 486 int j; 487 488 if (strcmp(keywords[i].name, token)) { 489 continue; 490 } 491 492 last_kw = keywords[i].id; 493 494 /* Clear the top level category if we're done with it */ 495 if (last_kw == T_END) { 496 category = T_END; 497 } 498 499 /* Set the top level category if we're changing */ 500 for (j = 0; categories[j]; j++) { 501 if (categories[j] != last_kw) { 502 continue; 503 } 504 category = last_kw; 505 } 506 507 return (keywords[i].id); 508 } 509 510 /* Maybe its a numeric constant? */ 511 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { 512 char *eptr; 513 514 yylval.num = strtol(token, &eptr, 10); 515 if (*eptr != 0) { 516 yyerror(_("malformed number")); 517 } 518 519 return (T_NUMBER); 520 } 521 522 /* 523 * A single lone character is treated as a character literal. 524 * To avoid duplication of effort, we stick in the charmap. 525 */ 526 if (len == 1) { 527 yylval.wc = token[0]; 528 return (T_CHAR); 529 } 530 531 /* Anything else is treated as a symbolic name */ 532 yylval.token = strdup(token); 533 token = NULL; 534 toksz = 0; 535 tokidx = 0; 536 537 return (T_NAME); 538 } 539 540 void 541 scan_to_eol(void) 542 { 543 int c; 544 545 while ((c = scanc()) != '\n') { 546 if (c == EOF) { 547 /* end of file without newline! */ 548 errf(_("missing newline")); 549 return; 550 } 551 } 552 553 assert(c == '\n'); 554 } 555 556 int 557 yylex(void) 558 { 559 int c; 560 561 while ((c = scanc()) != EOF) { 562 printf("--- yylex --%c--\n", c); 563 564 /* Special handling for quoted strings */ 565 if (instring == 1) { 566 if (escaped == 1) { 567 escaped = 0; 568 569 /* If newline, just eat and forget it */ 570 if (c == '\n') { 571 continue; 572 } 573 574 if (strchr("xd01234567", c)) { 575 unscanc(c); 576 unscanc(esc_char); 577 return (get_wide()); 578 } 579 580 yylval.wc = get_escaped(c); 581 return (T_CHAR); 582 } 583 584 if (c == esc_char) { 585 escaped = 1; 586 continue; 587 } 588 589 switch (c) { 590 case '<': 591 return (get_symbol()); 592 case '>': 593 /* Opps! Should generate syntax error */ 594 return (T_GT); 595 case '"': 596 instring = 0; 597 return (T_QUOTE); 598 default: 599 yylval.wc = c; 600 return (T_CHAR); 601 } 602 } 603 604 /* Escaped characters first */ 605 if (escaped == 1) { 606 escaped = 0; 607 if (c == '\n') { 608 /* Eat the newline */ 609 continue; 610 } 611 hadtok = 1; 612 if (tokidx != 0) { 613 /* An escape mid-token is nonsense */ 614 return (T_NULL); 615 } 616 617 /* Numeric escapes are treated as wide characters */ 618 if (strchr("xXd01234567", c)) { 619 unscanc(c); 620 unscanc(esc_char); 621 return (get_wide()); 622 } 623 624 add_tok(get_escaped(c)); 625 continue; 626 } 627 628 /* If it is the escape character itself note it */ 629 if (c == esc_char) { 630 escaped = 1; 631 continue; 632 } 633 634 /* Remove from the comment character to end of line */ 635 if (c == com_char) { 636 while (c != '\n') { 637 if ((c = scanc()) == EOF) { 638 /* End of file without newline */ 639 return (EOF); 640 } 641 } 642 643 assert(c == '\n'); 644 645 if (hadtok == 0) { 646 /* 647 * If there were no tokens on this line, 648 * then just pretend it didn't exist at all. 649 */ 650 continue; 651 } 652 653 hadtok = 0; 654 return (T_NL); 655 } 656 657 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { 658 /* 659 * These are all token delimiters. If there 660 * is a token already in progress, we need to 661 * process it. 662 */ 663 unscanc(c); 664 return (consume_token()); 665 } 666 667 switch (c) { 668 case '\n': 669 if (hadtok == 0) { 670 /* 671 * If the line was completely devoid of tokens, 672 * then just ignore it. 673 */ 674 continue; 675 } 676 677 /* We're starting a new line, reset the token state */ 678 hadtok = 0; 679 return (T_NL); 680 case '>': 681 hadtok = 1; 682 return (T_GT); 683 case '<': 684 /* Symbol start! */ 685 hadtok = 1; 686 return (get_symbol()); 687 case ' ': 688 case '\t': 689 /* Whitespace, just ignore */ 690 continue; 691 case '"': 692 hadtok = 1; 693 instring = 1; 694 return (T_QUOTE); 695 default: 696 //printf("--- adding %c to token\n", c); 697 hadtok = 1; 698 add_tok(c); 699 continue; 700 } 701 } 702 703 return (EOF); 704 } 705 706 void 707 yyerror(const char *msg) 708 { 709 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 710 filename, lineno, msg); 711 exit(4); 712 } 713 714 void 715 errf(const char *fmt, ...) 716 { 717 char *msg; 718 va_list va; 719 720 va_start(va, fmt); 721 (void) vasprintf(&msg, fmt, va); 722 va_end(va); 723 724 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 725 filename, lineno, msg); 726 free(msg); 727 exit(4); 728 }