1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * This file contains the "scanner", which tokenizes charmap files 18 * for iconv for processing by the higher level grammar processor. 19 */ 20 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <ctype.h> 24 #include <limits.h> 25 #include <string.h> 26 #include <widec.h> 27 #include <sys/types.h> 28 #include <assert.h> 29 #include "charmap.h" 30 #include "parser.tab.h" 31 32 int com_char = '#'; 33 int esc_char = '\\'; 34 int mb_cur_min = 1; 35 int mb_cur_max = 1; 36 int lineno = 1; 37 int warnings = 0; 38 static int nextline; 39 static FILE *input = stdin; 40 static const char *filename = "<stdin>"; 41 static int instring = 0; 42 static int escaped = 0; 43 44 /* 45 * Token space ... grows on demand. 46 */ 47 static char *token = NULL; 48 static int tokidx; 49 static int toksz = 0; 50 static int hadtok = 0; 51 52 /* 53 * The last keyword seen. This is useful to trigger the special lexer rules 54 * for "copy" and also collating symbols and elements. 55 */ 56 int last_kw = 0; 57 static int category = T_END; 58 59 static struct token { 60 int id; 61 const char *name; 62 } keywords[] = { 63 { T_COM_CHAR, "comment_char" }, 64 { T_ESC_CHAR, "escape_char" }, 65 { T_END, "END" }, 66 67 /* 68 * These are keywords used in the charmap file. Note that 69 * Solaris orginally used angle brackets to wrap some of them, 70 * but we removed that to simplify our parser. The first of these 71 * items are "global items." 72 */ 73 { T_CHARMAP, "CHARMAP" }, 74 { T_WIDTH, "WIDTH" }, 75 { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" }, 76 77 { -1, NULL }, 78 }; 79 80 /* 81 * These special words are only used in a charmap file, enclosed in <>. 82 */ 83 static struct token symwords[] = { 84 { T_COM_CHAR, "comment_char" }, 85 { T_ESC_CHAR, "escape_char" }, 86 { T_CODE_SET, "code_set_name" }, 87 { T_MB_CUR_MAX, "mb_cur_max" }, 88 { T_MB_CUR_MIN, "mb_cur_min" }, 89 { -1, NULL }, 90 }; 91 92 static int categories[] = { 93 T_CHARMAP, 94 0 95 }; 96 97 void 98 reset_scanner(const char *fname) 99 { 100 if (fname == NULL) { 101 filename = "<stdin>"; 102 input = stdin; 103 } else { 104 if (input != stdin) 105 (void) fclose(input); 106 if ((input = fopen(fname, "r")) == NULL) { 107 perror(fname); 108 exit(1); 109 } 110 filename = fname; 111 } 112 com_char = '#'; 113 esc_char = '\\'; 114 instring = 0; 115 escaped = 0; 116 lineno = 1; 117 nextline = 1; 118 tokidx = 0; 119 last_kw = 0; 120 category = T_END; 121 } 122 123 #define hex(x) \ 124 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) 125 #define isodigit(x) ((x >= '0') && (x <= '7')) 126 127 static int 128 scanc(void) 129 { 130 int c; 131 132 c = getc(input); 133 lineno = nextline; 134 if (c == '\n') { 135 nextline++; 136 } 137 return (c); 138 } 139 140 static void 141 unscanc(int c) 142 { 143 if (c == '\n') { 144 nextline--; 145 } 146 if (ungetc(c, input) < 0) { 147 yyerror(_("ungetc failed")); 148 } 149 } 150 151 static int 152 scan_hex_byte(void) 153 { 154 int c1, c2; 155 int v; 156 157 c1 = scanc(); 158 if (!isxdigit(c1)) { 159 yyerror(_("malformed hex digit")); 160 return (0); 161 } 162 c2 = scanc(); 163 if (!isxdigit(c2)) { 164 yyerror(_("malformed hex digit")); 165 return (0); 166 } 167 v = ((hex(c1) << 4) | hex(c2)); 168 return (v); 169 } 170 171 static int 172 scan_dec_byte(void) 173 { 174 int c1, c2, c3; 175 int b; 176 177 c1 = scanc(); 178 if (!isdigit(c1)) { 179 yyerror(_("malformed decimal digit")); 180 return (0); 181 } 182 b = c1 - '0'; 183 c2 = scanc(); 184 if (!isdigit(c2)) { 185 yyerror(_("malformed decimal digit")); 186 return (0); 187 } 188 b *= 10; 189 b += (c2 - '0'); 190 c3 = scanc(); 191 if (!isdigit(c3)) { 192 unscanc(c3); 193 } else { 194 b *= 10; 195 b += (c3 - '0'); 196 } 197 return (b); 198 } 199 200 static int 201 scan_oct_byte(void) 202 { 203 int c1, c2, c3; 204 int b; 205 206 b = 0; 207 208 c1 = scanc(); 209 if (!isodigit(c1)) { 210 yyerror(_("malformed octal digit")); 211 return (0); 212 } 213 b = c1 - '0'; 214 c2 = scanc(); 215 if (!isodigit(c2)) { 216 yyerror(_("malformed octal digit")); 217 return (0); 218 } 219 b *= 8; 220 b += (c2 - '0'); 221 c3 = scanc(); 222 if (!isodigit(c3)) { 223 unscanc(c3); 224 } else { 225 b *= 8; 226 b += (c3 - '0'); 227 } 228 return (b); 229 } 230 231 void 232 add_tok(int c) 233 { 234 if ((tokidx + 1) >= toksz) { 235 toksz += 64; 236 if ((token = realloc(token, toksz)) == NULL) { 237 yyerror(_("out of memory")); 238 tokidx = 0; 239 toksz = 0; 240 return; 241 } 242 } 243 244 token[tokidx++] = (char)c; 245 token[tokidx] = 0; 246 } 247 248 static int 249 get_byte(void) 250 { 251 int c; 252 253 if ((c = scanc()) != esc_char) { 254 unscanc(c); 255 return (EOF); 256 } 257 c = scanc(); 258 259 switch (c) { 260 case 'd': 261 case 'D': 262 return (scan_dec_byte()); 263 case 'x': 264 case 'X': 265 return (scan_hex_byte()); 266 case '0': 267 case '1': 268 case '2': 269 case '3': 270 case '4': 271 case '5': 272 case '6': 273 case '7': 274 /* put the character back so we can get it */ 275 unscanc(c); 276 return (scan_oct_byte()); 277 default: 278 unscanc(c); 279 unscanc(esc_char); 280 return (EOF); 281 } 282 } 283 284 int 285 get_escaped(int c) 286 { 287 switch (c) { 288 case 'n': 289 return ('\n'); 290 case 'r': 291 return ('\r'); 292 case 't': 293 return ('\t'); 294 case 'f': 295 return ('\f'); 296 case 'v': 297 return ('\v'); 298 case 'b': 299 return ('\b'); 300 case 'a': 301 return ('\a'); 302 default: 303 return (c); 304 } 305 } 306 307 int 308 get_wide(void) 309 { 310 /* NB: yylval.mbs[0] is the length */ 311 char *mbs = &yylval.mbs[1]; 312 int mbi = 0; 313 int c; 314 315 mbs[mbi] = 0; 316 if (mb_cur_max > MB_LEN_MAX) { 317 yyerror(_("max multibyte character size too big")); 318 return (T_NULL); 319 } 320 for (;;) { 321 if ((c = get_byte()) == EOF) 322 break; 323 if (mbi == mb_cur_max) { 324 unscanc(c); 325 yyerror(_("length > mb_cur_max")); 326 return (T_NULL); 327 } 328 mbs[mbi++] = c; 329 mbs[mbi] = 0; 330 } 331 332 /* result in yylval.mbs */ 333 mbs[-1] = mbi; 334 return (T_CHAR); 335 } 336 337 int 338 get_symbol(void) 339 { 340 int c; 341 342 while ((c = scanc()) != EOF) { 343 if (escaped) { 344 escaped = 0; 345 if (c == '\n') 346 continue; 347 add_tok(get_escaped(c)); 348 continue; 349 } 350 if (c == esc_char) { 351 escaped = 1; 352 continue; 353 } 354 if (c == '\n') { /* well that's strange! */ 355 yyerror(_("unterminated symbolic name")); 356 continue; 357 } 358 if (c == '>') { /* end of symbol */ 359 360 /* 361 * This restarts the token from the beginning 362 * the next time we scan a character. (This 363 * token is complete.) 364 */ 365 366 if (token == NULL) { 367 yyerror(_("missing symbolic name")); 368 return (T_NULL); 369 } 370 tokidx = 0; 371 372 /* 373 * A few symbols are handled as keywords outside 374 * of the normal categories. 375 */ 376 if (category == T_END) { 377 int i; 378 for (i = 0; symwords[i].name != 0; i++) { 379 if (strcmp(token, symwords[i].name) == 380 0) { 381 last_kw = symwords[i].id; 382 return (last_kw); 383 } 384 } 385 } 386 /* its an undefined symbol */ 387 yylval.token = strdup(token); 388 token = NULL; 389 toksz = 0; 390 tokidx = 0; 391 return (T_SYMBOL); 392 } 393 add_tok(c); 394 } 395 396 yyerror(_("unterminated symbolic name")); 397 return (EOF); 398 } 399 400 401 static int 402 consume_token(void) 403 { 404 int len = tokidx; 405 int i; 406 407 tokidx = 0; 408 if (token == NULL) 409 return (T_NULL); 410 411 /* 412 * this one is special, because we don't want it to alter the 413 * last_kw field. 414 */ 415 if (strcmp(token, "...") == 0) { 416 return (T_ELLIPSIS); 417 } 418 419 /* search for reserved words first */ 420 for (i = 0; keywords[i].name; i++) { 421 int j; 422 if (strcmp(keywords[i].name, token) != 0) { 423 continue; 424 } 425 426 last_kw = keywords[i].id; 427 428 /* clear the top level category if we're done with it */ 429 if (last_kw == T_END) { 430 category = T_END; 431 } 432 433 /* set the top level category if we're changing */ 434 for (j = 0; categories[j]; j++) { 435 if (categories[j] != last_kw) 436 continue; 437 category = last_kw; 438 } 439 440 return (keywords[i].id); 441 } 442 443 /* maybe its a numeric constant? */ 444 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { 445 char *eptr; 446 yylval.num = strtol(token, &eptr, 10); 447 if (*eptr != 0) 448 yyerror(_("malformed number")); 449 return (T_NUMBER); 450 } 451 452 /* 453 * A single lone character is treated as a character literal. 454 * To avoid duplication of effort, we stick in the charmap. 455 */ 456 if (len == 1) { 457 yylval.mbs[0] = 1; /* length */ 458 yylval.mbs[1] = token[0]; 459 yylval.mbs[2] = '\0'; 460 return (T_CHAR); 461 } 462 463 /* anything else is treated as a symbolic name */ 464 yylval.token = strdup(token); 465 token = NULL; 466 toksz = 0; 467 tokidx = 0; 468 return (T_NAME); 469 } 470 471 void 472 scan_to_eol(void) 473 { 474 int c; 475 while ((c = scanc()) != '\n') { 476 if (c == EOF) { 477 /* end of file without newline! */ 478 errf(_("missing newline")); 479 return; 480 } 481 } 482 assert(c == '\n'); 483 } 484 485 int 486 yylex(void) 487 { 488 int c; 489 490 while ((c = scanc()) != EOF) { 491 492 /* special handling for quoted string */ 493 if (instring) { 494 if (escaped) { 495 escaped = 0; 496 497 /* if newline, just eat and forget it */ 498 if (c == '\n') 499 continue; 500 501 if (strchr("xXd01234567", c)) { 502 unscanc(c); 503 unscanc(esc_char); 504 return (get_wide()); 505 } 506 yylval.mbs[0] = 1; /* length */ 507 yylval.mbs[1] = get_escaped(c); 508 yylval.mbs[2] = '\0'; 509 return (T_CHAR); 510 } 511 if (c == esc_char) { 512 escaped = 1; 513 continue; 514 } 515 switch (c) { 516 case '<': 517 return (get_symbol()); 518 case '>': 519 /* oops! should generate syntax error */ 520 return (T_GT); 521 case '"': 522 instring = 0; 523 return (T_QUOTE); 524 default: 525 yylval.mbs[0] = 1; /* length */ 526 yylval.mbs[1] = c; 527 yylval.mbs[2] = '\0'; 528 return (T_CHAR); 529 } 530 } 531 532 /* escaped characters first */ 533 if (escaped) { 534 escaped = 0; 535 if (c == '\n') { 536 /* eat the newline */ 537 continue; 538 } 539 hadtok = 1; 540 if (tokidx) { 541 /* an escape mid-token is nonsense */ 542 return (T_NULL); 543 } 544 545 /* numeric escapes are treated as wide characters */ 546 if (strchr("xXd01234567", c)) { 547 unscanc(c); 548 unscanc(esc_char); 549 return (get_wide()); 550 } 551 552 add_tok(get_escaped(c)); 553 continue; 554 } 555 556 /* if it is the escape charter itself note it */ 557 if (c == esc_char) { 558 escaped = 1; 559 continue; 560 } 561 562 /* remove from the comment char to end of line */ 563 if (c == com_char) { 564 while (c != '\n') { 565 if ((c = scanc()) == EOF) { 566 /* end of file without newline! */ 567 return (EOF); 568 } 569 } 570 assert(c == '\n'); 571 if (!hadtok) { 572 /* 573 * If there were no tokens on this line, 574 * then just pretend it didn't exist at all. 575 */ 576 continue; 577 } 578 hadtok = 0; 579 return (T_NL); 580 } 581 582 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { 583 /* 584 * These are all token delimiters. If there 585 * is a token already in progress, we need to 586 * process it. 587 */ 588 unscanc(c); 589 return (consume_token()); 590 } 591 592 switch (c) { 593 case '\n': 594 if (!hadtok) { 595 /* 596 * If the line was completely devoid of tokens, 597 * then just ignore it. 598 */ 599 continue; 600 } 601 /* we're starting a new line, reset the token state */ 602 hadtok = 0; 603 return (T_NL); 604 case ',': 605 hadtok = 1; 606 return (T_COMMA); 607 case ';': 608 hadtok = 1; 609 return (T_SEMI); 610 case '(': 611 hadtok = 1; 612 return (T_LPAREN); 613 case ')': 614 hadtok = 1; 615 return (T_RPAREN); 616 case '>': 617 hadtok = 1; 618 return (T_GT); 619 case '<': 620 /* symbol start! */ 621 hadtok = 1; 622 return (get_symbol()); 623 case ' ': 624 case '\t': 625 /* whitespace, just ignore it */ 626 continue; 627 case '"': 628 hadtok = 1; 629 instring = 1; 630 return (T_QUOTE); 631 default: 632 hadtok = 1; 633 add_tok(c); 634 continue; 635 } 636 } 637 return (EOF); 638 } 639 640 void 641 yyerror(const char *msg) 642 { 643 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 644 filename, lineno, msg); 645 exit(1); 646 } 647 648 void 649 errf(const char *fmt, ...) 650 { 651 char *msg; 652 653 va_list va; 654 va_start(va, fmt); 655 (void) vasprintf(&msg, fmt, va); 656 va_end(va); 657 658 (void) fprintf(stderr, _("%s: %d: error: %s\n"), 659 filename, lineno, msg); 660 free(msg); 661 exit(1); 662 } 663 664 void 665 warn(const char *fmt, ...) 666 { 667 char *msg; 668 669 va_list va; 670 va_start(va, fmt); 671 (void) vasprintf(&msg, fmt, va); 672 va_end(va); 673 674 (void) fprintf(stderr, _("%s: %d: warning: %s\n"), 675 filename, lineno, msg); 676 free(msg); 677 warnings++; 678 }