1 /* 2 * This is a really stupid C tokenizer. It doesn't do any include 3 * files or anything complex at all. That's the preprocessor. 4 * 5 * Copyright (C) 2003 Transmeta Corp. 6 * 2003 Linus Torvalds 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 9 * of this software and associated documentation files (the "Software"), to deal 10 * in the Software without restriction, including without limitation the rights 11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 * copies of the Software, and to permit persons to whom the Software is 13 * furnished to do so, subject to the following conditions: 14 * 15 * The above copyright notice and this permission notice shall be included in 16 * all copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 * THE SOFTWARE. 25 */ 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <stdarg.h> 29 #include <stddef.h> 30 #include <string.h> 31 #include <ctype.h> 32 #include <unistd.h> 33 #include <stdint.h> 34 35 #include "lib.h" 36 #include "allocate.h" 37 #include "token.h" 38 #include "symbol.h" 39 40 #define EOF (-1) 41 42 int input_stream_nr = 0; 43 struct stream *input_streams; 44 static int input_streams_allocated; 45 unsigned int tabstop = 8; 46 int no_lineno = 0; 47 48 #define BUFSIZE (8192) 49 50 typedef struct { 51 int fd, offset, size; 52 int pos, line, nr; 53 int newline, whitespace; 54 struct token **tokenlist; 55 struct token *token; 56 unsigned char *buffer; 57 } stream_t; 58 59 const char *stream_name(int stream) 60 { 61 if (stream < 0 || stream > input_stream_nr) 62 return "<bad stream>"; 63 return input_streams[stream].name; 64 } 65 66 static struct position stream_pos(stream_t *stream) 67 { 68 struct position pos; 69 pos.type = 0; 70 pos.stream = stream->nr; 71 pos.newline = stream->newline; 72 pos.whitespace = stream->whitespace; 73 pos.pos = stream->pos; 74 75 pos.line = stream->line; 76 if (no_lineno) 77 pos.line = 123456; 78 79 pos.noexpand = 0; 80 return pos; 81 } 82 83 const char *show_special(int val) 84 { 85 static char buffer[4]; 86 87 buffer[0] = val; 88 buffer[1] = 0; 89 if (val >= SPECIAL_BASE) 90 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]); 91 return buffer; 92 } 93 94 const char *show_ident(const struct ident *ident) 95 { 96 static char buffer[256]; 97 if (!ident) 98 return "<noident>"; 99 sprintf(buffer, "%.*s", ident->len, ident->name); 100 return buffer; 101 } 102 103 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next) 104 { 105 if (isprint(c)) { 106 if (c == escape || c == '\\') 107 *ptr++ = '\\'; 108 *ptr++ = c; 109 return ptr; 110 } 111 *ptr++ = '\\'; 112 switch (c) { 113 case '\n': 114 *ptr++ = 'n'; 115 return ptr; 116 case '\t': 117 *ptr++ = 't'; 118 return ptr; 119 } 120 if (!isdigit(next)) 121 return ptr + sprintf(ptr, "%o", c); 122 123 return ptr + sprintf(ptr, "%03o", c); 124 } 125 126 const char *show_string(const struct string *string) 127 { 128 static char buffer[4 * MAX_STRING + 3]; 129 char *ptr; 130 int i; 131 132 if (!string->length) 133 return "<bad_string>"; 134 ptr = buffer; 135 *ptr++ = '"'; 136 for (i = 0; i < string->length-1; i++) { 137 const char *p = string->data + i; 138 ptr = charstr(ptr, p[0], '"', p[1]); 139 } 140 *ptr++ = '"'; 141 *ptr = '\0'; 142 return buffer; 143 } 144 145 static const char *show_char(const char *s, size_t len, char prefix, char delim) 146 { 147 static char buffer[MAX_STRING + 4]; 148 char *p = buffer; 149 if (prefix) 150 *p++ = prefix; 151 *p++ = delim; 152 memcpy(p, s, len); 153 p += len; 154 *p++ = delim; 155 *p++ = '\0'; 156 return buffer; 157 } 158 159 static const char *quote_char(const char *s, size_t len, char prefix, char delim) 160 { 161 static char buffer[2*MAX_STRING + 6]; 162 size_t i; 163 char *p = buffer; 164 if (prefix) 165 *p++ = prefix; 166 if (delim == '"') 167 *p++ = '\\'; 168 *p++ = delim; 169 for (i = 0; i < len; i++) { 170 if (s[i] == '"' || s[i] == '\\') 171 *p++ = '\\'; 172 *p++ = s[i]; 173 } 174 if (delim == '"') 175 *p++ = '\\'; 176 *p++ = delim; 177 *p++ = '\0'; 178 return buffer; 179 } 180 181 const char *show_token(const struct token *token) 182 { 183 static char buffer[256]; 184 185 if (!token) 186 return "<no token>"; 187 switch (token_type(token)) { 188 case TOKEN_ERROR: 189 return "syntax error"; 190 191 case TOKEN_EOF: 192 return "end-of-input"; 193 194 case TOKEN_IDENT: 195 return show_ident(token->ident); 196 197 case TOKEN_NUMBER: 198 return token->number; 199 200 case TOKEN_SPECIAL: 201 return show_special(token->special); 202 203 case TOKEN_CHAR: 204 return show_char(token->string->data, 205 token->string->length - 1, 0, '\''); 206 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3: 207 return show_char(token->embedded, 208 token_type(token) - TOKEN_CHAR, 0, '\''); 209 case TOKEN_WIDE_CHAR: 210 return show_char(token->string->data, 211 token->string->length - 1, 'L', '\''); 212 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3: 213 return show_char(token->embedded, 214 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); 215 case TOKEN_STRING: 216 return show_char(token->string->data, 217 token->string->length - 1, 0, '"'); 218 case TOKEN_WIDE_STRING: 219 return show_char(token->string->data, 220 token->string->length - 1, 'L', '"'); 221 222 case TOKEN_STREAMBEGIN: 223 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream)); 224 return buffer; 225 226 case TOKEN_STREAMEND: 227 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream)); 228 return buffer; 229 230 case TOKEN_UNTAINT: 231 sprintf(buffer, "<untaint>"); 232 return buffer; 233 234 case TOKEN_ARG_COUNT: 235 sprintf(buffer, "<argcnt>"); 236 return buffer; 237 238 default: 239 sprintf(buffer, "unhandled token type '%d' ", token_type(token)); 240 return buffer; 241 } 242 } 243 244 const char *quote_token(const struct token *token) 245 { 246 static char buffer[256]; 247 248 switch (token_type(token)) { 249 case TOKEN_ERROR: 250 return "syntax error"; 251 252 case TOKEN_IDENT: 253 return show_ident(token->ident); 254 255 case TOKEN_NUMBER: 256 return token->number; 257 258 case TOKEN_SPECIAL: 259 return show_special(token->special); 260 261 case TOKEN_CHAR: 262 return quote_char(token->string->data, 263 token->string->length - 1, 0, '\''); 264 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3: 265 return quote_char(token->embedded, 266 token_type(token) - TOKEN_CHAR, 0, '\''); 267 case TOKEN_WIDE_CHAR: 268 return quote_char(token->string->data, 269 token->string->length - 1, 'L', '\''); 270 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3: 271 return quote_char(token->embedded, 272 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\''); 273 case TOKEN_STRING: 274 return quote_char(token->string->data, 275 token->string->length - 1, 0, '"'); 276 case TOKEN_WIDE_STRING: 277 return quote_char(token->string->data, 278 token->string->length - 1, 'L', '"'); 279 default: 280 sprintf(buffer, "unhandled token type '%d' ", token_type(token)); 281 return buffer; 282 } 283 } 284 285 #define HASHED_INPUT_BITS (6) 286 #define HASHED_INPUT (1 << HASHED_INPUT_BITS) 287 #define HASH_PRIME 0x9e370001UL 288 289 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 }; 290 291 int *hash_stream(const char *name) 292 { 293 uint32_t hash = 0; 294 unsigned char c; 295 296 while ((c = *name++) != 0) 297 hash = (hash + (c << 4) + (c >> 4)) * 11; 298 299 hash *= HASH_PRIME; 300 hash >>= 32 - HASHED_INPUT_BITS; 301 return input_stream_hashes + hash; 302 } 303 304 int init_stream(const char *name, int fd, const char **next_path) 305 { 306 int stream = input_stream_nr, *hash; 307 struct stream *current; 308 309 if (stream >= input_streams_allocated) { 310 int newalloc = stream * 4 / 3 + 10; 311 input_streams = realloc(input_streams, newalloc * sizeof(struct stream)); 312 if (!input_streams) 313 die("Unable to allocate more streams space"); 314 input_streams_allocated = newalloc; 315 } 316 current = input_streams + stream; 317 memset(current, 0, sizeof(*current)); 318 current->name = name; 319 current->fd = fd; 320 current->next_path = next_path; 321 current->path = NULL; 322 current->constant = CONSTANT_FILE_MAYBE; 323 input_stream_nr = stream+1; 324 hash = hash_stream(name); 325 current->next_stream = *hash; 326 *hash = stream; 327 return stream; 328 } 329 330 static struct token * alloc_token(stream_t *stream) 331 { 332 struct token *token = __alloc_token(0); 333 token->pos = stream_pos(stream); 334 return token; 335 } 336 337 /* 338 * Argh... That was surprisingly messy - handling '\r' complicates the 339 * things a _lot_. 340 */ 341 static int nextchar_slow(stream_t *stream) 342 { 343 int offset = stream->offset; 344 int size = stream->size; 345 int c; 346 int spliced = 0, had_cr, had_backslash; 347 348 restart: 349 had_cr = had_backslash = 0; 350 351 repeat: 352 if (offset >= size) { 353 if (stream->fd < 0) 354 goto got_eof; 355 size = read(stream->fd, stream->buffer, BUFSIZE); 356 if (size <= 0) 357 goto got_eof; 358 stream->size = size; 359 stream->offset = offset = 0; 360 } 361 362 c = stream->buffer[offset++]; 363 if (had_cr) 364 goto check_lf; 365 366 if (c == '\r') { 367 had_cr = 1; 368 goto repeat; 369 } 370 371 norm: 372 if (!had_backslash) { 373 switch (c) { 374 case '\t': 375 stream->pos += tabstop - stream->pos % tabstop; 376 break; 377 case '\n': 378 stream->line++; 379 stream->pos = 0; 380 stream->newline = 1; 381 break; 382 case '\\': 383 had_backslash = 1; 384 stream->pos++; 385 goto repeat; 386 default: 387 stream->pos++; 388 } 389 } else { 390 if (c == '\n') { 391 stream->line++; 392 stream->pos = 0; 393 spliced = 1; 394 goto restart; 395 } 396 offset--; 397 c = '\\'; 398 } 399 out: 400 stream->offset = offset; 401 402 return c; 403 404 check_lf: 405 if (c != '\n') 406 offset--; 407 c = '\n'; 408 goto norm; 409 410 got_eof: 411 if (had_backslash) { 412 c = '\\'; 413 goto out; 414 } 415 if (stream->pos) 416 warning(stream_pos(stream), "no newline at end of file"); 417 else if (spliced) 418 warning(stream_pos(stream), "backslash-newline at end of file"); 419 return EOF; 420 } 421 422 /* 423 * We want that as light as possible while covering all normal cases. 424 * Slow path (including the logics with line-splicing and EOF sanity 425 * checks) is in nextchar_slow(). 426 */ 427 static inline int nextchar(stream_t *stream) 428 { 429 int offset = stream->offset; 430 431 if (offset < stream->size) { 432 int c = stream->buffer[offset++]; 433 static const char special[256] = { 434 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1 435 }; 436 if (!special[c]) { 437 stream->offset = offset; 438 stream->pos++; 439 return c; 440 } 441 } 442 return nextchar_slow(stream); 443 } 444 445 struct token eof_token_entry; 446 447 static struct token *mark_eof(stream_t *stream) 448 { 449 struct token *end; 450 451 end = alloc_token(stream); 452 token_type(end) = TOKEN_STREAMEND; 453 end->pos.newline = 1; 454 455 eof_token_entry.next = &eof_token_entry; 456 eof_token_entry.pos.newline = 1; 457 458 end->next = &eof_token_entry; 459 *stream->tokenlist = end; 460 stream->tokenlist = NULL; 461 return end; 462 } 463 464 static void add_token(stream_t *stream) 465 { 466 struct token *token = stream->token; 467 468 stream->token = NULL; 469 token->next = NULL; 470 *stream->tokenlist = token; 471 stream->tokenlist = &token->next; 472 } 473 474 static void drop_token(stream_t *stream) 475 { 476 stream->newline |= stream->token->pos.newline; 477 stream->whitespace |= stream->token->pos.whitespace; 478 stream->token = NULL; 479 } 480 481 enum { 482 Letter = 1, 483 Digit = 2, 484 Hex = 4, 485 Exp = 8, 486 Dot = 16, 487 ValidSecond = 32, 488 Quote = 64, 489 }; 490 491 static const long cclass[257] = { 492 ['0' + 1 ... '7' + 1] = Digit | Hex, /* \<octal> */ 493 ['8' + 1 ... '9' + 1] = Digit | Hex, 494 ['A' + 1 ... 'D' + 1] = Letter | Hex, 495 ['E' + 1] = Letter | Hex | Exp, /* E<exp> */ 496 ['F' + 1] = Letter | Hex, 497 ['G' + 1 ... 'O' + 1] = Letter, 498 ['P' + 1] = Letter | Exp, /* P<exp> */ 499 ['Q' + 1 ... 'Z' + 1] = Letter, 500 ['a' + 1 ... 'b' + 1] = Letter | Hex, /* \a, \b */ 501 ['c' + 1 ... 'd' + 1] = Letter | Hex, 502 ['e' + 1] = Letter | Hex | Exp,/* \e, e<exp> */ 503 ['f' + 1] = Letter | Hex, /* \f */ 504 ['g' + 1 ... 'm' + 1] = Letter, 505 ['n' + 1] = Letter, /* \n */ 506 ['o' + 1] = Letter, 507 ['p' + 1] = Letter | Exp, /* p<exp> */ 508 ['q' + 1] = Letter, 509 ['r' + 1] = Letter, /* \r */ 510 ['s' + 1] = Letter, 511 ['t' + 1] = Letter, /* \t */ 512 ['u' + 1] = Letter, 513 ['v' + 1] = Letter, /* \v */ 514 ['w' + 1] = Letter, 515 ['x' + 1] = Letter, /* \x<hex> */ 516 ['y' + 1 ... 'z' + 1] = Letter, 517 ['_' + 1] = Letter, 518 ['.' + 1] = Dot | ValidSecond, 519 ['=' + 1] = ValidSecond, 520 ['+' + 1] = ValidSecond, 521 ['-' + 1] = ValidSecond, 522 ['>' + 1] = ValidSecond, 523 ['<' + 1] = ValidSecond, 524 ['&' + 1] = ValidSecond, 525 ['|' + 1] = ValidSecond, 526 ['#' + 1] = ValidSecond, 527 ['\'' + 1] = Quote, 528 ['"' + 1] = Quote, 529 }; 530 531 /* 532 * pp-number: 533 * digit 534 * . digit 535 * pp-number digit 536 * pp-number identifier-nodigit 537 * pp-number e sign 538 * pp-number E sign 539 * pp-number p sign 540 * pp-number P sign 541 * pp-number . 542 */ 543 static int get_one_number(int c, int next, stream_t *stream) 544 { 545 struct token *token; 546 static char buffer[4095]; 547 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer); 548 int len; 549 550 *p++ = c; 551 for (;;) { 552 long class = cclass[next + 1]; 553 if (!(class & (Dot | Digit | Letter))) 554 break; 555 if (p != buffer_end) 556 *p++ = next; 557 next = nextchar(stream); 558 if (class & Exp) { 559 if (next == '-' || next == '+') { 560 if (p != buffer_end) 561 *p++ = next; 562 next = nextchar(stream); 563 } 564 } 565 } 566 567 if (p == buffer_end) { 568 sparse_error(stream_pos(stream), "number token exceeds %td characters", 569 buffer_end - buffer); 570 // Pretend we saw just "1". 571 buffer[0] = '1'; 572 p = buffer + 1; 573 } 574 575 *p++ = 0; 576 len = p - buffer; 577 buf = __alloc_bytes(len); 578 memcpy(buf, buffer, len); 579 580 token = stream->token; 581 token_type(token) = TOKEN_NUMBER; 582 token->number = buf; 583 add_token(stream); 584 585 return next; 586 } 587 588 static int eat_string(int next, stream_t *stream, enum token_type type) 589 { 590 static char buffer[MAX_STRING]; 591 struct string *string; 592 struct token *token = stream->token; 593 int len = 0; 594 int escape; 595 int want_hex = 0; 596 char delim = type < TOKEN_STRING ? '\'' : '"'; 597 598 for (escape = 0; escape || next != delim; next = nextchar(stream)) { 599 if (len < MAX_STRING) 600 buffer[len] = next; 601 len++; 602 if (next == '\n') { 603 warning(stream_pos(stream), 604 "Newline in string or character constant"); 605 if (delim == '\'') /* assume it's lost ' */ 606 break; 607 } 608 if (next == EOF) { 609 warning(stream_pos(stream), 610 "End of file in middle of string"); 611 return next; 612 } 613 if (!escape) { 614 if (want_hex && !(cclass[next + 1] & Hex)) 615 warning(stream_pos(stream), 616 "\\x used with no following hex digits"); 617 want_hex = 0; 618 escape = next == '\\'; 619 } else { 620 escape = 0; 621 want_hex = next == 'x'; 622 } 623 } 624 if (want_hex) 625 warning(stream_pos(stream), 626 "\\x used with no following hex digits"); 627 if (len > MAX_STRING) { 628 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING); 629 len = MAX_STRING; 630 } 631 if (delim == '\'' && len <= 4) { 632 if (len == 0) { 633 sparse_error(stream_pos(stream), 634 "empty character constant"); 635 return nextchar(stream); 636 } 637 token_type(token) = type + len; 638 memset(buffer + len, '\0', 4 - len); 639 memcpy(token->embedded, buffer, 4); 640 } else { 641 token_type(token) = type; 642 string = __alloc_string(len+1); 643 memcpy(string->data, buffer, len); 644 string->data[len] = '\0'; 645 string->length = len+1; 646 token->string = string; 647 } 648 649 /* Pass it on.. */ 650 token = stream->token; 651 add_token(stream); 652 return nextchar(stream); 653 } 654 655 static int drop_stream_eoln(stream_t *stream) 656 { 657 drop_token(stream); 658 for (;;) { 659 switch (nextchar(stream)) { 660 case EOF: 661 return EOF; 662 case '\n': 663 return nextchar(stream); 664 } 665 } 666 } 667 668 static int drop_stream_comment(stream_t *stream) 669 { 670 int newline; 671 int next; 672 drop_token(stream); 673 newline = stream->newline; 674 675 next = nextchar(stream); 676 for (;;) { 677 int curr = next; 678 if (curr == EOF) { 679 warning(stream_pos(stream), "End of file in the middle of a comment"); 680 return curr; 681 } 682 next = nextchar(stream); 683 if (curr == '*' && next == '/') 684 break; 685 } 686 stream->newline = newline; 687 return nextchar(stream); 688 } 689 690 unsigned char combinations[][4] = COMBINATION_STRINGS; 691 692 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE) 693 694 /* hash function for two-character punctuators - all give unique values */ 695 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31) 696 697 /* 698 * note that we won't get false positives - special_hash(0,0) is 0 and 699 * entry 0 is filled (by +=), so all the missing ones are OK. 700 */ 701 static unsigned char hash_results[32][2] = { 702 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1} 703 RES('+', '='), /* 00 */ 704 RES('/', '='), /* 01 */ 705 RES('^', '='), /* 05 */ 706 RES('&', '&'), /* 07 */ 707 RES('#', '#'), /* 08 */ 708 RES('<', '<'), /* 0a */ 709 RES('<', '='), /* 0c */ 710 RES('!', '='), /* 0e */ 711 RES('%', '='), /* 0f */ 712 RES('-', '-'), /* 10 */ 713 RES('-', '='), /* 11 */ 714 RES('-', '>'), /* 13 */ 715 RES('=', '='), /* 15 */ 716 RES('&', '='), /* 17 */ 717 RES('*', '='), /* 18 */ 718 RES('.', '.'), /* 1a */ 719 RES('+', '+'), /* 1b */ 720 RES('|', '='), /* 1c */ 721 RES('>', '='), /* 1d */ 722 RES('|', '|'), /* 1e */ 723 RES('>', '>') /* 1f */ 724 #undef RES 725 }; 726 static int code[32] = { 727 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value 728 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */ 729 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */ 730 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */ 731 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */ 732 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */ 733 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */ 734 CODE('<', '=', SPECIAL_LTE), /* 0c */ 735 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */ 736 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */ 737 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */ 738 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */ 739 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */ 740 CODE('=', '=', SPECIAL_EQUAL), /* 15 */ 741 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */ 742 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */ 743 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */ 744 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */ 745 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */ 746 CODE('>', '=', SPECIAL_GTE), /* 1d */ 747 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */ 748 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */ 749 #undef CODE 750 }; 751 752 static int get_one_special(int c, stream_t *stream) 753 { 754 struct token *token; 755 int next, value, i; 756 757 next = nextchar(stream); 758 759 /* 760 * Check for numbers, strings, character constants, and comments 761 */ 762 switch (c) { 763 case '.': 764 if (next >= '0' && next <= '9') 765 return get_one_number(c, next, stream); 766 break; 767 case '"': 768 return eat_string(next, stream, TOKEN_STRING); 769 case '\'': 770 return eat_string(next, stream, TOKEN_CHAR); 771 case '/': 772 if (next == '/') 773 return drop_stream_eoln(stream); 774 if (next == '*') 775 return drop_stream_comment(stream); 776 } 777 778 /* 779 * Check for combinations 780 */ 781 value = c; 782 if (cclass[next + 1] & ValidSecond) { 783 i = special_hash(c, next); 784 if (hash_results[i][0] == c && hash_results[i][1] == next) { 785 value = code[i]; 786 next = nextchar(stream); 787 if (value >= SPECIAL_LEFTSHIFT && 788 next == "==."[value - SPECIAL_LEFTSHIFT]) { 789 value += 3; 790 next = nextchar(stream); 791 } 792 } 793 } 794 795 /* Pass it on.. */ 796 token = stream->token; 797 token_type(token) = TOKEN_SPECIAL; 798 token->special = value; 799 add_token(stream); 800 return next; 801 } 802 803 #define IDENT_HASH_BITS (13) 804 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS) 805 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1) 806 807 #define ident_hash_init(c) (c) 808 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c)) 809 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK) 810 811 static struct ident *hash_table[IDENT_HASH_SIZE]; 812 static int ident_hit, ident_miss, idents; 813 814 void show_identifier_stats(void) 815 { 816 int i; 817 int distribution[100]; 818 819 fprintf(stderr, "identifiers: %d hits, %d misses\n", 820 ident_hit, ident_miss); 821 822 for (i = 0; i < 100; i++) 823 distribution[i] = 0; 824 825 for (i = 0; i < IDENT_HASH_SIZE; i++) { 826 struct ident * ident = hash_table[i]; 827 int count = 0; 828 829 while (ident) { 830 count++; 831 ident = ident->next; 832 } 833 if (count > 99) 834 count = 99; 835 distribution[count]++; 836 } 837 838 for (i = 0; i < 100; i++) { 839 if (distribution[i]) 840 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]); 841 } 842 } 843 844 static struct ident *alloc_ident(const char *name, int len) 845 { 846 struct ident *ident = __alloc_ident(len); 847 ident->symbols = NULL; 848 ident->len = len; 849 ident->tainted = 0; 850 memcpy(ident->name, name, len); 851 return ident; 852 } 853 854 static struct ident * insert_hash(struct ident *ident, unsigned long hash) 855 { 856 ident->next = hash_table[hash]; 857 hash_table[hash] = ident; 858 ident_miss++; 859 return ident; 860 } 861 862 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash) 863 { 864 struct ident *ident; 865 struct ident **p; 866 867 p = &hash_table[hash]; 868 while ((ident = *p) != NULL) { 869 if (ident->len == (unsigned char) len) { 870 if (strncmp(name, ident->name, len) != 0) 871 goto next; 872 873 ident_hit++; 874 return ident; 875 } 876 next: 877 //misses++; 878 p = &ident->next; 879 } 880 ident = alloc_ident(name, len); 881 *p = ident; 882 ident->next = NULL; 883 ident_miss++; 884 idents++; 885 return ident; 886 } 887 888 static unsigned long hash_name(const char *name, int len) 889 { 890 unsigned long hash; 891 const unsigned char *p = (const unsigned char *)name; 892 893 hash = ident_hash_init(*p++); 894 while (--len) { 895 unsigned int i = *p++; 896 hash = ident_hash_add(hash, i); 897 } 898 return ident_hash_end(hash); 899 } 900 901 struct ident *hash_ident(struct ident *ident) 902 { 903 return insert_hash(ident, hash_name(ident->name, ident->len)); 904 } 905 906 struct ident *built_in_ident(const char *name) 907 { 908 int len = strlen(name); 909 return create_hashed_ident(name, len, hash_name(name, len)); 910 } 911 912 struct token *built_in_token(int stream, struct ident *ident) 913 { 914 struct token *token; 915 916 token = __alloc_token(0); 917 token->pos.stream = stream; 918 token_type(token) = TOKEN_IDENT; 919 token->ident = ident; 920 return token; 921 } 922 923 static int get_one_identifier(int c, stream_t *stream) 924 { 925 struct token *token; 926 struct ident *ident; 927 unsigned long hash; 928 char buf[256]; 929 int len = 1; 930 int next; 931 932 hash = ident_hash_init(c); 933 buf[0] = c; 934 for (;;) { 935 next = nextchar(stream); 936 if (!(cclass[next + 1] & (Letter | Digit))) 937 break; 938 if (len >= sizeof(buf)) 939 break; 940 hash = ident_hash_add(hash, next); 941 buf[len] = next; 942 len++; 943 }; 944 if (cclass[next + 1] & Quote) { 945 if (len == 1 && buf[0] == 'L') { 946 if (next == '\'') 947 return eat_string(nextchar(stream), stream, 948 TOKEN_WIDE_CHAR); 949 else 950 return eat_string(nextchar(stream), stream, 951 TOKEN_WIDE_STRING); 952 } 953 } 954 hash = ident_hash_end(hash); 955 ident = create_hashed_ident(buf, len, hash); 956 957 /* Pass it on.. */ 958 token = stream->token; 959 token_type(token) = TOKEN_IDENT; 960 token->ident = ident; 961 add_token(stream); 962 return next; 963 } 964 965 static int get_one_token(int c, stream_t *stream) 966 { 967 long class = cclass[c + 1]; 968 if (class & Digit) 969 return get_one_number(c, nextchar(stream), stream); 970 if (class & Letter) 971 return get_one_identifier(c, stream); 972 return get_one_special(c, stream); 973 } 974 975 static struct token *setup_stream(stream_t *stream, int idx, int fd, 976 unsigned char *buf, unsigned int buf_size) 977 { 978 struct token *begin; 979 980 stream->nr = idx; 981 stream->line = 1; 982 stream->newline = 1; 983 stream->whitespace = 0; 984 stream->pos = 0; 985 986 stream->token = NULL; 987 stream->fd = fd; 988 stream->offset = 0; 989 stream->size = buf_size; 990 stream->buffer = buf; 991 992 begin = alloc_token(stream); 993 token_type(begin) = TOKEN_STREAMBEGIN; 994 stream->tokenlist = &begin->next; 995 return begin; 996 } 997 998 static struct token *tokenize_stream(stream_t *stream) 999 { 1000 int c = nextchar(stream); 1001 while (c != EOF) { 1002 if (!isspace(c)) { 1003 struct token *token = alloc_token(stream); 1004 stream->token = token; 1005 stream->newline = 0; 1006 stream->whitespace = 0; 1007 c = get_one_token(c, stream); 1008 continue; 1009 } 1010 stream->whitespace = 1; 1011 c = nextchar(stream); 1012 } 1013 return mark_eof(stream); 1014 } 1015 1016 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken) 1017 { 1018 stream_t stream; 1019 struct token *begin; 1020 1021 begin = setup_stream(&stream, 0, -1, buffer, size); 1022 *endtoken = tokenize_stream(&stream); 1023 return begin; 1024 } 1025 1026 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path) 1027 { 1028 struct token *begin, *end; 1029 stream_t stream; 1030 unsigned char buffer[BUFSIZE]; 1031 int idx; 1032 1033 idx = init_stream(name, fd, next_path); 1034 if (idx < 0) { 1035 // info(endtoken->pos, "File %s is const", name); 1036 return endtoken; 1037 } 1038 1039 begin = setup_stream(&stream, idx, fd, buffer, 0); 1040 end = tokenize_stream(&stream); 1041 if (endtoken) 1042 end->next = endtoken; 1043 return begin; 1044 }