Print this page
12724 update smatch to 0.6.1-rc1-il-5
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/tools/smatch/src/tokenize.c
+++ new/usr/src/tools/smatch/src/tokenize.c
1 1 /*
2 2 * This is a really stupid C tokenizer. It doesn't do any include
3 3 * files or anything complex at all. That's the preprocessor.
4 4 *
5 5 * Copyright (C) 2003 Transmeta Corp.
6 6 * 2003 Linus Torvalds
7 7 *
8 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 9 * of this software and associated documentation files (the "Software"), to deal
10 10 * in the Software without restriction, including without limitation the rights
11 11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 12 * copies of the Software, and to permit persons to whom the Software is
13 13 * furnished to do so, subject to the following conditions:
14 14 *
15 15 * The above copyright notice and this permission notice shall be included in
16 16 * all copies or substantial portions of the Software.
17 17 *
18 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 24 * THE SOFTWARE.
25 25 */
26 26 #include <stdio.h>
27 27 #include <stdlib.h>
28 28 #include <stdarg.h>
29 29 #include <stddef.h>
30 30 #include <string.h>
31 31 #include <ctype.h>
32 32 #include <unistd.h>
33 33 #include <stdint.h>
34 34
35 35 #include "lib.h"
36 36 #include "allocate.h"
37 37 #include "token.h"
38 38 #include "symbol.h"
39 39
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of input_streams[] slots handed out so far by init_stream(). */
int input_stream_nr = 0;
/* Table of registered streams; grown on demand by init_stream(). */
struct stream *input_streams;
/* Number of slots the input_streams allocation can currently hold. */
static int input_streams_allocated;
/* Column granularity used when a '\t' advances the position. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports a fixed placeholder line number. */
int no_lineno = 0;

/* Amount of data requested per read() when refilling a stream buffer. */
#define BUFSIZE (8192)

/*
 * Scanner state for one input while it is being tokenized.
 */
typedef struct {
	int fd;				/* descriptor to refill from; negative: no refill */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* column of the current character */
	int line;			/* line of the current character */
	int nr;				/* stream number stored into positions */
	int newline;			/* newline flag for the next position */
	int whitespace;			/* whitespace flag for the next position */
	struct token **tokenlist;	/* link to store the next finished token in */
	struct token *token;		/* token currently being built, or NULL */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
58 58
59 59 const char *stream_name(int stream)
60 60 {
61 61 if (stream < 0 || stream > input_stream_nr)
62 62 return "<bad stream>";
63 63 return input_streams[stream].name;
64 64 }
65 65
66 66 static struct position stream_pos(stream_t *stream)
67 67 {
68 68 struct position pos;
69 69 pos.type = 0;
70 70 pos.stream = stream->nr;
71 71 pos.newline = stream->newline;
72 72 pos.whitespace = stream->whitespace;
73 73 pos.pos = stream->pos;
74 74
75 75 pos.line = stream->line;
76 76 if (no_lineno)
77 77 pos.line = 123456;
78 78
79 79 pos.noexpand = 0;
80 80 return pos;
81 81 }
82 82
83 83 const char *show_special(int val)
84 84 {
85 85 static char buffer[4];
86 86
87 87 buffer[0] = val;
88 88 buffer[1] = 0;
89 89 if (val >= SPECIAL_BASE)
90 90 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
91 91 return buffer;
92 92 }
93 93
94 94 const char *show_ident(const struct ident *ident)
95 95 {
96 96 static char buff[4][256];
97 97 static int n;
98 98 char *buffer;
99 99
100 100 if (!ident)
101 101 return "<noident>";
102 102 buffer = buff[3 & ++n];
103 103 sprintf(buffer, "%.*s", ident->len, ident->name);
104 104 return buffer;
105 105 }
106 106
/*
 * Append the source-level spelling of byte @c at @ptr and return the new
 * end of the output.
 *
 * Printable bytes are copied, preceded by a backslash when they equal
 * @escape or are a backslash themselves.  Newline and tab become \n and
 * \t.  Anything else becomes an octal escape, padded to three digits
 * when the following byte @next is a digit.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	if (c == '\\' || c == escape)
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
129 129
130 130 const char *show_string(const struct string *string)
131 131 {
132 132 static char buffer[4 * MAX_STRING + 3];
133 133 char *ptr;
134 134 int i;
135 135
136 136 if (!string || !string->length)
137 137 return "<bad_string>";
138 138 ptr = buffer;
139 139 *ptr++ = '"';
140 140 for (i = 0; i < string->length-1; i++) {
141 141 const char *p = string->data + i;
142 142 ptr = charstr(ptr, p[0], '"', p[1]);
143 143 }
144 144 *ptr++ = '"';
145 145 *ptr = '\0';
146 146 return buffer;
147 147 }
148 148
149 149 static const char *show_char(const char *s, size_t len, char prefix, char delim)
150 150 {
151 151 static char buffer[MAX_STRING + 4];
152 152 char *p = buffer;
153 153 if (prefix)
154 154 *p++ = prefix;
155 155 *p++ = delim;
156 156 memcpy(p, s, len);
157 157 p += len;
158 158 *p++ = delim;
159 159 *p++ = '\0';
160 160 return buffer;
161 161 }
162 162
163 163 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
164 164 {
165 165 static char buffer[2*MAX_STRING + 6];
166 166 size_t i;
167 167 char *p = buffer;
168 168 if (prefix)
169 169 *p++ = prefix;
170 170 if (delim == '"')
171 171 *p++ = '\\';
172 172 *p++ = delim;
173 173 for (i = 0; i < len; i++) {
174 174 if (s[i] == '"' || s[i] == '\\')
175 175 *p++ = '\\';
176 176 *p++ = s[i];
177 177 }
178 178 if (delim == '"')
179 179 *p++ = '\\';
180 180 *p++ = delim;
181 181 *p++ = '\0';
182 182 return buffer;
183 183 }
184 184
185 185 const char *show_token(const struct token *token)
186 186 {
187 187 static char buffer[256];
188 188
189 189 if (!token)
190 190 return "<no token>";
191 191 switch (token_type(token)) {
192 192 case TOKEN_ERROR:
193 193 return "syntax error";
194 194
195 195 case TOKEN_EOF:
196 196 return "end-of-input";
197 197
198 198 case TOKEN_IDENT:
199 199 return show_ident(token->ident);
200 200
201 201 case TOKEN_NUMBER:
202 202 return token->number;
203 203
204 204 case TOKEN_SPECIAL:
205 205 return show_special(token->special);
206 206
207 207 case TOKEN_CHAR:
208 208 return show_char(token->string->data,
209 209 token->string->length - 1, 0, '\'');
210 210 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
211 211 return show_char(token->embedded,
212 212 token_type(token) - TOKEN_CHAR, 0, '\'');
213 213 case TOKEN_WIDE_CHAR:
214 214 return show_char(token->string->data,
215 215 token->string->length - 1, 'L', '\'');
216 216 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
217 217 return show_char(token->embedded,
218 218 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
219 219 case TOKEN_STRING:
220 220 return show_char(token->string->data,
221 221 token->string->length - 1, 0, '"');
222 222 case TOKEN_WIDE_STRING:
223 223 return show_char(token->string->data,
224 224 token->string->length - 1, 'L', '"');
225 225
226 226 case TOKEN_STREAMBEGIN:
227 227 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
228 228 return buffer;
229 229
230 230 case TOKEN_STREAMEND:
231 231 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
232 232 return buffer;
233 233
234 234 case TOKEN_UNTAINT:
235 235 sprintf(buffer, "<untaint>");
236 236 return buffer;
237 237
238 238 case TOKEN_ARG_COUNT:
239 239 sprintf(buffer, "<argcnt>");
240 240 return buffer;
241 241
242 242 default:
243 243 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
244 244 return buffer;
245 245 }
246 246 }
247 247
248 248 const char *quote_token(const struct token *token)
249 249 {
250 250 static char buffer[256];
251 251
252 252 switch (token_type(token)) {
253 253 case TOKEN_ERROR:
254 254 return "syntax error";
255 255
256 256 case TOKEN_IDENT:
257 257 return show_ident(token->ident);
258 258
259 259 case TOKEN_NUMBER:
260 260 return token->number;
261 261
262 262 case TOKEN_SPECIAL:
263 263 return show_special(token->special);
264 264
265 265 case TOKEN_CHAR:
266 266 return quote_char(token->string->data,
267 267 token->string->length - 1, 0, '\'');
268 268 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
269 269 return quote_char(token->embedded,
270 270 token_type(token) - TOKEN_CHAR, 0, '\'');
271 271 case TOKEN_WIDE_CHAR:
272 272 return quote_char(token->string->data,
273 273 token->string->length - 1, 'L', '\'');
274 274 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
275 275 return quote_char(token->embedded,
276 276 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
277 277 case TOKEN_STRING:
278 278 return quote_char(token->string->data,
279 279 token->string->length - 1, 0, '"');
280 280 case TOKEN_WIDE_STRING:
281 281 return quote_char(token->string->data,
282 282 token->string->length - 1, 'L', '"');
283 283 default:
284 284 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
285 285 return buffer;
286 286 }
287 287 }
288 288
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One chain head per bucket; -1 marks an empty bucket. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Map the stream name @name to its bucket and return a pointer to that
 * bucket's chain head in input_stream_hashes[].
 */
int *hash_stream(const char *name)
{
	const unsigned char *p = (const unsigned char *)name;
	uint32_t hash = 0;

	for (; *p != 0; p++) {
		unsigned char c = *p;

		hash += (c << 4) + (c >> 4);
		hash *= 11;
	}

	hash *= HASH_PRIME;
	/* keep the top HASHED_INPUT_BITS bits as the bucket index */
	return &input_stream_hashes[hash >> (32 - HASHED_INPUT_BITS)];
}
307 307
308 308 int init_stream(const char *name, int fd, const char **next_path)
309 309 {
310 310 int stream = input_stream_nr, *hash;
311 311 struct stream *current;
312 312
313 313 if (stream >= input_streams_allocated) {
314 314 int newalloc = stream * 4 / 3 + 10;
315 315 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
316 316 if (!input_streams)
317 317 die("Unable to allocate more streams space");
318 318 input_streams_allocated = newalloc;
319 319 }
320 320 current = input_streams + stream;
321 321 memset(current, 0, sizeof(*current));
322 322 current->name = name;
323 323 current->fd = fd;
324 324 current->next_path = next_path;
325 325 current->path = NULL;
326 326 current->constant = CONSTANT_FILE_MAYBE;
327 327 input_stream_nr = stream+1;
328 328 hash = hash_stream(name);
329 329 current->next_stream = *hash;
330 330 *hash = stream;
331 331 return stream;
332 332 }
333 333
334 334 static struct token * alloc_token(stream_t *stream)
335 335 {
336 336 struct token *token = __alloc_token(0);
337 337 token->pos = stream_pos(stream);
338 338 return token;
339 339 }
340 340
341 341 /*
342 342 * Argh... That was surprisingly messy - handling '\r' complicates the
343 343 * things a _lot_.
344 344 */
345 345 static int nextchar_slow(stream_t *stream)
346 346 {
347 347 int offset = stream->offset;
348 348 int size = stream->size;
349 349 int c;
350 350 int spliced = 0, had_cr, had_backslash;
351 351
352 352 restart:
353 353 had_cr = had_backslash = 0;
354 354
355 355 repeat:
356 356 if (offset >= size) {
357 357 if (stream->fd < 0)
358 358 goto got_eof;
359 359 size = read(stream->fd, stream->buffer, BUFSIZE);
360 360 if (size <= 0)
361 361 goto got_eof;
362 362 stream->size = size;
363 363 stream->offset = offset = 0;
364 364 }
365 365
366 366 c = stream->buffer[offset++];
367 367 if (had_cr)
368 368 goto check_lf;
369 369
370 370 if (c == '\r') {
371 371 had_cr = 1;
372 372 goto repeat;
373 373 }
374 374
375 375 norm:
376 376 if (!had_backslash) {
377 377 switch (c) {
378 378 case '\t':
379 379 stream->pos += tabstop - stream->pos % tabstop;
380 380 break;
381 381 case '\n':
382 382 stream->line++;
383 383 stream->pos = 0;
384 384 stream->newline = 1;
385 385 break;
386 386 case '\\':
387 387 had_backslash = 1;
388 388 stream->pos++;
389 389 goto repeat;
390 390 default:
391 391 stream->pos++;
392 392 }
393 393 } else {
394 394 if (c == '\n') {
395 395 stream->line++;
396 396 stream->pos = 0;
397 397 spliced = 1;
398 398 goto restart;
399 399 }
400 400 offset--;
401 401 c = '\\';
402 402 }
403 403 out:
404 404 stream->offset = offset;
405 405
406 406 return c;
407 407
408 408 check_lf:
409 409 if (c != '\n')
410 410 offset--;
411 411 c = '\n';
412 412 goto norm;
413 413
414 414 got_eof:
415 415 if (had_backslash) {
416 416 c = '\\';
417 417 goto out;
418 418 }
419 419 if (stream->pos)
420 420 warning(stream_pos(stream), "no newline at end of file");
421 421 else if (spliced)
422 422 warning(stream_pos(stream), "backslash-newline at end of file");
423 423 return EOF;
424 424 }
425 425
426 426 /*
427 427 * We want that as light as possible while covering all normal cases.
428 428 * Slow path (including the logics with line-splicing and EOF sanity
429 429 * checks) is in nextchar_slow().
430 430 */
431 431 static inline int nextchar(stream_t *stream)
432 432 {
433 433 int offset = stream->offset;
434 434
435 435 if (offset < stream->size) {
436 436 int c = stream->buffer[offset++];
437 437 static const char special[256] = {
438 438 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
439 439 };
440 440 if (!special[c]) {
441 441 stream->offset = offset;
442 442 stream->pos++;
443 443 return c;
444 444 }
445 445 }
446 446 return nextchar_slow(stream);
447 447 }
448 448
449 449 struct token eof_token_entry;
450 450
451 451 static struct token *mark_eof(stream_t *stream)
452 452 {
453 453 struct token *end;
454 454
455 455 end = alloc_token(stream);
456 456 eof_token_entry.pos = end->pos;
457 457 token_type(end) = TOKEN_STREAMEND;
458 458 end->pos.newline = 1;
459 459
460 460 eof_token_entry.next = &eof_token_entry;
461 461 eof_token_entry.pos.newline = 1;
462 462
463 463 end->next = &eof_token_entry;
464 464 *stream->tokenlist = end;
465 465 stream->tokenlist = NULL;
466 466 return end;
467 467 }
468 468
469 469 static void add_token(stream_t *stream)
470 470 {
471 471 struct token *token = stream->token;
472 472
473 473 stream->token = NULL;
474 474 token->next = NULL;
475 475 *stream->tokenlist = token;
476 476 stream->tokenlist = &token->next;
477 477 }
478 478
479 479 static void drop_token(stream_t *stream)
480 480 {
481 481 stream->newline |= stream->token->pos.newline;
482 482 stream->whitespace |= stream->token->pos.whitespace;
483 483 stream->token = NULL;
484 484 }
485 485
/* Character class bits stored in cclass[]. */
enum {
	Letter      = 1 << 0,
	Digit       = 1 << 1,
	Hex         = 1 << 2,
	Exp         = 1 << 3,
	Dot         = 1 << 4,
	ValidSecond = 1 << 5,
	Quote       = 1 << 6,
};

/*
 * Class bits for every input character.  The table is indexed with
 * (character + 1) so that EOF (-1) lands on entry 0, which has no bits.
 */
static const char cclass[257] = {
	/* decimal digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* upper-case letters */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,	/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,

	/* lower-case letters */
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,	/* e<exp> */
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,	/* p<exp> */
	['q' + 1 ... 'z' + 1] = Letter,

	['_' + 1] = Letter,

	/* characters that may be the second half of a punctuator */
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,

	/* literal delimiters */
	['\'' + 1] = Quote,
	['"' + 1] = Quote,
};
523 523
524 524 /*
525 525 * pp-number:
526 526 * digit
527 527 * . digit
528 528 * pp-number digit
529 529 * pp-number identifier-nodigit
530 530 * pp-number e sign
531 531 * pp-number E sign
532 532 * pp-number p sign
533 533 * pp-number P sign
534 534 * pp-number .
535 535 */
536 536 static int get_one_number(int c, int next, stream_t *stream)
537 537 {
538 538 struct token *token;
539 539 static char buffer[4095];
540 540 char *p = buffer, *buffer_end = buffer + sizeof (buffer);
541 541
542 542 *p++ = c;
543 543 for (;;) {
544 544 long class = cclass[next + 1];
545 545 if (!(class & (Dot | Digit | Letter)))
546 546 break;
547 547 if (p != buffer_end)
548 548 *p++ = next;
549 549 next = nextchar(stream);
550 550 if (class & Exp) {
551 551 if (next == '-' || next == '+') {
552 552 if (p != buffer_end)
553 553 *p++ = next;
554 554 next = nextchar(stream);
555 555 }
556 556 }
557 557 }
558 558
559 559 if (p == buffer_end) {
560 560 sparse_error(stream_pos(stream), "number token exceeds %td characters",
561 561 buffer_end - buffer);
562 562 // Pretend we saw just "1".
563 563 buffer[0] = '1';
564 564 p = buffer + 1;
565 565 }
566 566
567 567 *p++ = 0;
568 568 token = stream->token;
569 569 token_type(token) = TOKEN_NUMBER;
570 570 token->number = xmemdup(buffer, p - buffer);
571 571 add_token(stream);
572 572
573 573 return next;
574 574 }
575 575
576 576 static int eat_string(int next, stream_t *stream, enum token_type type)
577 577 {
578 578 static char buffer[MAX_STRING];
579 579 struct string *string;
580 580 struct token *token = stream->token;
581 581 int len = 0;
582 582 int escape;
583 583 int want_hex = 0;
584 584 char delim = type < TOKEN_STRING ? '\'' : '"';
585 585
586 586 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
587 587 if (len < MAX_STRING)
588 588 buffer[len] = next;
589 589 len++;
590 590 if (next == '\n') {
591 591 warning(stream_pos(stream),
592 592 "missing terminating %c character", delim);
593 593 /* assume delimiter is lost */
594 594 break;
595 595 }
596 596 if (next == EOF) {
597 597 warning(stream_pos(stream),
598 598 "End of file in middle of string");
599 599 return next;
600 600 }
601 601 if (!escape) {
602 602 if (want_hex && !(cclass[next + 1] & Hex))
603 603 warning(stream_pos(stream),
604 604 "\\x used with no following hex digits");
605 605 want_hex = 0;
606 606 escape = next == '\\';
607 607 } else {
608 608 escape = 0;
609 609 want_hex = next == 'x';
610 610 }
611 611 }
612 612 if (want_hex)
613 613 warning(stream_pos(stream),
614 614 "\\x used with no following hex digits");
615 615 if (len > MAX_STRING) {
616 616 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
617 617 len = MAX_STRING;
618 618 }
619 619 if (delim == '\'' && len <= 4) {
620 620 if (len == 0) {
621 621 sparse_error(stream_pos(stream),
622 622 "empty character constant");
623 623 return nextchar(stream);
624 624 }
625 625 token_type(token) = type + len;
626 626 memset(buffer + len, '\0', 4 - len);
627 627 memcpy(token->embedded, buffer, 4);
628 628 } else {
629 629 token_type(token) = type;
630 630 string = __alloc_string(len+1);
631 631 memcpy(string->data, buffer, len);
632 632 string->data[len] = '\0';
633 633 string->length = len+1;
634 634 token->string = string;
635 635 }
636 636
637 637 /* Pass it on.. */
638 638 token = stream->token;
639 639 add_token(stream);
640 640 return nextchar(stream);
641 641 }
642 642
643 643 static int drop_stream_eoln(stream_t *stream)
644 644 {
645 645 drop_token(stream);
646 646 for (;;) {
647 647 switch (nextchar(stream)) {
648 648 case EOF:
649 649 return EOF;
650 650 case '\n':
651 651 return nextchar(stream);
652 652 }
653 653 }
654 654 }
655 655
656 656 static int drop_stream_comment(stream_t *stream)
657 657 {
658 658 int newline;
659 659 int next;
660 660 drop_token(stream);
661 661 newline = stream->newline;
662 662
663 663 next = nextchar(stream);
664 664 for (;;) {
665 665 int curr = next;
666 666 if (curr == EOF) {
667 667 warning(stream_pos(stream), "End of file in the middle of a comment");
668 668 return curr;
669 669 }
670 670 next = nextchar(stream);
671 671 if (curr == '*' && next == '/')
672 672 break;
673 673 }
674 674 stream->newline = newline;
675 675 return nextchar(stream);
676 676 }
677 677
678 678 unsigned char combinations[][4] = COMBINATION_STRINGS;
679 679
680 680 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
681 681
682 682 /* hash function for two-character punctuators - all give unique values */
683 683 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
684 684
685 685 /*
686 686 * note that we won't get false positives - special_hash(0,0) is 0 and
687 687 * entry 0 is filled (by +=), so all the missing ones are OK.
688 688 */
689 689 static unsigned char hash_results[32][2] = {
690 690 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
691 691 RES('+', '='), /* 00 */
692 692 RES('/', '='), /* 01 */
693 693 RES('^', '='), /* 05 */
694 694 RES('&', '&'), /* 07 */
695 695 RES('#', '#'), /* 08 */
696 696 RES('<', '<'), /* 0a */
697 697 RES('<', '='), /* 0c */
698 698 RES('!', '='), /* 0e */
699 699 RES('%', '='), /* 0f */
700 700 RES('-', '-'), /* 10 */
701 701 RES('-', '='), /* 11 */
702 702 RES('-', '>'), /* 13 */
703 703 RES('=', '='), /* 15 */
704 704 RES('&', '='), /* 17 */
705 705 RES('*', '='), /* 18 */
706 706 RES('.', '.'), /* 1a */
707 707 RES('+', '+'), /* 1b */
708 708 RES('|', '='), /* 1c */
709 709 RES('>', '='), /* 1d */
710 710 RES('|', '|'), /* 1e */
711 711 RES('>', '>') /* 1f */
712 712 #undef RES
713 713 };
714 714 static int code[32] = {
715 715 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
716 716 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
717 717 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
718 718 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
719 719 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
720 720 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
721 721 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
722 722 CODE('<', '=', SPECIAL_LTE), /* 0c */
723 723 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
724 724 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
725 725 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
726 726 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
727 727 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
728 728 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
729 729 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
730 730 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
731 731 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
732 732 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
733 733 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
734 734 CODE('>', '=', SPECIAL_GTE), /* 1d */
735 735 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
736 736 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
737 737 #undef CODE
738 738 };
739 739
740 740 static int get_one_special(int c, stream_t *stream)
741 741 {
742 742 struct token *token;
743 743 int next, value, i;
744 744
745 745 next = nextchar(stream);
746 746
747 747 /*
748 748 * Check for numbers, strings, character constants, and comments
749 749 */
750 750 switch (c) {
751 751 case '.':
752 752 if (next >= '0' && next <= '9')
753 753 return get_one_number(c, next, stream);
754 754 break;
755 755 case '"':
756 756 return eat_string(next, stream, TOKEN_STRING);
757 757 case '\'':
758 758 return eat_string(next, stream, TOKEN_CHAR);
759 759 case '/':
760 760 if (next == '/')
761 761 return drop_stream_eoln(stream);
762 762 if (next == '*')
763 763 return drop_stream_comment(stream);
764 764 }
765 765
766 766 /*
767 767 * Check for combinations
768 768 */
769 769 value = c;
770 770 if (cclass[next + 1] & ValidSecond) {
771 771 i = special_hash(c, next);
772 772 if (hash_results[i][0] == c && hash_results[i][1] == next) {
773 773 value = code[i];
774 774 next = nextchar(stream);
775 775 if (value >= SPECIAL_LEFTSHIFT &&
776 776 next == "==."[value - SPECIAL_LEFTSHIFT]) {
777 777 value += 3;
778 778 next = nextchar(stream);
779 779 }
780 780 }
781 781 }
782 782
783 783 /* Pass it on.. */
784 784 token = stream->token;
785 785 token_type(token) = TOKEN_SPECIAL;
786 786 token->special = value;
787 787 add_token(stream);
788 788 return next;
789 789 }
790 790
791 791 #define IDENT_HASH_BITS (13)
792 792 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
793 793 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
794 794
795 795 #define ident_hash_init(c) (c)
796 796 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
797 797 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
798 798
799 799 static struct ident *hash_table[IDENT_HASH_SIZE];
800 800 static int ident_hit, ident_miss, idents;
801 801
802 802 void show_identifier_stats(void)
803 803 {
804 804 int i;
805 805 int distribution[100];
806 806
807 807 fprintf(stderr, "identifiers: %d hits, %d misses\n",
808 808 ident_hit, ident_miss);
809 809
810 810 for (i = 0; i < 100; i++)
811 811 distribution[i] = 0;
812 812
813 813 for (i = 0; i < IDENT_HASH_SIZE; i++) {
814 814 struct ident * ident = hash_table[i];
815 815 int count = 0;
816 816
817 817 while (ident) {
818 818 count++;
819 819 ident = ident->next;
820 820 }
821 821 if (count > 99)
↓ open down ↓ |
821 lines elided |
↑ open up ↑ |
822 822 count = 99;
823 823 distribution[count]++;
824 824 }
825 825
826 826 for (i = 0; i < 100; i++) {
827 827 if (distribution[i])
828 828 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
829 829 }
830 830 }
831 831
832 -static struct ident *alloc_ident(const char *name, int len)
832 +struct ident *alloc_ident(const char *name, int len)
833 833 {
834 834 struct ident *ident = __alloc_ident(len);
835 835 ident->symbols = NULL;
836 836 ident->len = len;
837 837 ident->tainted = 0;
838 838 memcpy(ident->name, name, len);
839 839 return ident;
840 840 }
841 841
842 842 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
843 843 {
844 844 ident->next = hash_table[hash];
845 845 hash_table[hash] = ident;
846 846 ident_miss++;
847 847 return ident;
848 848 }
849 849
850 850 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
851 851 {
852 852 struct ident *ident;
853 853 struct ident **p;
854 854
855 855 p = &hash_table[hash];
856 856 while ((ident = *p) != NULL) {
857 857 if (ident->len == (unsigned char) len) {
858 858 if (strncmp(name, ident->name, len) != 0)
859 859 goto next;
860 860
861 861 ident_hit++;
862 862 return ident;
863 863 }
864 864 next:
865 865 //misses++;
866 866 p = &ident->next;
867 867 }
868 868 ident = alloc_ident(name, len);
869 869 *p = ident;
870 870 ident->next = NULL;
871 871 ident_miss++;
872 872 idents++;
873 873 return ident;
874 874 }
875 875
/*
 * Compute the hash-table bucket for the first @len bytes of @name,
 * using the same ident_hash_*() steps as get_one_identifier().
 *
 * An empty name (@len <= 0) hashes to the bucket of a zero seed.  The
 * previous "while (--len)" loop did not stop at zero for such input and
 * walked off the end of @name.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash = 0;

	if (len > 0) {
		hash = ident_hash_init(*p++);
		while (--len > 0) {
			unsigned int i = *p++;
			hash = ident_hash_add(hash, i);
		}
	}
	return ident_hash_end(hash);
}
888 888
889 889 struct ident *hash_ident(struct ident *ident)
890 890 {
891 891 return insert_hash(ident, hash_name(ident->name, ident->len));
892 892 }
893 893
/*
 * Return the identifier for the NUL-terminated string @name, creating
 * and hashing it if it does not exist yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
899 899
900 900 struct token *built_in_token(int stream, struct ident *ident)
901 901 {
902 902 struct token *token;
903 903
904 904 token = __alloc_token(0);
905 905 token->pos.stream = stream;
906 906 token_type(token) = TOKEN_IDENT;
907 907 token->ident = ident;
908 908 return token;
909 909 }
910 910
911 911 static int get_one_identifier(int c, stream_t *stream)
912 912 {
913 913 struct token *token;
914 914 struct ident *ident;
915 915 unsigned long hash;
916 916 char buf[256];
917 917 int len = 1;
918 918 int next;
919 919
920 920 hash = ident_hash_init(c);
921 921 buf[0] = c;
922 922 for (;;) {
923 923 next = nextchar(stream);
924 924 if (!(cclass[next + 1] & (Letter | Digit)))
925 925 break;
926 926 if (len >= sizeof(buf))
927 927 break;
928 928 hash = ident_hash_add(hash, next);
929 929 buf[len] = next;
930 930 len++;
931 931 };
932 932 if (cclass[next + 1] & Quote) {
933 933 if (len == 1 && buf[0] == 'L') {
934 934 if (next == '\'')
935 935 return eat_string(nextchar(stream), stream,
936 936 TOKEN_WIDE_CHAR);
937 937 else
938 938 return eat_string(nextchar(stream), stream,
939 939 TOKEN_WIDE_STRING);
940 940 }
941 941 }
942 942 hash = ident_hash_end(hash);
943 943 ident = create_hashed_ident(buf, len, hash);
944 944
945 945 /* Pass it on.. */
946 946 token = stream->token;
947 947 token_type(token) = TOKEN_IDENT;
948 948 token->ident = ident;
949 949 add_token(stream);
950 950 return next;
951 951 }
952 952
953 953 static int get_one_token(int c, stream_t *stream)
954 954 {
955 955 long class = cclass[c + 1];
956 956 if (class & Digit)
957 957 return get_one_number(c, nextchar(stream), stream);
958 958 if (class & Letter)
959 959 return get_one_identifier(c, stream);
960 960 return get_one_special(c, stream);
961 961 }
962 962
963 963 static struct token *setup_stream(stream_t *stream, int idx, int fd,
964 964 unsigned char *buf, unsigned int buf_size)
965 965 {
966 966 struct token *begin;
967 967
968 968 stream->nr = idx;
969 969 stream->line = 1;
970 970 stream->newline = 1;
971 971 stream->whitespace = 0;
972 972 stream->pos = 0;
973 973
974 974 stream->token = NULL;
975 975 stream->fd = fd;
976 976 stream->offset = 0;
977 977 stream->size = buf_size;
978 978 stream->buffer = buf;
979 979
980 980 begin = alloc_token(stream);
981 981 token_type(begin) = TOKEN_STREAMBEGIN;
982 982 stream->tokenlist = &begin->next;
983 983 return begin;
984 984 }
985 985
986 986 static struct token *tokenize_stream(stream_t *stream)
987 987 {
988 988 int c = nextchar(stream);
989 989 while (c != EOF) {
990 990 if (!isspace(c)) {
991 991 struct token *token = alloc_token(stream);
992 992 stream->token = token;
993 993 stream->newline = 0;
994 994 stream->whitespace = 0;
995 995 c = get_one_token(c, stream);
996 996 continue;
997 997 }
998 998 stream->whitespace = 1;
999 999 c = nextchar(stream);
1000 1000 }
1001 1001 return mark_eof(stream);
1002 1002 }
1003 1003
1004 1004 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1005 1005 {
1006 1006 stream_t stream;
1007 1007 struct token *begin;
1008 1008
1009 1009 begin = setup_stream(&stream, 0, -1, buffer, size);
1010 1010 *endtoken = tokenize_stream(&stream);
1011 1011 return begin;
1012 1012 }
1013 1013
1014 1014 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1015 1015 {
1016 1016 struct token *begin, *end;
1017 1017 stream_t stream;
1018 1018 unsigned char buffer[BUFSIZE];
1019 1019 int idx;
1020 1020
1021 1021 idx = init_stream(name, fd, next_path);
1022 1022 if (idx < 0) {
1023 1023 // info(endtoken->pos, "File %s is const", name);
1024 1024 return endtoken;
1025 1025 }
1026 1026
1027 1027 begin = setup_stream(&stream, idx, fd, buffer, 0);
1028 1028 end = tokenize_stream(&stream);
1029 1029 if (endtoken)
1030 1030 end->next = endtoken;
1031 1031 return begin;
1032 1032 }
↓ open down ↓ |
190 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX