Print this page
new smatch
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/tools/smatch/src/tokenize.c
+++ new/usr/src/tools/smatch/src/tokenize.c
1 1 /*
2 2 * This is a really stupid C tokenizer. It doesn't do any include
3 3 * files or anything complex at all. That's the preprocessor.
4 4 *
5 5 * Copyright (C) 2003 Transmeta Corp.
6 6 * 2003 Linus Torvalds
7 7 *
8 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 9 * of this software and associated documentation files (the "Software"), to deal
10 10 * in the Software without restriction, including without limitation the rights
11 11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 12 * copies of the Software, and to permit persons to whom the Software is
13 13 * furnished to do so, subject to the following conditions:
14 14 *
15 15 * The above copyright notice and this permission notice shall be included in
16 16 * all copies or substantial portions of the Software.
17 17 *
18 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 24 * THE SOFTWARE.
25 25 */
26 26 #include <stdio.h>
27 27 #include <stdlib.h>
28 28 #include <stdarg.h>
29 29 #include <stddef.h>
30 30 #include <string.h>
31 31 #include <ctype.h>
32 32 #include <unistd.h>
33 33 #include <stdint.h>
34 34
35 35 #include "lib.h"
36 36 #include "allocate.h"
37 37 #include "token.h"
38 38 #include "symbol.h"
39 39
40 40 #define EOF (-1)
41 41
42 42 int input_stream_nr = 0;
43 43 struct stream *input_streams;
44 44 static int input_streams_allocated;
45 45 unsigned int tabstop = 8;
46 46 int no_lineno = 0;
47 47
48 48 #define BUFSIZE (8192)
49 49
50 50 typedef struct {
51 51 int fd, offset, size;
52 52 int pos, line, nr;
53 53 int newline, whitespace;
54 54 struct token **tokenlist;
55 55 struct token *token;
56 56 unsigned char *buffer;
57 57 } stream_t;
58 58
59 59 const char *stream_name(int stream)
60 60 {
61 61 if (stream < 0 || stream > input_stream_nr)
62 62 return "<bad stream>";
63 63 return input_streams[stream].name;
64 64 }
65 65
66 66 static struct position stream_pos(stream_t *stream)
67 67 {
68 68 struct position pos;
69 69 pos.type = 0;
70 70 pos.stream = stream->nr;
71 71 pos.newline = stream->newline;
72 72 pos.whitespace = stream->whitespace;
73 73 pos.pos = stream->pos;
74 74
75 75 pos.line = stream->line;
76 76 if (no_lineno)
77 77 pos.line = 123456;
78 78
79 79 pos.noexpand = 0;
80 80 return pos;
81 81 }
82 82
83 83 const char *show_special(int val)
84 84 {
85 85 static char buffer[4];
↓ open down ↓ |
85 lines elided |
↑ open up ↑ |
86 86
87 87 buffer[0] = val;
88 88 buffer[1] = 0;
89 89 if (val >= SPECIAL_BASE)
90 90 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
91 91 return buffer;
92 92 }
93 93
94 94 const char *show_ident(const struct ident *ident)
95 95 {
96 - static char buffer[256];
96 + static char buff[4][256];
97 + static int n;
98 + char *buffer;
99 +
97 100 if (!ident)
98 101 return "<noident>";
102 + buffer = buff[3 & ++n];
99 103 sprintf(buffer, "%.*s", ident->len, ident->name);
100 104 return buffer;
101 105 }
102 106
/*
 * Append the source-code spelling of byte `c` at `ptr` and return the
 * new end pointer.
 *
 * Printable bytes are copied as-is, with a backslash in front of the
 * `escape` character and of backslash itself.  Newline and tab become
 * \n and \t.  Everything else becomes an octal escape; it is padded to
 * three digits when the following byte `next` is a digit, so the two
 * cannot run together.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	if (c == escape || c == '\\')
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
125 129
126 130 const char *show_string(const struct string *string)
127 131 {
128 132 static char buffer[4 * MAX_STRING + 3];
129 133 char *ptr;
130 134 int i;
131 135
132 - if (!string->length)
136 + if (!string || !string->length)
133 137 return "<bad_string>";
134 138 ptr = buffer;
135 139 *ptr++ = '"';
136 140 for (i = 0; i < string->length-1; i++) {
137 141 const char *p = string->data + i;
138 142 ptr = charstr(ptr, p[0], '"', p[1]);
139 143 }
140 144 *ptr++ = '"';
141 145 *ptr = '\0';
142 146 return buffer;
143 147 }
144 148
145 149 static const char *show_char(const char *s, size_t len, char prefix, char delim)
146 150 {
147 151 static char buffer[MAX_STRING + 4];
148 152 char *p = buffer;
149 153 if (prefix)
150 154 *p++ = prefix;
151 155 *p++ = delim;
152 156 memcpy(p, s, len);
153 157 p += len;
154 158 *p++ = delim;
155 159 *p++ = '\0';
156 160 return buffer;
157 161 }
158 162
159 163 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
160 164 {
161 165 static char buffer[2*MAX_STRING + 6];
162 166 size_t i;
163 167 char *p = buffer;
164 168 if (prefix)
165 169 *p++ = prefix;
166 170 if (delim == '"')
167 171 *p++ = '\\';
168 172 *p++ = delim;
169 173 for (i = 0; i < len; i++) {
170 174 if (s[i] == '"' || s[i] == '\\')
171 175 *p++ = '\\';
172 176 *p++ = s[i];
173 177 }
174 178 if (delim == '"')
175 179 *p++ = '\\';
176 180 *p++ = delim;
177 181 *p++ = '\0';
178 182 return buffer;
179 183 }
180 184
181 185 const char *show_token(const struct token *token)
182 186 {
183 187 static char buffer[256];
184 188
185 189 if (!token)
186 190 return "<no token>";
187 191 switch (token_type(token)) {
188 192 case TOKEN_ERROR:
189 193 return "syntax error";
190 194
191 195 case TOKEN_EOF:
192 196 return "end-of-input";
193 197
194 198 case TOKEN_IDENT:
195 199 return show_ident(token->ident);
196 200
197 201 case TOKEN_NUMBER:
198 202 return token->number;
199 203
200 204 case TOKEN_SPECIAL:
201 205 return show_special(token->special);
202 206
203 207 case TOKEN_CHAR:
204 208 return show_char(token->string->data,
205 209 token->string->length - 1, 0, '\'');
206 210 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
207 211 return show_char(token->embedded,
208 212 token_type(token) - TOKEN_CHAR, 0, '\'');
209 213 case TOKEN_WIDE_CHAR:
210 214 return show_char(token->string->data,
211 215 token->string->length - 1, 'L', '\'');
212 216 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
213 217 return show_char(token->embedded,
214 218 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
215 219 case TOKEN_STRING:
216 220 return show_char(token->string->data,
217 221 token->string->length - 1, 0, '"');
218 222 case TOKEN_WIDE_STRING:
219 223 return show_char(token->string->data,
220 224 token->string->length - 1, 'L', '"');
221 225
222 226 case TOKEN_STREAMBEGIN:
223 227 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
224 228 return buffer;
225 229
226 230 case TOKEN_STREAMEND:
227 231 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
228 232 return buffer;
229 233
230 234 case TOKEN_UNTAINT:
231 235 sprintf(buffer, "<untaint>");
232 236 return buffer;
233 237
234 238 case TOKEN_ARG_COUNT:
235 239 sprintf(buffer, "<argcnt>");
236 240 return buffer;
237 241
238 242 default:
239 243 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
240 244 return buffer;
241 245 }
242 246 }
243 247
244 248 const char *quote_token(const struct token *token)
245 249 {
246 250 static char buffer[256];
247 251
248 252 switch (token_type(token)) {
249 253 case TOKEN_ERROR:
250 254 return "syntax error";
251 255
252 256 case TOKEN_IDENT:
253 257 return show_ident(token->ident);
254 258
255 259 case TOKEN_NUMBER:
256 260 return token->number;
257 261
258 262 case TOKEN_SPECIAL:
259 263 return show_special(token->special);
260 264
261 265 case TOKEN_CHAR:
262 266 return quote_char(token->string->data,
263 267 token->string->length - 1, 0, '\'');
264 268 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
265 269 return quote_char(token->embedded,
266 270 token_type(token) - TOKEN_CHAR, 0, '\'');
267 271 case TOKEN_WIDE_CHAR:
268 272 return quote_char(token->string->data,
269 273 token->string->length - 1, 'L', '\'');
270 274 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
271 275 return quote_char(token->embedded,
272 276 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
273 277 case TOKEN_STRING:
274 278 return quote_char(token->string->data,
275 279 token->string->length - 1, 0, '"');
276 280 case TOKEN_WIDE_STRING:
277 281 return quote_char(token->string->data,
278 282 token->string->length - 1, 'L', '"');
279 283 default:
280 284 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
281 285 return buffer;
282 286 }
283 287 }
284 288
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One bucket head per hash value; -1 marks an empty bucket. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Hash a stream name and return a pointer to its bucket head in
 * input_stream_hashes[].  The 32-bit hash is multiplied by HASH_PRIME
 * and its top HASHED_INPUT_BITS bits select the bucket.
 */
int *hash_stream(const char *name)
{
	const unsigned char *p = (const unsigned char *)name;
	uint32_t h = 0;

	for (; *p != 0; p++)
		h = (h + (*p << 4) + (*p >> 4)) * 11;

	h *= HASH_PRIME;
	h >>= 32 - HASHED_INPUT_BITS;
	return &input_stream_hashes[h];
}
303 307
304 308 int init_stream(const char *name, int fd, const char **next_path)
305 309 {
306 310 int stream = input_stream_nr, *hash;
307 311 struct stream *current;
308 312
309 313 if (stream >= input_streams_allocated) {
310 314 int newalloc = stream * 4 / 3 + 10;
311 315 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
312 316 if (!input_streams)
313 317 die("Unable to allocate more streams space");
314 318 input_streams_allocated = newalloc;
315 319 }
316 320 current = input_streams + stream;
317 321 memset(current, 0, sizeof(*current));
318 322 current->name = name;
319 323 current->fd = fd;
320 324 current->next_path = next_path;
321 325 current->path = NULL;
322 326 current->constant = CONSTANT_FILE_MAYBE;
323 327 input_stream_nr = stream+1;
324 328 hash = hash_stream(name);
325 329 current->next_stream = *hash;
326 330 *hash = stream;
327 331 return stream;
328 332 }
329 333
330 334 static struct token * alloc_token(stream_t *stream)
331 335 {
332 336 struct token *token = __alloc_token(0);
333 337 token->pos = stream_pos(stream);
334 338 return token;
335 339 }
336 340
337 341 /*
338 342 * Argh... That was surprisingly messy - handling '\r' complicates the
339 343 * things a _lot_.
340 344 */
341 345 static int nextchar_slow(stream_t *stream)
342 346 {
343 347 int offset = stream->offset;
344 348 int size = stream->size;
345 349 int c;
346 350 int spliced = 0, had_cr, had_backslash;
347 351
348 352 restart:
349 353 had_cr = had_backslash = 0;
350 354
351 355 repeat:
352 356 if (offset >= size) {
353 357 if (stream->fd < 0)
354 358 goto got_eof;
355 359 size = read(stream->fd, stream->buffer, BUFSIZE);
356 360 if (size <= 0)
357 361 goto got_eof;
358 362 stream->size = size;
359 363 stream->offset = offset = 0;
360 364 }
361 365
362 366 c = stream->buffer[offset++];
363 367 if (had_cr)
364 368 goto check_lf;
365 369
366 370 if (c == '\r') {
367 371 had_cr = 1;
368 372 goto repeat;
369 373 }
370 374
371 375 norm:
372 376 if (!had_backslash) {
373 377 switch (c) {
374 378 case '\t':
375 379 stream->pos += tabstop - stream->pos % tabstop;
376 380 break;
377 381 case '\n':
378 382 stream->line++;
379 383 stream->pos = 0;
380 384 stream->newline = 1;
381 385 break;
382 386 case '\\':
383 387 had_backslash = 1;
384 388 stream->pos++;
385 389 goto repeat;
386 390 default:
387 391 stream->pos++;
388 392 }
389 393 } else {
390 394 if (c == '\n') {
391 395 stream->line++;
392 396 stream->pos = 0;
393 397 spliced = 1;
394 398 goto restart;
395 399 }
396 400 offset--;
397 401 c = '\\';
398 402 }
399 403 out:
400 404 stream->offset = offset;
401 405
402 406 return c;
403 407
404 408 check_lf:
405 409 if (c != '\n')
406 410 offset--;
407 411 c = '\n';
408 412 goto norm;
409 413
410 414 got_eof:
411 415 if (had_backslash) {
412 416 c = '\\';
413 417 goto out;
414 418 }
415 419 if (stream->pos)
416 420 warning(stream_pos(stream), "no newline at end of file");
417 421 else if (spliced)
418 422 warning(stream_pos(stream), "backslash-newline at end of file");
419 423 return EOF;
420 424 }
421 425
422 426 /*
423 427 * We want that as light as possible while covering all normal cases.
424 428 * Slow path (including the logics with line-splicing and EOF sanity
425 429 * checks) is in nextchar_slow().
426 430 */
427 431 static inline int nextchar(stream_t *stream)
428 432 {
429 433 int offset = stream->offset;
430 434
431 435 if (offset < stream->size) {
432 436 int c = stream->buffer[offset++];
433 437 static const char special[256] = {
434 438 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
435 439 };
436 440 if (!special[c]) {
437 441 stream->offset = offset;
438 442 stream->pos++;
439 443 return c;
440 444 }
441 445 }
↓ open down ↓ |
299 lines elided |
↑ open up ↑ |
442 446 return nextchar_slow(stream);
443 447 }
444 448
445 449 struct token eof_token_entry;
446 450
447 451 static struct token *mark_eof(stream_t *stream)
448 452 {
449 453 struct token *end;
450 454
451 455 end = alloc_token(stream);
456 + eof_token_entry.pos = end->pos;
452 457 token_type(end) = TOKEN_STREAMEND;
453 458 end->pos.newline = 1;
454 459
455 460 eof_token_entry.next = &eof_token_entry;
456 461 eof_token_entry.pos.newline = 1;
457 462
458 463 end->next = &eof_token_entry;
459 464 *stream->tokenlist = end;
460 465 stream->tokenlist = NULL;
461 466 return end;
462 467 }
463 468
464 469 static void add_token(stream_t *stream)
465 470 {
466 471 struct token *token = stream->token;
467 472
468 473 stream->token = NULL;
469 474 token->next = NULL;
470 475 *stream->tokenlist = token;
471 476 stream->tokenlist = &token->next;
472 477 }
473 478
474 479 static void drop_token(stream_t *stream)
475 480 {
476 481 stream->newline |= stream->token->pos.newline;
477 482 stream->whitespace |= stream->token->pos.whitespace;
478 483 stream->token = NULL;
479 484 }
480 485
↓ open down ↓ |
19 lines elided |
↑ open up ↑ |
/* Character-class bits stored in cclass[]. */
enum {
	Letter = 1,
	Digit = 2,
	Hex = 4,
	Exp = 8,
	Dot = 16,
	ValidSecond = 32,
	Quote = 64,
};

/*
 * Class bits for every input character, indexed by (character + 1) so
 * that EOF (-1) maps to slot 0, which carries no bits.
 */
static const char cclass[257] = {
	['0' + 1 ... '9' + 1] = Digit | Hex,
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,	/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,	/* e<exp> */
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,	/* p<exp> */
	['q' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
	['\'' + 1] = Quote,
	['"' + 1] = Quote,
};
530 523
531 524 /*
532 525 * pp-number:
533 526 * digit
534 527 * . digit
535 528 * pp-number digit
536 529 * pp-number identifier-nodigit
↓ open down ↓ |
10 lines elided |
↑ open up ↑ |
537 530 * pp-number e sign
538 531 * pp-number E sign
539 532 * pp-number p sign
540 533 * pp-number P sign
541 534 * pp-number .
542 535 */
543 536 static int get_one_number(int c, int next, stream_t *stream)
544 537 {
545 538 struct token *token;
546 539 static char buffer[4095];
547 - char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
548 - int len;
540 + char *p = buffer, *buffer_end = buffer + sizeof (buffer);
549 541
550 542 *p++ = c;
551 543 for (;;) {
552 544 long class = cclass[next + 1];
553 545 if (!(class & (Dot | Digit | Letter)))
554 546 break;
555 547 if (p != buffer_end)
556 548 *p++ = next;
557 549 next = nextchar(stream);
558 550 if (class & Exp) {
559 551 if (next == '-' || next == '+') {
560 552 if (p != buffer_end)
561 553 *p++ = next;
562 554 next = nextchar(stream);
563 555 }
564 556 }
565 557 }
↓ open down ↓ |
7 lines elided |
↑ open up ↑ |
566 558
567 559 if (p == buffer_end) {
568 560 sparse_error(stream_pos(stream), "number token exceeds %td characters",
569 561 buffer_end - buffer);
570 562 // Pretend we saw just "1".
571 563 buffer[0] = '1';
572 564 p = buffer + 1;
573 565 }
574 566
575 567 *p++ = 0;
576 - len = p - buffer;
577 - buf = __alloc_bytes(len);
578 - memcpy(buf, buffer, len);
579 -
580 568 token = stream->token;
581 569 token_type(token) = TOKEN_NUMBER;
582 - token->number = buf;
570 + token->number = xmemdup(buffer, p - buffer);
583 571 add_token(stream);
584 572
585 573 return next;
586 574 }
587 575
588 576 static int eat_string(int next, stream_t *stream, enum token_type type)
589 577 {
590 578 static char buffer[MAX_STRING];
591 579 struct string *string;
592 580 struct token *token = stream->token;
593 581 int len = 0;
↓ open down ↓ |
1 lines elided |
↑ open up ↑ |
594 582 int escape;
595 583 int want_hex = 0;
596 584 char delim = type < TOKEN_STRING ? '\'' : '"';
597 585
598 586 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
599 587 if (len < MAX_STRING)
600 588 buffer[len] = next;
601 589 len++;
602 590 if (next == '\n') {
603 591 warning(stream_pos(stream),
604 - "Newline in string or character constant");
605 - if (delim == '\'') /* assume it's lost ' */
606 - break;
592 + "missing terminating %c character", delim);
593 + /* assume delimiter is lost */
594 + break;
607 595 }
608 596 if (next == EOF) {
609 597 warning(stream_pos(stream),
610 598 "End of file in middle of string");
611 599 return next;
612 600 }
613 601 if (!escape) {
614 602 if (want_hex && !(cclass[next + 1] & Hex))
615 603 warning(stream_pos(stream),
616 604 "\\x used with no following hex digits");
617 605 want_hex = 0;
618 606 escape = next == '\\';
619 607 } else {
620 608 escape = 0;
621 609 want_hex = next == 'x';
622 610 }
623 611 }
624 612 if (want_hex)
625 613 warning(stream_pos(stream),
626 614 "\\x used with no following hex digits");
627 615 if (len > MAX_STRING) {
628 616 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
629 617 len = MAX_STRING;
630 618 }
631 619 if (delim == '\'' && len <= 4) {
632 620 if (len == 0) {
633 621 sparse_error(stream_pos(stream),
634 622 "empty character constant");
635 623 return nextchar(stream);
636 624 }
637 625 token_type(token) = type + len;
638 626 memset(buffer + len, '\0', 4 - len);
639 627 memcpy(token->embedded, buffer, 4);
640 628 } else {
641 629 token_type(token) = type;
642 630 string = __alloc_string(len+1);
643 631 memcpy(string->data, buffer, len);
644 632 string->data[len] = '\0';
645 633 string->length = len+1;
646 634 token->string = string;
647 635 }
648 636
649 637 /* Pass it on.. */
650 638 token = stream->token;
651 639 add_token(stream);
652 640 return nextchar(stream);
653 641 }
654 642
655 643 static int drop_stream_eoln(stream_t *stream)
656 644 {
657 645 drop_token(stream);
658 646 for (;;) {
659 647 switch (nextchar(stream)) {
660 648 case EOF:
661 649 return EOF;
662 650 case '\n':
663 651 return nextchar(stream);
664 652 }
665 653 }
666 654 }
667 655
668 656 static int drop_stream_comment(stream_t *stream)
669 657 {
670 658 int newline;
671 659 int next;
672 660 drop_token(stream);
673 661 newline = stream->newline;
674 662
675 663 next = nextchar(stream);
676 664 for (;;) {
677 665 int curr = next;
678 666 if (curr == EOF) {
679 667 warning(stream_pos(stream), "End of file in the middle of a comment");
680 668 return curr;
681 669 }
682 670 next = nextchar(stream);
683 671 if (curr == '*' && next == '/')
684 672 break;
685 673 }
686 674 stream->newline = newline;
687 675 return nextchar(stream);
688 676 }
689 677
690 678 unsigned char combinations[][4] = COMBINATION_STRINGS;
691 679
692 680 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
693 681
694 682 /* hash function for two-character punctuators - all give unique values */
695 683 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
696 684
697 685 /*
698 686 * note that we won't get false positives - special_hash(0,0) is 0 and
699 687 * entry 0 is filled (by +=), so all the missing ones are OK.
700 688 */
701 689 static unsigned char hash_results[32][2] = {
702 690 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
703 691 RES('+', '='), /* 00 */
704 692 RES('/', '='), /* 01 */
705 693 RES('^', '='), /* 05 */
706 694 RES('&', '&'), /* 07 */
707 695 RES('#', '#'), /* 08 */
708 696 RES('<', '<'), /* 0a */
709 697 RES('<', '='), /* 0c */
710 698 RES('!', '='), /* 0e */
711 699 RES('%', '='), /* 0f */
712 700 RES('-', '-'), /* 10 */
713 701 RES('-', '='), /* 11 */
714 702 RES('-', '>'), /* 13 */
715 703 RES('=', '='), /* 15 */
716 704 RES('&', '='), /* 17 */
717 705 RES('*', '='), /* 18 */
718 706 RES('.', '.'), /* 1a */
719 707 RES('+', '+'), /* 1b */
720 708 RES('|', '='), /* 1c */
721 709 RES('>', '='), /* 1d */
722 710 RES('|', '|'), /* 1e */
723 711 RES('>', '>') /* 1f */
724 712 #undef RES
725 713 };
726 714 static int code[32] = {
727 715 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
728 716 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
729 717 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
730 718 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
731 719 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
732 720 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
733 721 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
734 722 CODE('<', '=', SPECIAL_LTE), /* 0c */
735 723 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
736 724 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
737 725 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
738 726 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
739 727 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
740 728 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
741 729 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
742 730 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
743 731 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
744 732 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
745 733 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
746 734 CODE('>', '=', SPECIAL_GTE), /* 1d */
747 735 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
748 736 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
749 737 #undef CODE
750 738 };
751 739
752 740 static int get_one_special(int c, stream_t *stream)
753 741 {
754 742 struct token *token;
755 743 int next, value, i;
756 744
757 745 next = nextchar(stream);
758 746
759 747 /*
760 748 * Check for numbers, strings, character constants, and comments
761 749 */
762 750 switch (c) {
763 751 case '.':
764 752 if (next >= '0' && next <= '9')
765 753 return get_one_number(c, next, stream);
766 754 break;
767 755 case '"':
768 756 return eat_string(next, stream, TOKEN_STRING);
769 757 case '\'':
770 758 return eat_string(next, stream, TOKEN_CHAR);
771 759 case '/':
772 760 if (next == '/')
773 761 return drop_stream_eoln(stream);
774 762 if (next == '*')
775 763 return drop_stream_comment(stream);
776 764 }
777 765
778 766 /*
779 767 * Check for combinations
780 768 */
781 769 value = c;
782 770 if (cclass[next + 1] & ValidSecond) {
783 771 i = special_hash(c, next);
784 772 if (hash_results[i][0] == c && hash_results[i][1] == next) {
785 773 value = code[i];
786 774 next = nextchar(stream);
787 775 if (value >= SPECIAL_LEFTSHIFT &&
788 776 next == "==."[value - SPECIAL_LEFTSHIFT]) {
789 777 value += 3;
790 778 next = nextchar(stream);
791 779 }
792 780 }
793 781 }
794 782
795 783 /* Pass it on.. */
796 784 token = stream->token;
797 785 token_type(token) = TOKEN_SPECIAL;
798 786 token->special = value;
799 787 add_token(stream);
800 788 return next;
801 789 }
802 790
803 791 #define IDENT_HASH_BITS (13)
804 792 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
805 793 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
806 794
807 795 #define ident_hash_init(c) (c)
808 796 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
809 797 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
810 798
811 799 static struct ident *hash_table[IDENT_HASH_SIZE];
812 800 static int ident_hit, ident_miss, idents;
813 801
814 802 void show_identifier_stats(void)
815 803 {
816 804 int i;
817 805 int distribution[100];
818 806
819 807 fprintf(stderr, "identifiers: %d hits, %d misses\n",
820 808 ident_hit, ident_miss);
821 809
822 810 for (i = 0; i < 100; i++)
823 811 distribution[i] = 0;
824 812
825 813 for (i = 0; i < IDENT_HASH_SIZE; i++) {
826 814 struct ident * ident = hash_table[i];
827 815 int count = 0;
828 816
829 817 while (ident) {
830 818 count++;
831 819 ident = ident->next;
832 820 }
833 821 if (count > 99)
834 822 count = 99;
835 823 distribution[count]++;
836 824 }
837 825
838 826 for (i = 0; i < 100; i++) {
839 827 if (distribution[i])
840 828 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
841 829 }
842 830 }
843 831
844 832 static struct ident *alloc_ident(const char *name, int len)
845 833 {
846 834 struct ident *ident = __alloc_ident(len);
847 835 ident->symbols = NULL;
848 836 ident->len = len;
849 837 ident->tainted = 0;
850 838 memcpy(ident->name, name, len);
851 839 return ident;
852 840 }
853 841
854 842 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
855 843 {
856 844 ident->next = hash_table[hash];
857 845 hash_table[hash] = ident;
858 846 ident_miss++;
859 847 return ident;
860 848 }
861 849
862 850 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
863 851 {
864 852 struct ident *ident;
865 853 struct ident **p;
866 854
867 855 p = &hash_table[hash];
868 856 while ((ident = *p) != NULL) {
869 857 if (ident->len == (unsigned char) len) {
870 858 if (strncmp(name, ident->name, len) != 0)
871 859 goto next;
872 860
873 861 ident_hit++;
874 862 return ident;
875 863 }
876 864 next:
877 865 //misses++;
878 866 p = &ident->next;
879 867 }
880 868 ident = alloc_ident(name, len);
881 869 *p = ident;
882 870 ident->next = NULL;
883 871 ident_miss++;
884 872 idents++;
885 873 return ident;
886 874 }
887 875
/*
 * Compute the identifier hash-table index for the `len` bytes at
 * `name`, using the same ident_hash_init/add/end steps that
 * get_one_identifier() applies while scanning.
 *
 * A length of zero or less reads nothing from `name` and hashes as an
 * empty sequence.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end(0UL);

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, p[i]);
	return ident_hash_end(hash);
}
900 888
901 889 struct ident *hash_ident(struct ident *ident)
902 890 {
903 891 return insert_hash(ident, hash_name(ident->name, ident->len));
904 892 }
905 893
/* Find or create the identifier for a NUL-terminated name. */
struct ident *built_in_ident(const char *name)
{
	const int length = strlen(name);
	const unsigned long bucket = hash_name(name, length);

	return create_hashed_ident(name, length, bucket);
}
911 899
912 900 struct token *built_in_token(int stream, struct ident *ident)
913 901 {
914 902 struct token *token;
915 903
916 904 token = __alloc_token(0);
917 905 token->pos.stream = stream;
918 906 token_type(token) = TOKEN_IDENT;
919 907 token->ident = ident;
920 908 return token;
921 909 }
922 910
923 911 static int get_one_identifier(int c, stream_t *stream)
924 912 {
925 913 struct token *token;
926 914 struct ident *ident;
927 915 unsigned long hash;
928 916 char buf[256];
929 917 int len = 1;
930 918 int next;
931 919
932 920 hash = ident_hash_init(c);
933 921 buf[0] = c;
934 922 for (;;) {
935 923 next = nextchar(stream);
936 924 if (!(cclass[next + 1] & (Letter | Digit)))
937 925 break;
938 926 if (len >= sizeof(buf))
939 927 break;
940 928 hash = ident_hash_add(hash, next);
941 929 buf[len] = next;
942 930 len++;
943 931 };
944 932 if (cclass[next + 1] & Quote) {
945 933 if (len == 1 && buf[0] == 'L') {
946 934 if (next == '\'')
947 935 return eat_string(nextchar(stream), stream,
948 936 TOKEN_WIDE_CHAR);
949 937 else
950 938 return eat_string(nextchar(stream), stream,
951 939 TOKEN_WIDE_STRING);
952 940 }
953 941 }
954 942 hash = ident_hash_end(hash);
955 943 ident = create_hashed_ident(buf, len, hash);
956 944
957 945 /* Pass it on.. */
958 946 token = stream->token;
959 947 token_type(token) = TOKEN_IDENT;
960 948 token->ident = ident;
961 949 add_token(stream);
962 950 return next;
963 951 }
964 952
965 953 static int get_one_token(int c, stream_t *stream)
966 954 {
967 955 long class = cclass[c + 1];
968 956 if (class & Digit)
969 957 return get_one_number(c, nextchar(stream), stream);
970 958 if (class & Letter)
971 959 return get_one_identifier(c, stream);
972 960 return get_one_special(c, stream);
973 961 }
974 962
975 963 static struct token *setup_stream(stream_t *stream, int idx, int fd,
976 964 unsigned char *buf, unsigned int buf_size)
977 965 {
978 966 struct token *begin;
979 967
980 968 stream->nr = idx;
981 969 stream->line = 1;
982 970 stream->newline = 1;
983 971 stream->whitespace = 0;
984 972 stream->pos = 0;
985 973
986 974 stream->token = NULL;
987 975 stream->fd = fd;
988 976 stream->offset = 0;
989 977 stream->size = buf_size;
990 978 stream->buffer = buf;
991 979
992 980 begin = alloc_token(stream);
993 981 token_type(begin) = TOKEN_STREAMBEGIN;
994 982 stream->tokenlist = &begin->next;
995 983 return begin;
996 984 }
997 985
998 986 static struct token *tokenize_stream(stream_t *stream)
999 987 {
1000 988 int c = nextchar(stream);
1001 989 while (c != EOF) {
1002 990 if (!isspace(c)) {
1003 991 struct token *token = alloc_token(stream);
1004 992 stream->token = token;
1005 993 stream->newline = 0;
1006 994 stream->whitespace = 0;
1007 995 c = get_one_token(c, stream);
1008 996 continue;
1009 997 }
1010 998 stream->whitespace = 1;
1011 999 c = nextchar(stream);
1012 1000 }
1013 1001 return mark_eof(stream);
1014 1002 }
1015 1003
1016 1004 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1017 1005 {
1018 1006 stream_t stream;
1019 1007 struct token *begin;
1020 1008
1021 1009 begin = setup_stream(&stream, 0, -1, buffer, size);
1022 1010 *endtoken = tokenize_stream(&stream);
1023 1011 return begin;
1024 1012 }
1025 1013
1026 1014 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1027 1015 {
1028 1016 struct token *begin, *end;
1029 1017 stream_t stream;
1030 1018 unsigned char buffer[BUFSIZE];
1031 1019 int idx;
1032 1020
1033 1021 idx = init_stream(name, fd, next_path);
1034 1022 if (idx < 0) {
1035 1023 // info(endtoken->pos, "File %s is const", name);
1036 1024 return endtoken;
1037 1025 }
1038 1026
1039 1027 begin = setup_stream(&stream, idx, fd, buffer, 0);
1040 1028 end = tokenize_stream(&stream);
1041 1029 if (endtoken)
1042 1030 end->next = endtoken;
1043 1031 return begin;
1044 1032 }
↓ open down ↓ |
428 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX