2 Wdiff usr/src/tools/smatch/src/tokenize.c

Print this page

12724 update smatch to 0.6.1-rc1-il-5

Split	Close
Expand all
Collapse all

          --- old/usr/src/tools/smatch/src/tokenize.c
          +++ new/usr/src/tools/smatch/src/tokenize.c

   1    1  /*
   2    2   * This is a really stupid C tokenizer. It doesn't do any include
   3    3   * files or anything complex at all. That's the preprocessor.
   4    4   *
   5    5   * Copyright (C) 2003 Transmeta Corp.
   6    6   *               2003 Linus Torvalds
   7    7   *
   8    8   * Permission is hereby granted, free of charge, to any person obtaining a copy
   9    9   * of this software and associated documentation files (the "Software"), to deal
  10   10   * in the Software without restriction, including without limitation the rights
  11   11   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12   12   * copies of the Software, and to permit persons to whom the Software is
  13   13   * furnished to do so, subject to the following conditions:
  14   14   *
  15   15   * The above copyright notice and this permission notice shall be included in
  16   16   * all copies or substantial portions of the Software.
  17   17   *
  18   18   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19   19   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20   20   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21   21   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22   22   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23   23   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24   24   * THE SOFTWARE.
  25   25   */
  26   26  #include <stdio.h>
  27   27  #include <stdlib.h>
  28   28  #include <stdarg.h>
  29   29  #include <stddef.h>
  30   30  #include <string.h>
  31   31  #include <ctype.h>
  32   32  #include <unistd.h>
  33   33  #include <stdint.h>
  34   34  
  35   35  #include "lib.h"
  36   36  #include "allocate.h"
  37   37  #include "token.h"
  38   38  #include "symbol.h"
  39   39  
  40   40  #define EOF (-1)
  41   41  
  42   42  int input_stream_nr = 0;
  43   43  struct stream *input_streams;
  44   44  static int input_streams_allocated;
  45   45  unsigned int tabstop = 8;
  46   46  int no_lineno = 0;
  47   47  
  48   48  #define BUFSIZE (8192)
  49   49  
  50   50  typedef struct {
  51   51          int fd, offset, size;
  52   52          int pos, line, nr;
  53   53          int newline, whitespace;
  54   54          struct token **tokenlist;
  55   55          struct token *token;
  56   56          unsigned char *buffer;
  57   57  } stream_t;
  58   58  
  59   59  const char *stream_name(int stream)
  60   60  {
  61   61          if (stream < 0 || stream > input_stream_nr)
  62   62                  return "<bad stream>";
  63   63          return input_streams[stream].name;
  64   64  }
  65   65  
  66   66  static struct position stream_pos(stream_t *stream)
  67   67  {
  68   68          struct position pos;
  69   69          pos.type = 0;
  70   70          pos.stream = stream->nr;
  71   71          pos.newline = stream->newline;
  72   72          pos.whitespace = stream->whitespace;
  73   73          pos.pos = stream->pos;
  74   74  
  75   75          pos.line = stream->line;
  76   76          if (no_lineno)
  77   77                  pos.line = 123456;
  78   78  
  79   79          pos.noexpand = 0;
  80   80          return pos;
  81   81  }
  82   82  
  83   83  const char *show_special(int val)
  84   84  {
  85   85          static char buffer[4];
  86   86  
  87   87          buffer[0] = val;
  88   88          buffer[1] = 0;
  89   89          if (val >= SPECIAL_BASE)
  90   90                  strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  91   91          return buffer;
  92   92  }
  93   93  
  94   94  const char *show_ident(const struct ident *ident)
  95   95  {
  96   96          static char buff[4][256];
  97   97          static int n;
  98   98          char *buffer;
  99   99  
 100  100          if (!ident)
 101  101                  return "<noident>";
 102  102          buffer = buff[3 & ++n];
 103  103          sprintf(buffer, "%.*s", ident->len, ident->name);
 104  104          return buffer;
 105  105  }
 106  106  
 107  107  static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
 108  108  {
 109  109          if (isprint(c)) {
 110  110                  if (c == escape || c == '\\')
 111  111                          *ptr++ = '\\';
 112  112                  *ptr++ = c;
 113  113                  return ptr;
 114  114          }
 115  115          *ptr++ = '\\';
 116  116          switch (c) {
 117  117          case '\n':
 118  118                  *ptr++ = 'n';
 119  119                  return ptr;
 120  120          case '\t':
 121  121                  *ptr++ = 't';
 122  122                  return ptr;
 123  123          }
 124  124          if (!isdigit(next))
 125  125                  return ptr + sprintf(ptr, "%o", c);
 126  126                  
 127  127          return ptr + sprintf(ptr, "%03o", c);
 128  128  }
 129  129  
 130  130  const char *show_string(const struct string *string)
 131  131  {
 132  132          static char buffer[4 * MAX_STRING + 3];
 133  133          char *ptr;
 134  134          int i;
 135  135  
 136  136          if (!string || !string->length)
 137  137                  return "<bad_string>";
 138  138          ptr = buffer;
 139  139          *ptr++ = '"';
 140  140          for (i = 0; i < string->length-1; i++) {
 141  141                  const char *p = string->data + i;
 142  142                  ptr = charstr(ptr, p[0], '"', p[1]);
 143  143          }
 144  144          *ptr++ = '"';
 145  145          *ptr = '\0';
 146  146          return buffer;
 147  147  }
 148  148  
 149  149  static const char *show_char(const char *s, size_t len, char prefix, char delim)
 150  150  {
 151  151          static char buffer[MAX_STRING + 4];
 152  152          char *p = buffer;
 153  153          if (prefix)
 154  154                  *p++ = prefix;
 155  155          *p++ = delim;
 156  156          memcpy(p, s, len);
 157  157          p += len;
 158  158          *p++ = delim;
 159  159          *p++ = '\0';
 160  160          return buffer;
 161  161  }
 162  162  
 163  163  static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 164  164  {
 165  165          static char buffer[2*MAX_STRING + 6];
 166  166          size_t i;
 167  167          char *p = buffer;
 168  168          if (prefix)
 169  169                  *p++ = prefix;
 170  170          if (delim == '"')
 171  171                  *p++ = '\\';
 172  172          *p++ = delim;
 173  173          for (i = 0; i < len; i++) {
 174  174                  if (s[i] == '"' || s[i] == '\\')
 175  175                          *p++ = '\\';
 176  176                  *p++ = s[i];
 177  177          }
 178  178          if (delim == '"')
 179  179                  *p++ = '\\';
 180  180          *p++ = delim;
 181  181          *p++ = '\0';
 182  182          return buffer;
 183  183  }
 184  184  
 185  185  const char *show_token(const struct token *token)
 186  186  {
 187  187          static char buffer[256];
 188  188  
 189  189          if (!token)
 190  190                  return "<no token>";
 191  191          switch (token_type(token)) {
 192  192          case TOKEN_ERROR:
 193  193                  return "syntax error";
 194  194  
 195  195          case TOKEN_EOF:
 196  196                  return "end-of-input";
 197  197  
 198  198          case TOKEN_IDENT:
 199  199                  return show_ident(token->ident);
 200  200  
 201  201          case TOKEN_NUMBER:
 202  202                  return token->number;
 203  203  
 204  204          case TOKEN_SPECIAL:
 205  205                  return show_special(token->special);
 206  206  
 207  207          case TOKEN_CHAR: 
 208  208                  return show_char(token->string->data,
 209  209                          token->string->length - 1, 0, '\'');
 210  210          case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 211  211                  return show_char(token->embedded,
 212  212                          token_type(token) - TOKEN_CHAR, 0, '\'');
 213  213          case TOKEN_WIDE_CHAR: 
 214  214                  return show_char(token->string->data,
 215  215                          token->string->length - 1, 'L', '\'');
 216  216          case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 217  217                  return show_char(token->embedded,
 218  218                          token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 219  219          case TOKEN_STRING: 
 220  220                  return show_char(token->string->data,
 221  221                          token->string->length - 1, 0, '"');
 222  222          case TOKEN_WIDE_STRING: 
 223  223                  return show_char(token->string->data,
 224  224                          token->string->length - 1, 'L', '"');
 225  225  
 226  226          case TOKEN_STREAMBEGIN:
 227  227                  sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 228  228                  return buffer;
 229  229  
 230  230          case TOKEN_STREAMEND:
 231  231                  sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 232  232                  return buffer;
 233  233  
 234  234          case TOKEN_UNTAINT:
 235  235                  sprintf(buffer, "<untaint>");
 236  236                  return buffer;
 237  237  
 238  238          case TOKEN_ARG_COUNT:
 239  239                  sprintf(buffer, "<argcnt>");
 240  240                  return buffer;
 241  241  
 242  242          default:
 243  243                  sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 244  244                  return buffer;
 245  245          }
 246  246  }
 247  247  
 248  248  const char *quote_token(const struct token *token)
 249  249  {
 250  250          static char buffer[256];
 251  251  
 252  252          switch (token_type(token)) {
 253  253          case TOKEN_ERROR:
 254  254                  return "syntax error";
 255  255  
 256  256          case TOKEN_IDENT:
 257  257                  return show_ident(token->ident);
 258  258  
 259  259          case TOKEN_NUMBER:
 260  260                  return token->number;
 261  261  
 262  262          case TOKEN_SPECIAL:
 263  263                  return show_special(token->special);
 264  264  
 265  265          case TOKEN_CHAR: 
 266  266                  return quote_char(token->string->data,
 267  267                          token->string->length - 1, 0, '\'');
 268  268          case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 269  269                  return quote_char(token->embedded,
 270  270                          token_type(token) - TOKEN_CHAR, 0, '\'');
 271  271          case TOKEN_WIDE_CHAR: 
 272  272                  return quote_char(token->string->data,
 273  273                          token->string->length - 1, 'L', '\'');
 274  274          case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 275  275                  return quote_char(token->embedded,
 276  276                          token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 277  277          case TOKEN_STRING: 
 278  278                  return quote_char(token->string->data,
 279  279                          token->string->length - 1, 0, '"');
 280  280          case TOKEN_WIDE_STRING: 
 281  281                  return quote_char(token->string->data,
 282  282                          token->string->length - 1, 'L', '"');
 283  283          default:
 284  284                  sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 285  285                  return buffer;
 286  286          }
 287  287  }
 288  288  
 289  289  #define HASHED_INPUT_BITS (6)
 290  290  #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 291  291  #define HASH_PRIME 0x9e370001UL
 292  292  
 293  293  static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 294  294  
 295  295  int *hash_stream(const char *name)
 296  296  {
 297  297          uint32_t hash = 0;
 298  298          unsigned char c;
 299  299  
 300  300          while ((c = *name++) != 0)
 301  301                  hash = (hash + (c << 4) + (c >> 4)) * 11;
 302  302  
 303  303          hash *= HASH_PRIME;
 304  304          hash >>= 32 - HASHED_INPUT_BITS;
 305  305          return input_stream_hashes + hash;
 306  306  }
 307  307  
 308  308  int init_stream(const char *name, int fd, const char **next_path)
 309  309  {
 310  310          int stream = input_stream_nr, *hash;
 311  311          struct stream *current;
 312  312  
 313  313          if (stream >= input_streams_allocated) {
 314  314                  int newalloc = stream * 4 / 3 + 10;
 315  315                  input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 316  316                  if (!input_streams)
 317  317                          die("Unable to allocate more streams space");
 318  318                  input_streams_allocated = newalloc;
 319  319          }
 320  320          current = input_streams + stream;
 321  321          memset(current, 0, sizeof(*current));
 322  322          current->name = name;
 323  323          current->fd = fd;
 324  324          current->next_path = next_path;
 325  325          current->path = NULL;
 326  326          current->constant = CONSTANT_FILE_MAYBE;
 327  327          input_stream_nr = stream+1;
 328  328          hash = hash_stream(name);
 329  329          current->next_stream = *hash;
 330  330          *hash = stream;
 331  331          return stream;
 332  332  }
 333  333  
 334  334  static struct token * alloc_token(stream_t *stream)
 335  335  {
 336  336          struct token *token = __alloc_token(0);
 337  337          token->pos = stream_pos(stream);
 338  338          return token;
 339  339  }
 340  340  
 341  341  /*
 342  342   *  Argh...  That was surprisingly messy - handling '\r' complicates the
 343  343   *  things a _lot_.
 344  344   */
 345  345  static int nextchar_slow(stream_t *stream)
 346  346  {
 347  347          int offset = stream->offset;
 348  348          int size = stream->size;
 349  349          int c;
 350  350          int spliced = 0, had_cr, had_backslash;
 351  351  
 352  352  restart:
 353  353          had_cr = had_backslash = 0;
 354  354  
 355  355  repeat:
 356  356          if (offset >= size) {
 357  357                  if (stream->fd < 0)
 358  358                          goto got_eof;
 359  359                  size = read(stream->fd, stream->buffer, BUFSIZE);
 360  360                  if (size <= 0)
 361  361                          goto got_eof;
 362  362                  stream->size = size;
 363  363                  stream->offset = offset = 0;
 364  364          }
 365  365  
 366  366          c = stream->buffer[offset++];
 367  367          if (had_cr)
 368  368                  goto check_lf;
 369  369  
 370  370          if (c == '\r') {
 371  371                  had_cr = 1;
 372  372                  goto repeat;
 373  373          }
 374  374  
 375  375  norm:
 376  376          if (!had_backslash) {
 377  377                  switch (c) {
 378  378                  case '\t':
 379  379                          stream->pos += tabstop - stream->pos % tabstop;
 380  380                          break;
 381  381                  case '\n':
 382  382                          stream->line++;
 383  383                          stream->pos = 0;
 384  384                          stream->newline = 1;
 385  385                          break;
 386  386                  case '\\':
 387  387                          had_backslash = 1;
 388  388                          stream->pos++;
 389  389                          goto repeat;
 390  390                  default:
 391  391                          stream->pos++;
 392  392                  }
 393  393          } else {
 394  394                  if (c == '\n') {
 395  395                          stream->line++;
 396  396                          stream->pos = 0;
 397  397                          spliced = 1;
 398  398                          goto restart;
 399  399                  }
 400  400                  offset--;
 401  401                  c = '\\';
 402  402          }
 403  403  out:
 404  404          stream->offset = offset;
 405  405  
 406  406          return c;
 407  407  
 408  408  check_lf:
 409  409          if (c != '\n')
 410  410                  offset--;
 411  411          c = '\n';
 412  412          goto norm;
 413  413  
 414  414  got_eof:
 415  415          if (had_backslash) {
 416  416                  c = '\\';
 417  417                  goto out;
 418  418          }
 419  419          if (stream->pos)
 420  420                  warning(stream_pos(stream), "no newline at end of file");
 421  421          else if (spliced)
 422  422                  warning(stream_pos(stream), "backslash-newline at end of file");
 423  423          return EOF;
 424  424  }
 425  425  
 426  426  /*
 427  427   *  We want that as light as possible while covering all normal cases.
 428  428   *  Slow path (including the logics with line-splicing and EOF sanity
 429  429   *  checks) is in nextchar_slow().
 430  430   */
 431  431  static inline int nextchar(stream_t *stream)
 432  432  {
 433  433          int offset = stream->offset;
 434  434  
 435  435          if (offset < stream->size) {
 436  436                  int c = stream->buffer[offset++];
 437  437                  static const char special[256] = {
 438  438                          ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 439  439                  };
 440  440                  if (!special[c]) {
 441  441                          stream->offset = offset;
 442  442                          stream->pos++;
 443  443                          return c;
 444  444                  }
 445  445          }
 446  446          return nextchar_slow(stream);
 447  447  }
 448  448  
 449  449  struct token eof_token_entry;
 450  450  
 451  451  static struct token *mark_eof(stream_t *stream)
 452  452  {
 453  453          struct token *end;
 454  454  
 455  455          end = alloc_token(stream);
 456  456          eof_token_entry.pos = end->pos;
 457  457          token_type(end) = TOKEN_STREAMEND;
 458  458          end->pos.newline = 1;
 459  459  
 460  460          eof_token_entry.next = &eof_token_entry;
 461  461          eof_token_entry.pos.newline = 1;
 462  462  
 463  463          end->next =  &eof_token_entry;
 464  464          *stream->tokenlist = end;
 465  465          stream->tokenlist = NULL;
 466  466          return end;
 467  467  }
 468  468  
 469  469  static void add_token(stream_t *stream)
 470  470  {
 471  471          struct token *token = stream->token;
 472  472  
 473  473          stream->token = NULL;
 474  474          token->next = NULL;
 475  475          *stream->tokenlist = token;
 476  476          stream->tokenlist = &token->next;
 477  477  }
 478  478  
 479  479  static void drop_token(stream_t *stream)
 480  480  {
 481  481          stream->newline |= stream->token->pos.newline;
 482  482          stream->whitespace |= stream->token->pos.whitespace;
 483  483          stream->token = NULL;
 484  484  }
 485  485  
 486  486  enum {
 487  487          Letter = 1,
 488  488          Digit = 2,
 489  489          Hex = 4,
 490  490          Exp = 8,
 491  491          Dot = 16,
 492  492          ValidSecond = 32,
 493  493          Quote = 64,
 494  494  };
 495  495  
 496  496  static const char cclass[257] = {
 497  497          ['0' + 1 ... '9' + 1] = Digit | Hex,
 498  498          ['A' + 1 ... 'D' + 1] = Letter | Hex,
 499  499          ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
 500  500          ['F' + 1] = Letter | Hex,
 501  501          ['G' + 1 ... 'O' + 1] = Letter,
 502  502          ['P' + 1] = Letter | Exp,       /* P<exp> */
 503  503          ['Q' + 1 ... 'Z' + 1] = Letter,
 504  504          ['a' + 1 ... 'd' + 1] = Letter | Hex,
 505  505          ['e' + 1] = Letter | Hex | Exp, /* e<exp> */
 506  506          ['f' + 1] = Letter | Hex,
 507  507          ['g' + 1 ... 'o' + 1] = Letter,
 508  508          ['p' + 1] = Letter | Exp,       /* p<exp> */
 509  509          ['q' + 1 ... 'z' + 1] = Letter,
 510  510          ['_' + 1] = Letter,
 511  511          ['.' + 1] = Dot | ValidSecond,
 512  512          ['=' + 1] = ValidSecond,
 513  513          ['+' + 1] = ValidSecond,
 514  514          ['-' + 1] = ValidSecond,
 515  515          ['>' + 1] = ValidSecond,
 516  516          ['<' + 1] = ValidSecond,
 517  517          ['&' + 1] = ValidSecond,
 518  518          ['|' + 1] = ValidSecond,
 519  519          ['#' + 1] = ValidSecond,
 520  520          ['\'' + 1] = Quote,
 521  521          ['"' + 1] = Quote,
 522  522  };
 523  523  
 524  524  /*
 525  525   * pp-number:
 526  526   *      digit
 527  527   *      . digit
 528  528   *      pp-number digit
 529  529   *      pp-number identifier-nodigit
 530  530   *      pp-number e sign
 531  531   *      pp-number E sign
 532  532   *      pp-number p sign
 533  533   *      pp-number P sign
 534  534   *      pp-number .
 535  535   */
 536  536  static int get_one_number(int c, int next, stream_t *stream)
 537  537  {
 538  538          struct token *token;
 539  539          static char buffer[4095];
 540  540          char *p = buffer, *buffer_end = buffer + sizeof (buffer);
 541  541  
 542  542          *p++ = c;
 543  543          for (;;) {
 544  544                  long class =  cclass[next + 1];
 545  545                  if (!(class & (Dot | Digit | Letter)))
 546  546                          break;
 547  547                  if (p != buffer_end)
 548  548                          *p++ = next;
 549  549                  next = nextchar(stream);
 550  550                  if (class & Exp) {
 551  551                          if (next == '-' || next == '+') {
 552  552                                  if (p != buffer_end)
 553  553                                          *p++ = next;
 554  554                                  next = nextchar(stream);
 555  555                          }
 556  556                  }
 557  557          }
 558  558  
 559  559          if (p == buffer_end) {
 560  560                  sparse_error(stream_pos(stream), "number token exceeds %td characters",
 561  561                        buffer_end - buffer);
 562  562                  // Pretend we saw just "1".
 563  563                  buffer[0] = '1';
 564  564                  p = buffer + 1;
 565  565          }
 566  566  
 567  567          *p++ = 0;
 568  568          token = stream->token;
 569  569          token_type(token) = TOKEN_NUMBER;
 570  570          token->number = xmemdup(buffer, p - buffer);
 571  571          add_token(stream);
 572  572  
 573  573          return next;
 574  574  }
 575  575  
 576  576  static int eat_string(int next, stream_t *stream, enum token_type type)
 577  577  {
 578  578          static char buffer[MAX_STRING];
 579  579          struct string *string;
 580  580          struct token *token = stream->token;
 581  581          int len = 0;
 582  582          int escape;
 583  583          int want_hex = 0;
 584  584          char delim = type < TOKEN_STRING ? '\'' : '"';
 585  585  
 586  586          for (escape = 0; escape || next != delim; next = nextchar(stream)) {
 587  587                  if (len < MAX_STRING)
 588  588                          buffer[len] = next;
 589  589                  len++;
 590  590                  if (next == '\n') {
 591  591                          warning(stream_pos(stream),
 592  592                                  "missing terminating %c character", delim);
 593  593                          /* assume delimiter is lost */
 594  594                          break;
 595  595                  }
 596  596                  if (next == EOF) {
 597  597                          warning(stream_pos(stream),
 598  598                                  "End of file in middle of string");
 599  599                          return next;
 600  600                  }
 601  601                  if (!escape) {
 602  602                          if (want_hex && !(cclass[next + 1] & Hex))
 603  603                                  warning(stream_pos(stream),
 604  604                                          "\\x used with no following hex digits");
 605  605                          want_hex = 0;
 606  606                          escape = next == '\\';
 607  607                  } else {
 608  608                          escape = 0;
 609  609                          want_hex = next == 'x';
 610  610                  }
 611  611          }
 612  612          if (want_hex)
 613  613                  warning(stream_pos(stream),
 614  614                          "\\x used with no following hex digits");
 615  615          if (len > MAX_STRING) {
 616  616                  warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 617  617                  len = MAX_STRING;
 618  618          }
 619  619          if (delim == '\'' && len <= 4) {
 620  620                  if (len == 0) {
 621  621                          sparse_error(stream_pos(stream),
 622  622                                  "empty character constant");
 623  623                          return nextchar(stream);
 624  624                  }
 625  625                  token_type(token) = type + len;
 626  626                  memset(buffer + len, '\0', 4 - len);
 627  627                  memcpy(token->embedded, buffer, 4);
 628  628          } else {
 629  629                  token_type(token) = type;
 630  630                  string = __alloc_string(len+1);
 631  631                  memcpy(string->data, buffer, len);
 632  632                  string->data[len] = '\0';
 633  633                  string->length = len+1;
 634  634                  token->string = string;
 635  635          }
 636  636  
 637  637          /* Pass it on.. */
 638  638          token = stream->token;
 639  639          add_token(stream);
 640  640          return nextchar(stream);
 641  641  }
 642  642  
 643  643  static int drop_stream_eoln(stream_t *stream)
 644  644  {
 645  645          drop_token(stream);
 646  646          for (;;) {
 647  647                  switch (nextchar(stream)) {
 648  648                  case EOF:
 649  649                          return EOF;
 650  650                  case '\n':
 651  651                          return nextchar(stream);
 652  652                  }
 653  653          }
 654  654  }
 655  655  
 656  656  static int drop_stream_comment(stream_t *stream)
 657  657  {
 658  658          int newline;
 659  659          int next;
 660  660          drop_token(stream);
 661  661          newline = stream->newline;
 662  662  
 663  663          next = nextchar(stream);
 664  664          for (;;) {
 665  665                  int curr = next;
 666  666                  if (curr == EOF) {
 667  667                          warning(stream_pos(stream), "End of file in the middle of a comment");
 668  668                          return curr;
 669  669                  }
 670  670                  next = nextchar(stream);
 671  671                  if (curr == '*' && next == '/')
 672  672                          break;
 673  673          }
 674  674          stream->newline = newline;
 675  675          return nextchar(stream);
 676  676  }
 677  677  
 678  678  unsigned char combinations[][4] = COMBINATION_STRINGS;
 679  679  
 680  680  #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 681  681  
 682  682  /* hash function for two-character punctuators - all give unique values */
 683  683  #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 684  684  
 685  685  /*
 686  686   * note that we won't get false positives - special_hash(0,0) is 0 and
 687  687   * entry 0 is filled (by +=), so all the missing ones are OK.
 688  688   */
 689  689  static unsigned char hash_results[32][2] = {
 690  690  #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 691  691          RES('+', '='), /* 00 */
 692  692          RES('/', '='), /* 01 */
 693  693          RES('^', '='), /* 05 */
 694  694          RES('&', '&'), /* 07 */
 695  695          RES('#', '#'), /* 08 */
 696  696          RES('<', '<'), /* 0a */
 697  697          RES('<', '='), /* 0c */
 698  698          RES('!', '='), /* 0e */
 699  699          RES('%', '='), /* 0f */
 700  700          RES('-', '-'), /* 10 */
 701  701          RES('-', '='), /* 11 */
 702  702          RES('-', '>'), /* 13 */
 703  703          RES('=', '='), /* 15 */
 704  704          RES('&', '='), /* 17 */
 705  705          RES('*', '='), /* 18 */
 706  706          RES('.', '.'), /* 1a */
 707  707          RES('+', '+'), /* 1b */
 708  708          RES('|', '='), /* 1c */
 709  709          RES('>', '='), /* 1d */
 710  710          RES('|', '|'), /* 1e */
 711  711          RES('>', '>')  /* 1f */
 712  712  #undef RES
 713  713  };
 714  714  static int code[32] = {
 715  715  #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 716  716          CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 717  717          CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 718  718          CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 719  719          CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 720  720          CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 721  721          CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 722  722          CODE('<', '=', SPECIAL_LTE), /* 0c */
 723  723          CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 724  724          CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 725  725          CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 726  726          CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 727  727          CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 728  728          CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 729  729          CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 730  730          CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 731  731          CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 732  732          CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 733  733          CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 734  734          CODE('>', '=', SPECIAL_GTE), /* 1d */
 735  735          CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 736  736          CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 737  737  #undef CODE
 738  738  };
 739  739  
 740  740  static int get_one_special(int c, stream_t *stream)
 741  741  {
 742  742          struct token *token;
 743  743          int next, value, i;
 744  744  
 745  745          next = nextchar(stream);
 746  746  
 747  747          /*
 748  748           * Check for numbers, strings, character constants, and comments
 749  749           */
 750  750          switch (c) {
 751  751          case '.':
 752  752                  if (next >= '0' && next <= '9')
 753  753                          return get_one_number(c, next, stream);
 754  754                  break;
 755  755          case '"':
 756  756                  return eat_string(next, stream, TOKEN_STRING);
 757  757          case '\'':
 758  758                  return eat_string(next, stream, TOKEN_CHAR);
 759  759          case '/':
 760  760                  if (next == '/')
 761  761                          return drop_stream_eoln(stream);
 762  762                  if (next == '*')
 763  763                          return drop_stream_comment(stream);
 764  764          }
 765  765  
 766  766          /*
 767  767           * Check for combinations
 768  768           */
 769  769          value = c;
 770  770          if (cclass[next + 1] & ValidSecond) {
 771  771                  i = special_hash(c, next);
 772  772                  if (hash_results[i][0] == c && hash_results[i][1] == next) {
 773  773                          value = code[i];
 774  774                          next = nextchar(stream);
 775  775                          if (value >= SPECIAL_LEFTSHIFT &&
 776  776                              next == "==."[value - SPECIAL_LEFTSHIFT]) {
 777  777                                  value += 3;
 778  778                                  next = nextchar(stream);
 779  779                          }
 780  780                  }
 781  781          }
 782  782  
 783  783          /* Pass it on.. */
 784  784          token = stream->token;
 785  785          token_type(token) = TOKEN_SPECIAL;
 786  786          token->special = value;
 787  787          add_token(stream);
 788  788          return next;
 789  789  }
 790  790  
 791  791  #define IDENT_HASH_BITS (13)
 792  792  #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 793  793  #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 794  794  
 795  795  #define ident_hash_init(c)              (c)
 796  796  #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 797  797  #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 798  798  
 799  799  static struct ident *hash_table[IDENT_HASH_SIZE];
 800  800  static int ident_hit, ident_miss, idents;
 801  801  
 802  802  void show_identifier_stats(void)
 803  803  {
 804  804          int i;
 805  805          int distribution[100];
 806  806  
 807  807          fprintf(stderr, "identifiers: %d hits, %d misses\n",
 808  808                  ident_hit, ident_miss);
 809  809  
 810  810          for (i = 0; i < 100; i++)
 811  811                  distribution[i] = 0;
 812  812  
 813  813          for (i = 0; i < IDENT_HASH_SIZE; i++) {
 814  814                  struct ident * ident = hash_table[i];
 815  815                  int count = 0;
 816  816  
 817  817                  while (ident) {
 818  818                          count++;
 819  819                          ident = ident->next;
 820  820                  }
 821  821                  if (count > 99)

↓ open down ↓

821 lines elided

↑ open up ↑

 822  822                          count = 99;
 823  823                  distribution[count]++;
 824  824          }
 825  825  
 826  826          for (i = 0; i < 100; i++) {
 827  827                  if (distribution[i])
 828  828                          fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 829  829          }
 830  830  }
 831  831  
 832      -static struct ident *alloc_ident(const char *name, int len)
      832 +struct ident *alloc_ident(const char *name, int len)
 833  833  {
 834  834          struct ident *ident = __alloc_ident(len);
 835  835          ident->symbols = NULL;
 836  836          ident->len = len;
 837  837          ident->tainted = 0;
 838  838          memcpy(ident->name, name, len);
 839  839          return ident;
 840  840  }
 841  841  
 842  842  static struct ident * insert_hash(struct ident *ident, unsigned long hash)

 843  843  {
 844  844          ident->next = hash_table[hash];
 845  845          hash_table[hash] = ident;
 846  846          ident_miss++;
 847  847          return ident;
 848  848  }
 849  849  
 850  850  static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 851  851  {
 852  852          struct ident *ident;
 853  853          struct ident **p;
 854  854  
 855  855          p = &hash_table[hash];
 856  856          while ((ident = *p) != NULL) {
 857  857                  if (ident->len == (unsigned char) len) {
 858  858                          if (strncmp(name, ident->name, len) != 0)
 859  859                                  goto next;
 860  860  
 861  861                          ident_hit++;
 862  862                          return ident;
 863  863                  }
 864  864  next:
 865  865                  //misses++;
 866  866                  p = &ident->next;
 867  867          }
 868  868          ident = alloc_ident(name, len);
 869  869          *p = ident;
 870  870          ident->next = NULL;
 871  871          ident_miss++;
 872  872          idents++;
 873  873          return ident;
 874  874  }
 875  875  
 876  876  static unsigned long hash_name(const char *name, int len)
 877  877  {
 878  878          unsigned long hash;
 879  879          const unsigned char *p = (const unsigned char *)name;
 880  880  
 881  881          hash = ident_hash_init(*p++);
 882  882          while (--len) {
 883  883                  unsigned int i = *p++;
 884  884                  hash = ident_hash_add(hash, i);
 885  885          }
 886  886          return ident_hash_end(hash);
 887  887  }
 888  888  
 889  889  struct ident *hash_ident(struct ident *ident)
 890  890  {
 891  891          return insert_hash(ident, hash_name(ident->name, ident->len));
 892  892  }
 893  893  
 894  894  struct ident *built_in_ident(const char *name)
 895  895  {
 896  896          int len = strlen(name);
 897  897          return create_hashed_ident(name, len, hash_name(name, len));
 898  898  }
 899  899  
 900  900  struct token *built_in_token(int stream, struct ident *ident)
 901  901  {
 902  902          struct token *token;
 903  903  
 904  904          token = __alloc_token(0);
 905  905          token->pos.stream = stream;
 906  906          token_type(token) = TOKEN_IDENT;
 907  907          token->ident = ident;
 908  908          return token;
 909  909  }
 910  910  
 911  911  static int get_one_identifier(int c, stream_t *stream)
 912  912  {
 913  913          struct token *token;
 914  914          struct ident *ident;
 915  915          unsigned long hash;
 916  916          char buf[256];
 917  917          int len = 1;
 918  918          int next;
 919  919  
 920  920          hash = ident_hash_init(c);
 921  921          buf[0] = c;
 922  922          for (;;) {
 923  923                  next = nextchar(stream);
 924  924                  if (!(cclass[next + 1] & (Letter | Digit)))
 925  925                          break;
 926  926                  if (len >= sizeof(buf))
 927  927                          break;
 928  928                  hash = ident_hash_add(hash, next);
 929  929                  buf[len] = next;
 930  930                  len++;
 931  931          };
 932  932          if (cclass[next + 1] & Quote) {
 933  933                  if (len == 1 && buf[0] == 'L') {
 934  934                          if (next == '\'')
 935  935                                  return eat_string(nextchar(stream), stream,
 936  936                                                          TOKEN_WIDE_CHAR);
 937  937                          else
 938  938                                  return eat_string(nextchar(stream), stream,
 939  939                                                          TOKEN_WIDE_STRING);
 940  940                  }
 941  941          }
 942  942          hash = ident_hash_end(hash);
 943  943          ident = create_hashed_ident(buf, len, hash);
 944  944  
 945  945          /* Pass it on.. */
 946  946          token = stream->token;
 947  947          token_type(token) = TOKEN_IDENT;
 948  948          token->ident = ident;
 949  949          add_token(stream);
 950  950          return next;
 951  951  }               
 952  952  
 953  953  static int get_one_token(int c, stream_t *stream)
 954  954  {
 955  955          long class = cclass[c + 1];
 956  956          if (class & Digit)
 957  957                  return get_one_number(c, nextchar(stream), stream);
 958  958          if (class & Letter)
 959  959                  return get_one_identifier(c, stream);
 960  960          return get_one_special(c, stream);
 961  961  }
 962  962  
 963  963  static struct token *setup_stream(stream_t *stream, int idx, int fd,
 964  964          unsigned char *buf, unsigned int buf_size)
 965  965  {
 966  966          struct token *begin;
 967  967  
 968  968          stream->nr = idx;
 969  969          stream->line = 1;
 970  970          stream->newline = 1;
 971  971          stream->whitespace = 0;
 972  972          stream->pos = 0;
 973  973  
 974  974          stream->token = NULL;
 975  975          stream->fd = fd;
 976  976          stream->offset = 0;
 977  977          stream->size = buf_size;
 978  978          stream->buffer = buf;
 979  979  
 980  980          begin = alloc_token(stream);
 981  981          token_type(begin) = TOKEN_STREAMBEGIN;
 982  982          stream->tokenlist = &begin->next;
 983  983          return begin;
 984  984  }
 985  985  
 986  986  static struct token *tokenize_stream(stream_t *stream)
 987  987  {
 988  988          int c = nextchar(stream);
 989  989          while (c != EOF) {
 990  990                  if (!isspace(c)) {
 991  991                          struct token *token = alloc_token(stream);
 992  992                          stream->token = token;
 993  993                          stream->newline = 0;
 994  994                          stream->whitespace = 0;
 995  995                          c = get_one_token(c, stream);
 996  996                          continue;
 997  997                  }
 998  998                  stream->whitespace = 1;
 999  999                  c = nextchar(stream);
1000 1000          }
1001 1001          return mark_eof(stream);
1002 1002  }
1003 1003  
1004 1004  struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1005 1005  {
1006 1006          stream_t stream;
1007 1007          struct token *begin;
1008 1008  
1009 1009          begin = setup_stream(&stream, 0, -1, buffer, size);
1010 1010          *endtoken = tokenize_stream(&stream);
1011 1011          return begin;
1012 1012  }
1013 1013  
1014 1014  struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1015 1015  {
1016 1016          struct token *begin, *end;
1017 1017          stream_t stream;
1018 1018          unsigned char buffer[BUFSIZE];
1019 1019          int idx;
1020 1020  
1021 1021          idx = init_stream(name, fd, next_path);
1022 1022          if (idx < 0) {
1023 1023                  // info(endtoken->pos, "File %s is const", name);
1024 1024                  return endtoken;
1025 1025          }
1026 1026  
1027 1027          begin = setup_stream(&stream, idx, fd, buffer, 0);
1028 1028          end = tokenize_stream(&stream);
1029 1029          if (endtoken)
1030 1030                  end->next = endtoken;
1031 1031          return begin;
1032 1032  }

↓ open down ↓

190 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX