illumos-gate Wdiff usr/src/tools/smatch/src/tokenize.c

Print this page

new smatch

Split	Close
Expand all
Collapse all

          --- old/usr/src/tools/smatch/src/tokenize.c
          +++ new/usr/src/tools/smatch/src/tokenize.c

   1    1  /*
   2    2   * This is a really stupid C tokenizer. It doesn't do any include
   3    3   * files or anything complex at all. That's the preprocessor.
   4    4   *
   5    5   * Copyright (C) 2003 Transmeta Corp.
   6    6   *               2003 Linus Torvalds
   7    7   *
   8    8   * Permission is hereby granted, free of charge, to any person obtaining a copy
   9    9   * of this software and associated documentation files (the "Software"), to deal
  10   10   * in the Software without restriction, including without limitation the rights
  11   11   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12   12   * copies of the Software, and to permit persons to whom the Software is
  13   13   * furnished to do so, subject to the following conditions:
  14   14   *
  15   15   * The above copyright notice and this permission notice shall be included in
  16   16   * all copies or substantial portions of the Software.
  17   17   *
  18   18   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19   19   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20   20   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21   21   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22   22   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23   23   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24   24   * THE SOFTWARE.
  25   25   */
  26   26  #include <stdio.h>
  27   27  #include <stdlib.h>
  28   28  #include <stdarg.h>
  29   29  #include <stddef.h>
  30   30  #include <string.h>
  31   31  #include <ctype.h>
  32   32  #include <unistd.h>
  33   33  #include <stdint.h>
  34   34  
  35   35  #include "lib.h"
  36   36  #include "allocate.h"
  37   37  #include "token.h"
  38   38  #include "symbol.h"
  39   39  
  40   40  #define EOF (-1)  /* returned by nextchar_slow() when the input is exhausted */
  41   41  
  42   42  int input_stream_nr = 0;  /* number of input_streams entries in use; set by init_stream() */
  43   43  struct stream *input_streams;  /* array of stream records, indexed by stream number */
  44   44  static int input_streams_allocated;  /* capacity of input_streams, in entries */
  45   45  unsigned int tabstop = 8;  /* tab width used by nextchar_slow() when advancing the column */
  46   46  int no_lineno = 0;  /* nonzero: stream_pos() reports line 123456 instead of the real line */
  47   47  
  48   48  #define BUFSIZE (8192)  /* byte count requested per read() in nextchar_slow() */
  49   49  
  50   50  typedef struct {
                   /* Reader state for one input stream while it is being tokenized. */
  51   51          int fd, offset, size;  /* fd to refill from (negative: none); next unread index and valid byte count of buffer */
  52   52          int pos, line, nr;  /* current column, current line, stream number */
  53   53          int newline, whitespace;  /* flags copied into each struct position by stream_pos() */
  54   54          struct token **tokenlist;  /* link through which add_token() stores the next token */
  55   55          struct token *token;  /* token currently being built; reset to NULL by add_token()/drop_token() */
  56   56          unsigned char *buffer;  /* raw input bytes; nextchar_slow() reads up to BUFSIZE into it */
  57   57  } stream_t;
  58   58  
  59   59  const char *stream_name(int stream)
  60   60  {
                   /*
                    * Return the name recorded for stream number 'stream', or
                    * "<bad stream>" when the number is out of range.
                    *
                    * init_stream() fills entry N and then sets input_stream_nr to
                    * N + 1, so the valid indices are 0 .. input_stream_nr - 1 and
                    * input_stream_nr itself has to be rejected too.
                    */
  61   61          if (stream < 0 || stream >= input_stream_nr)
  62   62                  return "<bad stream>";
  63   63          return input_streams[stream].name;
  64   64  }
  65   65  
  66   66  static struct position stream_pos(stream_t *stream)
  67   67  {
                   /*
                    * Build a struct position describing where 'stream' currently is:
                    * stream number, column, line and the pending newline/whitespace
                    * flags.  The type and noexpand fields are always set to 0.
                    */
  68   68          struct position pos;
  69   69          pos.type = 0;
  70   70          pos.stream = stream->nr;
  71   71          pos.newline = stream->newline;
  72   72          pos.whitespace = stream->whitespace;
  73   73          pos.pos = stream->pos;
  74   74  
  75   75          pos.line = stream->line;
                   /* no_lineno replaces every line number with a fixed placeholder */
  76   76          if (no_lineno)
  77   77                  pos.line = 123456;
  78   78  
  79   79          pos.noexpand = 0;
  80   80          return pos;
  81   81  }
  82   82  
  83   83  const char *show_special(int val)
  84   84  {
                   /*
                    * Return the spelling of special token value 'val'.  Values below
                    * SPECIAL_BASE are shown as the single character 'val'; values at
                    * or above it are copied from combinations[val - SPECIAL_BASE].
                    * The result lives in a static buffer that the next call overwrites.
                    * NOTE(review): 'val' is not range-checked against the size of
                    * combinations[] here - presumably callers guarantee it; confirm.
                    */
  85   85          static char buffer[4];

↓ open down ↓

85 lines elided

↑ open up ↑

  86   86  
  87   87          buffer[0] = val;
  88   88          buffer[1] = 0;
  89   89          if (val >= SPECIAL_BASE)
  90   90                  strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  91   91          return buffer;
  92   92  }
  93   93  
  94   94  const char *show_ident(const struct ident *ident)
  95   95  {
                   /*
                    * Return the name of 'ident' as a NUL-terminated string, or
                    * "<noident>" for a NULL ident.
                    *
                    * The old code formatted into one static buffer; the new code
                    * cycles through four static buffers (index 3 & ++n), so the
                    * results of up to four consecutive calls stay valid together.
                    * NOTE(review): sprintf() is unbounded, so this relies on
                    * ident->len being below 256 - TODO confirm against struct ident.
                    */
  96      -        static char buffer[256];
       96 +        static char buff[4][256];
       97 +        static int n;
       98 +        char *buffer;
       99 +
  97  100          if (!ident)
  98  101                  return "<noident>";
      102 +        buffer = buff[3 & ++n];
  99  103          sprintf(buffer, "%.*s", ident->len, ident->name);
 100  104          return buffer;
 101  105  }
 102  106  
 103  107  static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
 104  108  {
                   /*
                    * Write a printable form of 'c' at 'ptr' and return the position
                    * just past the characters written.
                    *
                    * Printable characters are copied as-is, preceded by a backslash
                    * when they equal 'escape' or are a backslash themselves.  Other
                    * characters become "\n", "\t" or an octal escape.  'next' is the
                    * character that will follow: when it is a digit the octal escape
                    * is padded to three digits, presumably so the following digit
                    * cannot be read as part of the escape.
                    */
 105  109          if (isprint(c)) {
 106  110                  if (c == escape || c == '\\')
 107  111                          *ptr++ = '\\';
 108  112                  *ptr++ = c;

 109  113                  return ptr;
 110  114          }
 111  115          *ptr++ = '\\';
 112  116          switch (c) {
 113  117          case '\n':
 114  118                  *ptr++ = 'n';
 115  119                  return ptr;
 116  120          case '\t':
 117  121                  *ptr++ = 't';
 118  122                  return ptr;
 119  123          }
                   /* sprintf() also stores a NUL, but the returned pointer addresses it */
 120  124          if (!isdigit(next))
 121  125                  return ptr + sprintf(ptr, "%o", c);

↓ open down ↓

13 lines elided

↑ open up ↑

 122  126                  
 123  127          return ptr + sprintf(ptr, "%03o", c);
 124  128  }
 125  129  
 126  130  const char *show_string(const struct string *string)
 127  131  {
                   /*
                    * Return 'string' as a double-quoted, escaped C string held in a
                    * static buffer (overwritten by the next call).  A zero length -
                    * and, in the new code, a NULL pointer - yields "<bad_string>".
                    *
                    * Only the first length-1 bytes are shown; eat_string() stores
                    * strings with the terminating NUL counted in 'length'.
                    */
 128  132          static char buffer[4 * MAX_STRING + 3];
 129  133          char *ptr;
 130  134          int i;
 131  135  
 132      -        if (!string->length)
      136 +        if (!string || !string->length)
 133  137                  return "<bad_string>";
 134  138          ptr = buffer;
 135  139          *ptr++ = '"';
 136  140          for (i = 0; i < string->length-1; i++) {
 137  141                  const char *p = string->data + i;
                           /* p[1] is passed so charstr() can see the following byte */
 138  142                  ptr = charstr(ptr, p[0], '"', p[1]);
 139  143          }
 140  144          *ptr++ = '"';
 141  145          *ptr = '\0';
 142  146          return buffer;

 143  147  }
 144  148  
 145  149  static const char *show_char(const char *s, size_t len, char prefix, char delim)
 146  150  {
                   /*
                    * Return the 'len' bytes at 's' wrapped in 'delim' characters and
                    * preceded by 'prefix' when that is nonzero.  The bytes are copied
                    * verbatim, without escaping.  The result is in a static buffer
                    * that the next call overwrites.
                    * NOTE(review): 'len' is not checked; the buffer has room for at
                    * most MAX_STRING bytes of content.
                    */
 147  151          static char buffer[MAX_STRING + 4];
 148  152          char *p = buffer;
 149  153          if (prefix)
 150  154                  *p++ = prefix;
 151  155          *p++ = delim;
 152  156          memcpy(p, s, len);
 153  157          p += len;
 154  158          *p++ = delim;
 155  159          *p++ = '\0';
 156  160          return buffer;
 157  161  }
 158  162  
 159  163  static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 160  164  {
                   /*
                    * Like show_char(), but with extra backslashes: every '"' and '\\'
                    * in the content gets a backslash in front, and when 'delim' is
                    * '"' both delimiters are written as \".  The result is in a
                    * static buffer that the next call overwrites.
                    */
 161  165          static char buffer[2*MAX_STRING + 6];
 162  166          size_t i;
 163  167          char *p = buffer;
 164  168          if (prefix)
 165  169                  *p++ = prefix;
 166  170          if (delim == '"')
 167  171                  *p++ = '\\';
 168  172          *p++ = delim;
 169  173          for (i = 0; i < len; i++) {
 170  174                  if (s[i] == '"' || s[i] == '\\')
 171  175                          *p++ = '\\';
 172  176                  *p++ = s[i];
 173  177          }
 174  178          if (delim == '"')
 175  179                  *p++ = '\\';
 176  180          *p++ = delim;
 177  181          *p++ = '\0';
 178  182          return buffer;
 179  183  }
 180  184  
 181  185  const char *show_token(const struct token *token)
 182  186  {
                   /*
                    * Return a human-readable spelling of 'token'.  A NULL token gives
                    * "<no token>".  The result is a string literal, the token's own
                    * number text, or a static buffer owned by this function or by
                    * one of the show_*() helpers; callers must not free it.
                    */
 183  187          static char buffer[256];
 184  188  
 185  189          if (!token)
 186  190                  return "<no token>";
 187  191          switch (token_type(token)) {
 188  192          case TOKEN_ERROR:
 189  193                  return "syntax error";
 190  194  
 191  195          case TOKEN_EOF:
 192  196                  return "end-of-input";
 193  197  
 194  198          case TOKEN_IDENT:
 195  199                  return show_ident(token->ident);
 196  200  
 197  201          case TOKEN_NUMBER:
 198  202                  return token->number;
 199  203  
 200  204          case TOKEN_SPECIAL:
 201  205                  return show_special(token->special);
 202  206  
                   /* string->length counts the trailing NUL, hence the "- 1" below */
 203  207          case TOKEN_CHAR: 
 204  208                  return show_char(token->string->data,
 205  209                          token->string->length - 1, 0, '\'');
                   /*
                    * Short character constants keep their bytes in token->embedded;
                    * eat_string() encodes the byte count as an offset from the base
                    * token type.  ("case A ... B" is the GNU case-range extension.)
                    */
 206  210          case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 207  211                  return show_char(token->embedded,
 208  212                          token_type(token) - TOKEN_CHAR, 0, '\'');
 209  213          case TOKEN_WIDE_CHAR: 
 210  214                  return show_char(token->string->data,
 211  215                          token->string->length - 1, 'L', '\'');
 212  216          case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 213  217                  return show_char(token->embedded,
 214  218                          token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 215  219          case TOKEN_STRING: 
 216  220                  return show_char(token->string->data,
 217  221                          token->string->length - 1, 0, '"');
 218  222          case TOKEN_WIDE_STRING: 
 219  223                  return show_char(token->string->data,
 220  224                          token->string->length - 1, 'L', '"');
 221  225  
 222  226          case TOKEN_STREAMBEGIN:
 223  227                  sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 224  228                  return buffer;
 225  229  
 226  230          case TOKEN_STREAMEND:
 227  231                  sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 228  232                  return buffer;
 229  233  
 230  234          case TOKEN_UNTAINT:
 231  235                  sprintf(buffer, "<untaint>");
 232  236                  return buffer;
 233  237  
 234  238          case TOKEN_ARG_COUNT:
 235  239                  sprintf(buffer, "<argcnt>");
 236  240                  return buffer;
 237  241  
 238  242          default:
 239  243                  sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 240  244                  return buffer;
 241  245          }
 242  246  }
 243  247  
 244  248  const char *quote_token(const struct token *token)
 245  249  {
                   /*
                    * Return the spelling of 'token' using quote_char() for character
                    * and string constants, i.e. with the extra backslashes that
                    * quote_char() adds.  Token types not listed fall through to the
                    * "unhandled token type" message.
                    * NOTE(review): unlike show_token() there is no NULL check before
                    * token_type(token) - presumably callers never pass NULL; confirm.
                    */
 246  250          static char buffer[256];
 247  251  
 248  252          switch (token_type(token)) {
 249  253          case TOKEN_ERROR:
 250  254                  return "syntax error";
 251  255  
 252  256          case TOKEN_IDENT:
 253  257                  return show_ident(token->ident);
 254  258  
 255  259          case TOKEN_NUMBER:
 256  260                  return token->number;
 257  261  
 258  262          case TOKEN_SPECIAL:
 259  263                  return show_special(token->special);
 260  264  
 261  265          case TOKEN_CHAR: 
 262  266                  return quote_char(token->string->data,
 263  267                          token->string->length - 1, 0, '\'');
                   /* embedded constants: byte count is the offset from the base type */
 264  268          case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 265  269                  return quote_char(token->embedded,
 266  270                          token_type(token) - TOKEN_CHAR, 0, '\'');
 267  271          case TOKEN_WIDE_CHAR: 
 268  272                  return quote_char(token->string->data,
 269  273                          token->string->length - 1, 'L', '\'');
 270  274          case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 271  275                  return quote_char(token->embedded,
 272  276                          token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 273  277          case TOKEN_STRING: 
 274  278                  return quote_char(token->string->data,
 275  279                          token->string->length - 1, 0, '"');
 276  280          case TOKEN_WIDE_STRING: 
 277  281                  return quote_char(token->string->data,
 278  282                          token->string->length - 1, 'L', '"');
 279  283          default:
 280  284                  sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 281  285                  return buffer;
 282  286          }
 283  287  }
 284  288  
 285  289  #define HASHED_INPUT_BITS (6)  /* number of hash bits kept by hash_stream() */
 286  290  #define HASHED_INPUT (1 << HASHED_INPUT_BITS)  /* number of hash buckets */
 287  291  #define HASH_PRIME 0x9e370001UL  /* multiplier applied by hash_stream() before taking the top bits */
 288  292  
           /* One slot per bucket holding a stream number; every slot starts at -1. */
 289  293  static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 290  294  
 291  295  int *hash_stream(const char *name)
 292  296  {
                   /*
                    * Hash the NUL-terminated 'name' and return a pointer to its slot
                    * in input_stream_hashes[].  init_stream() uses that slot as the
                    * head of a chain of stream numbers linked through next_stream.
                    */
 293  297          uint32_t hash = 0;
 294  298          unsigned char c;
 295  299  
 296  300          while ((c = *name++) != 0)
 297  301                  hash = (hash + (c << 4) + (c >> 4)) * 11;
 298  302  
                   /* keep the top HASHED_INPUT_BITS bits of the 32-bit product */
 299  303          hash *= HASH_PRIME;
 300  304          hash >>= 32 - HASHED_INPUT_BITS;
 301  305          return input_stream_hashes + hash;
 302  306  }
 303  307  
 304  308  int init_stream(const char *name, int fd, const char **next_path)
 305  309  {
                   /*
                    * Register a new input stream and return its stream number.
                    *
                    * The new entry is zeroed, then given 'name', 'fd' and 'next_path'
                    * (the pointers are stored, not copied), and is pushed on the front
                    * of the hash chain for 'name'.  input_stream_nr grows by one.
                    */
 306  310          int stream = input_stream_nr, *hash;
 307  311          struct stream *current;
 308  312  
                   /*
                    * Grow the array when full.  The realloc() result overwrites
                    * input_streams directly; failure goes straight to die(), which
                    * presumably does not return - confirm.
                    */
 309  313          if (stream >= input_streams_allocated) {
 310  314                  int newalloc = stream * 4 / 3 + 10;
 311  315                  input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 312  316                  if (!input_streams)
 313  317                          die("Unable to allocate more streams space");
 314  318                  input_streams_allocated = newalloc;
 315  319          }
 316  320          current = input_streams + stream;
 317  321          memset(current, 0, sizeof(*current));
 318  322          current->name = name;
 319  323          current->fd = fd;
 320  324          current->next_path = next_path;
 321  325          current->path = NULL;
 322  326          current->constant = CONSTANT_FILE_MAYBE;
 323  327          input_stream_nr = stream+1;
                   /* link in front of whatever stream number the bucket held before */
 324  328          hash = hash_stream(name);
 325  329          current->next_stream = *hash;
 326  330          *hash = stream;
 327  331          return stream;
 328  332  }
 329  333  
 330  334  static struct token * alloc_token(stream_t *stream)
 331  335  {
                   /* Allocate a token and stamp it with the stream's current position. */
 332  336          struct token *token = __alloc_token(0);
 333  337          token->pos = stream_pos(stream);
 334  338          return token;
 335  339  }
 336  340  
 337  341  /*
 338  342   *  Argh...  That was surprisingly messy - handling '\r' complicates
 339  343   *  things a _lot_.
 340  344   */
 341  345  static int nextchar_slow(stream_t *stream)
 342  346  {
                   /*
                    * Slow-path reader: return the next logical character of 'stream',
                    * or EOF.  It refills the buffer, turns "\r" and "\r\n" into '\n',
                    * removes backslash-newline pairs, and keeps stream->pos, line and
                    * newline up to date.
                    */
 343  347          int offset = stream->offset;
 344  348          int size = stream->size;
 345  349          int c;
 346  350          int spliced = 0, had_cr, had_backslash;
 347  351  
           /* restart: begin afresh after a backslash-newline pair was discarded */
 348  352  restart:
 349  353          had_cr = had_backslash = 0;
 350  354  
           /* repeat: fetch one more raw byte, refilling the buffer when it is empty */
 351  355  repeat:
 352  356          if (offset >= size) {
                           /* a negative fd means there is nothing to refill from */
 353  357                  if (stream->fd < 0)
 354  358                          goto got_eof;
 355  359                  size = read(stream->fd, stream->buffer, BUFSIZE);
 356  360                  if (size <= 0)
 357  361                          goto got_eof;
 358  362                  stream->size = size;
 359  363                  stream->offset = offset = 0;
 360  364          }
 361  365  
 362  366          c = stream->buffer[offset++];
                   /* the previous byte was '\r': decide what to do with this one */
 363  367          if (had_cr)
 364  368                  goto check_lf;
 365  369  
 366  370          if (c == '\r') {
 367  371                  had_cr = 1;
 368  372                  goto repeat;
 369  373          }
 370  374  
 371  375  norm:
 372  376          if (!had_backslash) {
 373  377                  switch (c) {
 374  378                  case '\t':
                                   /* advance to the next multiple of tabstop */
 375  379                          stream->pos += tabstop - stream->pos % tabstop;
 376  380                          break;
 377  381                  case '\n':
 378  382                          stream->line++;
 379  383                          stream->pos = 0;
 380  384                          stream->newline = 1;
 381  385                          break;
 382  386                  case '\\':
                                   /* look at the next byte before deciding what to return */
 383  387                          had_backslash = 1;
 384  388                          stream->pos++;
 385  389                          goto repeat;
 386  390                  default:
 387  391                          stream->pos++;
 388  392                  }
 389  393          } else {
                           /* backslash-newline: drop both and read on */
 390  394                  if (c == '\n') {
 391  395                          stream->line++;
 392  396                          stream->pos = 0;
 393  397                          spliced = 1;
 394  398                          goto restart;
 395  399                  }
                           /* plain backslash: put the byte back and return the backslash */
 396  400                  offset--;
 397  401                  c = '\\';
 398  402          }
 399  403  out:
 400  404          stream->offset = offset;
 401  405  
 402  406          return c;
 403  407  
           /* after '\r': consume a following '\n', push back anything else; yield '\n' */
 404  408  check_lf:
 405  409          if (c != '\n')
 406  410                  offset--;
 407  411          c = '\n';
 408  412          goto norm;
 409  413  
 410  414  got_eof:
                   /* a backslash that was the last byte of input is still returned */
 411  415          if (had_backslash) {
 412  416                  c = '\\';
 413  417                  goto out;
 414  418          }
                   /* a nonzero column means the last line was not newline-terminated */
 415  419          if (stream->pos)
 416  420                  warning(stream_pos(stream), "no newline at end of file");
 417  421          else if (spliced)
 418  422                  warning(stream_pos(stream), "backslash-newline at end of file");
 419  423          return EOF;
 420  424  }
 421  425  
 422  426  /*
 423  427   *  We want this to be as light as possible while covering all normal cases.
 424  428   *  The slow path (including the logic for line-splicing and EOF sanity
 425  429   *  checks) is in nextchar_slow().
 426  430   */
 427  431  static inline int nextchar(stream_t *stream)
 428  432  {
                   /*
                    * Return the next character of 'stream'.  When a byte is buffered
                    * and is not a tab, '\r', '\n' or backslash, it is consumed here
                    * and the column advances by one.  Every other case is handed to
                    * nextchar_slow() without consuming anything first.
                    */
 429  433          int offset = stream->offset;
 430  434  
 431  435          if (offset < stream->size) {
 432  436                  int c = stream->buffer[offset++];
 433  437                  static const char special[256] = {
 434  438                          ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 435  439                  };
 436  440                  if (!special[c]) {
 437  441                          stream->offset = offset;
 438  442                          stream->pos++;
 439  443                          return c;
 440  444                  }
 441  445          }

↓ open down ↓

299 lines elided

↑ open up ↑

 442  446          return nextchar_slow(stream);
 443  447  }
 444  448  
 445  449  struct token eof_token_entry;  /* shared list terminator; mark_eof() makes it point to itself */
 446  450  
 447  451  static struct token *mark_eof(stream_t *stream)
 448  452  {
                   /*
                    * Terminate the stream's token list: append a TOKEN_STREAMEND
                    * token at the current position, followed by the shared
                    * eof_token_entry, and clear stream->tokenlist.  Returns the
                    * TOKEN_STREAMEND token.
                    *
                    * The new code also copies the end position into eof_token_entry.
                    */
 449  453          struct token *end;
 450  454  
 451  455          end = alloc_token(stream);
      456 +        eof_token_entry.pos = end->pos;
 452  457          token_type(end) = TOKEN_STREAMEND;
 453  458          end->pos.newline = 1;
 454  459  
 455  460          eof_token_entry.next = &eof_token_entry;
 456  461          eof_token_entry.pos.newline = 1;
 457  462  
 458  463          end->next =  &eof_token_entry;
 459  464          *stream->tokenlist = end;
 460  465          stream->tokenlist = NULL;
 461  466          return end;

 462  467  }
 463  468  
 464  469  static void add_token(stream_t *stream)
 465  470  {
                   /*
                    * Append the token under construction to the stream's token list
                    * and clear stream->token.  tokenlist then points at the new
                    * token's 'next' field, ready for the following token.
                    */
 466  471          struct token *token = stream->token;
 467  472  
 468  473          stream->token = NULL;
 469  474          token->next = NULL;
 470  475          *stream->tokenlist = token;
 471  476          stream->tokenlist = &token->next;
 472  477  }
 473  478  
 474  479  static void drop_token(stream_t *stream)
 475  480  {
                   /*
                    * Discard the token under construction, but OR its newline and
                    * whitespace flags back into the stream so a later stream_pos()
                    * still reports them.
                    */
 476  481          stream->newline |= stream->token->pos.newline;
 477  482          stream->whitespace |= stream->token->pos.whitespace;
 478  483          stream->token = NULL;
 479  484  }
 480  485

↓ open down ↓

19 lines elided

↑ open up ↑

 481  486  enum {
                   /* Character-class bits; cclass[] stores an OR of these per character. */
 482  487          Letter = 1,
 483  488          Digit = 2,
 484  489          Hex = 4,
 485  490          Exp = 8,  /* exponent letters e/E/p/P: get_one_number() accepts a sign after them */
 486  491          Dot = 16,
 487  492          ValidSecond = 32,
 488  493          Quote = 64,
 489  494  };
 490  495  
           /*
            * Character-class table.  It is indexed with (character + 1), so that
            * EOF (-1) lands on entry 0, which carries no class bits.  The new code
            * narrows the element type from long to char and merges adjacent
            * ranges that have identical flags.
            */
 491      -static const long cclass[257] = {
 492      -        ['0' + 1 ... '7' + 1] = Digit | Hex,    /* \<octal> */
 493      -        ['8' + 1 ... '9' + 1] = Digit | Hex,
      496 +static const char cclass[257] = {
      497 +        ['0' + 1 ... '9' + 1] = Digit | Hex,
 494  498          ['A' + 1 ... 'D' + 1] = Letter | Hex,
 495  499          ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
 496  500          ['F' + 1] = Letter | Hex,
 497  501          ['G' + 1 ... 'O' + 1] = Letter,
 498  502          ['P' + 1] = Letter | Exp,       /* P<exp> */
 499  503          ['Q' + 1 ... 'Z' + 1] = Letter,
 500      -        ['a' + 1 ... 'b' + 1] = Letter | Hex, /* \a, \b */
 501      -        ['c' + 1 ... 'd' + 1] = Letter | Hex,
 502      -        ['e' + 1] = Letter | Hex | Exp,/* \e, e<exp> */
 503      -        ['f' + 1] = Letter | Hex,       /* \f */
 504      -        ['g' + 1 ... 'm' + 1] = Letter,
 505      -        ['n' + 1] = Letter,     /* \n */
 506      -        ['o' + 1] = Letter,
      504 +        ['a' + 1 ... 'd' + 1] = Letter | Hex,
      505 +        ['e' + 1] = Letter | Hex | Exp, /* e<exp> */
      506 +        ['f' + 1] = Letter | Hex,
      507 +        ['g' + 1 ... 'o' + 1] = Letter,
 507  508          ['p' + 1] = Letter | Exp,       /* p<exp> */
 508      -        ['q' + 1] = Letter,
 509      -        ['r' + 1] = Letter,     /* \r */
 510      -        ['s' + 1] = Letter,
 511      -        ['t' + 1] = Letter,     /* \t */
 512      -        ['u' + 1] = Letter,
 513      -        ['v' + 1] = Letter,     /* \v */
 514      -        ['w' + 1] = Letter,
 515      -        ['x' + 1] = Letter,     /* \x<hex> */
 516      -        ['y' + 1 ... 'z' + 1] = Letter,
      509 +        ['q' + 1 ... 'z' + 1] = Letter,
 517  510          ['_' + 1] = Letter,
 518  511          ['.' + 1] = Dot | ValidSecond,
 519  512          ['=' + 1] = ValidSecond,
 520  513          ['+' + 1] = ValidSecond,
 521  514          ['-' + 1] = ValidSecond,
 522  515          ['>' + 1] = ValidSecond,
 523  516          ['<' + 1] = ValidSecond,
 524  517          ['&' + 1] = ValidSecond,
 525  518          ['|' + 1] = ValidSecond,
 526  519          ['#' + 1] = ValidSecond,

 527  520          ['\'' + 1] = Quote,
 528  521          ['"' + 1] = Quote,
 529  522  };
 530  523  
 531  524  /*
 532  525   * pp-number:
 533  526   *      digit
 534  527   *      . digit
 535  528   *      pp-number digit
 536  529   *      pp-number identifier-nodigit

↓ open down ↓

10 lines elided

↑ open up ↑

 537  530   *      pp-number e sign
 538  531   *      pp-number E sign
 539  532   *      pp-number p sign
 540  533   *      pp-number P sign
 541  534   *      pp-number .
 542  535   */
 543  536  static int get_one_number(int c, int next, stream_t *stream)
 544  537  {
                   /*
                    * Collect a preprocessing number.  'c' is its first character and
                    * 'next' the character after it.  The text is stored in the token
                    * under construction, which becomes a TOKEN_NUMBER and is added
                    * to the token list.  Returns the first character that is not
                    * part of the number.
                    *
                    * The new code copies the text with xmemdup() (declared elsewhere,
                    * presumably an allocating copy) in place of the old
                    * __alloc_bytes() + memcpy() pair.
                    */
 545  538          struct token *token;
 546  539          static char buffer[4095];
 547      -        char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 548      -        int len;
      540 +        char *p = buffer, *buffer_end = buffer + sizeof (buffer);
 549  541  
 550  542          *p++ = c;
 551  543          for (;;) {
                           /* next + 1: the table is offset by one so EOF (-1) is index 0 */
 552  544                  long class =  cclass[next + 1];
 553  545                  if (!(class & (Dot | Digit | Letter)))
 554  546                          break;
                           /* once the buffer is full, further characters are read but dropped */
 555  547                  if (p != buffer_end)
 556  548                          *p++ = next;
 557  549                  next = nextchar(stream);
                           /* a sign is part of the number only directly after e/E/p/P */
 558  550                  if (class & Exp) {

 559  551                          if (next == '-' || next == '+') {
 560  552                                  if (p != buffer_end)
 561  553                                          *p++ = next;
 562  554                                  next = nextchar(stream);
 563  555                          }
 564  556                  }
 565  557          }

↓ open down ↓

7 lines elided

↑ open up ↑

 566  558  
                   /* a full buffer leaves no room for the terminator: report and substitute */
 567  559          if (p == buffer_end) {
 568  560                  sparse_error(stream_pos(stream), "number token exceeds %td characters",
 569  561                        buffer_end - buffer);
 570  562                  // Pretend we saw just "1".
 571  563                  buffer[0] = '1';
 572  564                  p = buffer + 1;
 573  565          }
 574  566  
 575  567          *p++ = 0;
 576      -        len = p - buffer;
 577      -        buf = __alloc_bytes(len);
 578      -        memcpy(buf, buffer, len);
 579      -
 580  568          token = stream->token;
 581  569          token_type(token) = TOKEN_NUMBER;
 582      -        token->number = buf;
      570 +        token->number = xmemdup(buffer, p - buffer);
 583  571          add_token(stream);
 584  572  
 585  573          return next;
 586  574  }
 587  575  
 588  576  static int eat_string(int next, stream_t *stream, enum token_type type)
 589  577  {
                   /*
                    * Read the body of a string or character constant.  'next' is the
                    * first character after the opening delimiter; 'type' is the
                    * token type to produce, and types below TOKEN_STRING use '\'' as
                    * the delimiter, the others '"'.
                    *
                    * Character constants of 1..4 bytes are stored in token->embedded
                    * with the byte count added to the token type.  Everything else
                    * is stored as an allocated struct string whose length counts the
                    * terminating NUL.  Returns the character after the constant, or
                    * EOF if the input ends inside it.
                    *
                    * The new code stops at a newline for both delimiters (the old
                    * code only did so for '\'') and changes the warning text.
                    */
 590  578          static char buffer[MAX_STRING];
 591  579          struct string *string;
 592  580          struct token *token = stream->token;

 593  581          int len = 0;

↓ open down ↓

1 lines elided

↑ open up ↑

 594  582          int escape;
 595  583          int want_hex = 0;
 596  584          char delim = type < TOKEN_STRING ? '\'' : '"';
 597  585  
 598  586          for (escape = 0; escape || next != delim; next = nextchar(stream)) {
                           /* len keeps counting past MAX_STRING so the overflow can be reported */
 599  587                  if (len < MAX_STRING)
 600  588                          buffer[len] = next;
 601  589                  len++;
                           /* the newline has already been stored and counted at this point */
 602  590                  if (next == '\n') {
 603  591                          warning(stream_pos(stream),
 604      -                                "Newline in string or character constant");
 605      -                        if (delim == '\'') /* assume it's lost ' */
 606      -                                break;
      592 +                                "missing terminating %c character", delim);
      593 +                        /* assume delimiter is lost */
      594 +                        break;
 607  595                  }
 608  596                  if (next == EOF) {
 609  597                          warning(stream_pos(stream),
 610  598                                  "End of file in middle of string");
 611  599                          return next;
 612  600                  }
                           /* want_hex is set by "\x" and checked against the character after it */
 613  601                  if (!escape) {
 614  602                          if (want_hex && !(cclass[next + 1] & Hex))
 615  603                                  warning(stream_pos(stream),
 616  604                                          "\\x used with no following hex digits");

 617  605                          want_hex = 0;
 618  606                          escape = next == '\\';
 619  607                  } else {
 620  608                          escape = 0;
 621  609                          want_hex = next == 'x';
 622  610                  }
 623  611          }
 624  612          if (want_hex)
 625  613                  warning(stream_pos(stream),
 626  614                          "\\x used with no following hex digits");
 627  615          if (len > MAX_STRING) {
 628  616                  warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 629  617                  len = MAX_STRING;
 630  618          }
 631  619          if (delim == '\'' && len <= 4) {
 632  620                  if (len == 0) {
 633  621                          sparse_error(stream_pos(stream),
 634  622                                  "empty character constant");
 635  623                          return nextchar(stream);
 636  624                  }
                           /* short constant: encode the length in the type, zero-pad to 4 bytes */
 637  625                  token_type(token) = type + len;
 638  626                  memset(buffer + len, '\0', 4 - len);
 639  627                  memcpy(token->embedded, buffer, 4);
 640  628          } else {
 641  629                  token_type(token) = type;
 642  630                  string = __alloc_string(len+1);
 643  631                  memcpy(string->data, buffer, len);
 644  632                  string->data[len] = '\0';
 645  633                  string->length = len+1;
 646  634                  token->string = string;
 647  635          }
 648  636  
 649  637          /* Pass it on.. */
                   /* NOTE(review): this assignment looks dead - 'token' is not read afterwards */
 650  638          token = stream->token;
 651  639          add_token(stream);
 652  640          return nextchar(stream);
 653  641  }
 654  642  
 655  643  static int drop_stream_eoln(stream_t *stream)
 656  644  {
                   /*
                    * Discard the token under construction and skip the rest of the
                    * current line.  Returns the first character after the newline,
                    * or EOF if the input ends first.
                    */
 657  645          drop_token(stream);
 658  646          for (;;) {
 659  647                  switch (nextchar(stream)) {
 660  648                  case EOF:
 661  649                          return EOF;
 662  650                  case '\n':
 663  651                          return nextchar(stream);
 664  652                  }
 665  653          }
 666  654  }
 667  655  
 668  656  static int drop_stream_comment(stream_t *stream)
 669  657  {
                   /*
                    * Discard the token under construction and skip input up to and
                    * including the next "*" "/" pair.  Returns the character after
                    * it, or EOF (with a warning) if the input ends first.
                    *
                    * stream->newline is saved before and restored after the scan, so
                    * newlines read while skipping do not leave the flag set.
                    */
 670  658          int newline;
 671  659          int next;
 672  660          drop_token(stream);
 673  661          newline = stream->newline;
 674  662  
 675  663          next = nextchar(stream);
 676  664          for (;;) {
 677  665                  int curr = next;
 678  666                  if (curr == EOF) {
 679  667                          warning(stream_pos(stream), "End of file in the middle of a comment");
 680  668                          return curr;
 681  669                  }
 682  670                  next = nextchar(stream);
 683  671                  if (curr == '*' && next == '/')
 684  672                          break;
 685  673          }
 686  674          stream->newline = newline;
 687  675          return nextchar(stream);
 688  676  }
 689  677  
 690  678  unsigned char combinations[][4] = COMBINATION_STRINGS;  /* spellings indexed by (value - SPECIAL_BASE); read by show_special() */
 691  679  
 692  680  #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 693  681  
 694  682  /* hash function for two-character punctuators - all give unique values */
           /*
            * The result is masked to 0..31.  NOTE(review): the macro arguments are
            * not parenthesized, so only simple operands are safe to pass.
            */
 695  683  #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 696  684  
 697  685  /*
 698  686   * note that we won't get false positives - special_hash(0,0) is 0 and
 699  687   * entry 0 is filled (by +=), so all the missing ones are OK.
 700  688   */
           /*
            * For each special_hash() slot, the two characters that belong there.
            * Slots without an entry stay {0, 0}.  Presumably the lookup compares a
            * candidate pair against its slot here - the user is outside this view.
            */
 701  689  static unsigned char hash_results[32][2] = {
 702  690  #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 703  691          RES('+', '='), /* 00 */
 704  692          RES('/', '='), /* 01 */
 705  693          RES('^', '='), /* 05 */
 706  694          RES('&', '&'), /* 07 */
 707  695          RES('#', '#'), /* 08 */
 708  696          RES('<', '<'), /* 0a */
 709  697          RES('<', '='), /* 0c */
 710  698          RES('!', '='), /* 0e */
 711  699          RES('%', '='), /* 0f */
 712  700          RES('-', '-'), /* 10 */
 713  701          RES('-', '='), /* 11 */
 714  702          RES('-', '>'), /* 13 */
 715  703          RES('=', '='), /* 15 */
 716  704          RES('&', '='), /* 17 */
 717  705          RES('*', '='), /* 18 */
 718  706          RES('.', '.'), /* 1a */
 719  707          RES('+', '+'), /* 1b */
 720  708          RES('|', '='), /* 1c */
 721  709          RES('>', '='), /* 1d */
 722  710          RES('|', '|'), /* 1e */
 723  711          RES('>', '>')  /* 1f */
 724  712  #undef RES
 725  713  };
           /*
            * For each special_hash() slot, the SPECIAL_* token value of the
            * two-character punctuator stored in the same slot of hash_results[].
            */
 726  714  static int code[32] = {
 727  715  #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 728  716          CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 729  717          CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 730  718          CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 731  719          CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 732  720          CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 733  721          CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 734  722          CODE('<', '=', SPECIAL_LTE), /* 0c */
 735  723          CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 736  724          CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 737  725          CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 738  726          CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 739  727          CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 740  728          CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 741  729          CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 742  730          CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 743  731          CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 744  732          CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 745  733          CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 746  734          CODE('>', '=', SPECIAL_GTE), /* 1d */
 747  735          CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 748  736          CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 749  737  #undef CODE
 750  738  };
 751  739  
 752  740  static int get_one_special(int c, stream_t *stream)
 753  741  {
 754  742          struct token *token;
 755  743          int next, value, i;
 756  744  
 757  745          next = nextchar(stream);
 758  746  
 759  747          /*
 760  748           * Check for numbers, strings, character constants, and comments
 761  749           */
 762  750          switch (c) {
 763  751          case '.':
 764  752                  if (next >= '0' && next <= '9')
 765  753                          return get_one_number(c, next, stream);
 766  754                  break;
 767  755          case '"':
 768  756                  return eat_string(next, stream, TOKEN_STRING);
 769  757          case '\'':
 770  758                  return eat_string(next, stream, TOKEN_CHAR);
 771  759          case '/':
 772  760                  if (next == '/')
 773  761                          return drop_stream_eoln(stream);
 774  762                  if (next == '*')
 775  763                          return drop_stream_comment(stream);
 776  764          }
 777  765  
 778  766          /*
 779  767           * Check for combinations
 780  768           */
 781  769          value = c;
 782  770          if (cclass[next + 1] & ValidSecond) {
 783  771                  i = special_hash(c, next);
 784  772                  if (hash_results[i][0] == c && hash_results[i][1] == next) {
 785  773                          value = code[i];
 786  774                          next = nextchar(stream);
 787  775                          if (value >= SPECIAL_LEFTSHIFT &&
 788  776                              next == "==."[value - SPECIAL_LEFTSHIFT]) {
 789  777                                  value += 3;
 790  778                                  next = nextchar(stream);
 791  779                          }
 792  780                  }
 793  781          }
 794  782  
 795  783          /* Pass it on.. */
 796  784          token = stream->token;
 797  785          token_type(token) = TOKEN_SPECIAL;
 798  786          token->special = value;
 799  787          add_token(stream);
 800  788          return next;
 801  789  }
 802  790  
 803  791  #define IDENT_HASH_BITS (13)
 804  792  #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 805  793  #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 806  794  
 807  795  #define ident_hash_init(c)              (c)
 808  796  #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 809  797  #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 810  798  
 811  799  static struct ident *hash_table[IDENT_HASH_SIZE];
 812  800  static int ident_hit, ident_miss, idents;
 813  801  
 814  802  void show_identifier_stats(void)
 815  803  {
 816  804          int i;
 817  805          int distribution[100];
 818  806  
 819  807          fprintf(stderr, "identifiers: %d hits, %d misses\n",
 820  808                  ident_hit, ident_miss);
 821  809  
 822  810          for (i = 0; i < 100; i++)
 823  811                  distribution[i] = 0;
 824  812  
 825  813          for (i = 0; i < IDENT_HASH_SIZE; i++) {
 826  814                  struct ident * ident = hash_table[i];
 827  815                  int count = 0;
 828  816  
 829  817                  while (ident) {
 830  818                          count++;
 831  819                          ident = ident->next;
 832  820                  }
 833  821                  if (count > 99)
 834  822                          count = 99;
 835  823                  distribution[count]++;
 836  824          }
 837  825  
 838  826          for (i = 0; i < 100; i++) {
 839  827                  if (distribution[i])
 840  828                          fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 841  829          }
 842  830  }
 843  831  
 844  832  static struct ident *alloc_ident(const char *name, int len)
 845  833  {
 846  834          struct ident *ident = __alloc_ident(len);
 847  835          ident->symbols = NULL;
 848  836          ident->len = len;
 849  837          ident->tainted = 0;
 850  838          memcpy(ident->name, name, len);
 851  839          return ident;
 852  840  }
 853  841  
 854  842  static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 855  843  {
 856  844          ident->next = hash_table[hash];
 857  845          hash_table[hash] = ident;
 858  846          ident_miss++;
 859  847          return ident;
 860  848  }
 861  849  
 862  850  static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 863  851  {
 864  852          struct ident *ident;
 865  853          struct ident **p;
 866  854  
 867  855          p = &hash_table[hash];
 868  856          while ((ident = *p) != NULL) {
 869  857                  if (ident->len == (unsigned char) len) {
 870  858                          if (strncmp(name, ident->name, len) != 0)
 871  859                                  goto next;
 872  860  
 873  861                          ident_hit++;
 874  862                          return ident;
 875  863                  }
 876  864  next:
 877  865                  //misses++;
 878  866                  p = &ident->next;
 879  867          }
 880  868          ident = alloc_ident(name, len);
 881  869          *p = ident;
 882  870          ident->next = NULL;
 883  871          ident_miss++;
 884  872          idents++;
 885  873          return ident;
 886  874  }
 887  875  
 888  876  static unsigned long hash_name(const char *name, int len)
 889  877  {
 890  878          unsigned long hash;
 891  879          const unsigned char *p = (const unsigned char *)name;
 892  880  
 893  881          hash = ident_hash_init(*p++);
 894  882          while (--len) {
 895  883                  unsigned int i = *p++;
 896  884                  hash = ident_hash_add(hash, i);
 897  885          }
 898  886          return ident_hash_end(hash);
 899  887  }
 900  888  
 901  889  struct ident *hash_ident(struct ident *ident)
 902  890  {
 903  891          return insert_hash(ident, hash_name(ident->name, ident->len));
 904  892  }
 905  893  
 906  894  struct ident *built_in_ident(const char *name)
 907  895  {
 908  896          int len = strlen(name);
 909  897          return create_hashed_ident(name, len, hash_name(name, len));
 910  898  }
 911  899  
 912  900  struct token *built_in_token(int stream, struct ident *ident)
 913  901  {
 914  902          struct token *token;
 915  903  
 916  904          token = __alloc_token(0);
 917  905          token->pos.stream = stream;
 918  906          token_type(token) = TOKEN_IDENT;
 919  907          token->ident = ident;
 920  908          return token;
 921  909  }
 922  910  
 923  911  static int get_one_identifier(int c, stream_t *stream)
 924  912  {
 925  913          struct token *token;
 926  914          struct ident *ident;
 927  915          unsigned long hash;
 928  916          char buf[256];
 929  917          int len = 1;
 930  918          int next;
 931  919  
 932  920          hash = ident_hash_init(c);
 933  921          buf[0] = c;
 934  922          for (;;) {
 935  923                  next = nextchar(stream);
 936  924                  if (!(cclass[next + 1] & (Letter | Digit)))
 937  925                          break;
 938  926                  if (len >= sizeof(buf))
 939  927                          break;
 940  928                  hash = ident_hash_add(hash, next);
 941  929                  buf[len] = next;
 942  930                  len++;
 943  931          };
 944  932          if (cclass[next + 1] & Quote) {
 945  933                  if (len == 1 && buf[0] == 'L') {
 946  934                          if (next == '\'')
 947  935                                  return eat_string(nextchar(stream), stream,
 948  936                                                          TOKEN_WIDE_CHAR);
 949  937                          else
 950  938                                  return eat_string(nextchar(stream), stream,
 951  939                                                          TOKEN_WIDE_STRING);
 952  940                  }
 953  941          }
 954  942          hash = ident_hash_end(hash);
 955  943          ident = create_hashed_ident(buf, len, hash);
 956  944  
 957  945          /* Pass it on.. */
 958  946          token = stream->token;
 959  947          token_type(token) = TOKEN_IDENT;
 960  948          token->ident = ident;
 961  949          add_token(stream);
 962  950          return next;
 963  951  }               
 964  952  
 965  953  static int get_one_token(int c, stream_t *stream)
 966  954  {
 967  955          long class = cclass[c + 1];
 968  956          if (class & Digit)
 969  957                  return get_one_number(c, nextchar(stream), stream);
 970  958          if (class & Letter)
 971  959                  return get_one_identifier(c, stream);
 972  960          return get_one_special(c, stream);
 973  961  }
 974  962  
 975  963  static struct token *setup_stream(stream_t *stream, int idx, int fd,
 976  964          unsigned char *buf, unsigned int buf_size)
 977  965  {
 978  966          struct token *begin;
 979  967  
 980  968          stream->nr = idx;
 981  969          stream->line = 1;
 982  970          stream->newline = 1;
 983  971          stream->whitespace = 0;
 984  972          stream->pos = 0;
 985  973  
 986  974          stream->token = NULL;
 987  975          stream->fd = fd;
 988  976          stream->offset = 0;
 989  977          stream->size = buf_size;
 990  978          stream->buffer = buf;
 991  979  
 992  980          begin = alloc_token(stream);
 993  981          token_type(begin) = TOKEN_STREAMBEGIN;
 994  982          stream->tokenlist = &begin->next;
 995  983          return begin;
 996  984  }
 997  985  
 998  986  static struct token *tokenize_stream(stream_t *stream)
 999  987  {
1000  988          int c = nextchar(stream);
1001  989          while (c != EOF) {
1002  990                  if (!isspace(c)) {
1003  991                          struct token *token = alloc_token(stream);
1004  992                          stream->token = token;
1005  993                          stream->newline = 0;
1006  994                          stream->whitespace = 0;
1007  995                          c = get_one_token(c, stream);
1008  996                          continue;
1009  997                  }
1010  998                  stream->whitespace = 1;
1011  999                  c = nextchar(stream);
1012 1000          }
1013 1001          return mark_eof(stream);
1014 1002  }
1015 1003  
1016 1004  struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1017 1005  {
1018 1006          stream_t stream;
1019 1007          struct token *begin;
1020 1008  
1021 1009          begin = setup_stream(&stream, 0, -1, buffer, size);
1022 1010          *endtoken = tokenize_stream(&stream);
1023 1011          return begin;
1024 1012  }
1025 1013  
1026 1014  struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1027 1015  {
1028 1016          struct token *begin, *end;
1029 1017          stream_t stream;
1030 1018          unsigned char buffer[BUFSIZE];
1031 1019          int idx;
1032 1020  
1033 1021          idx = init_stream(name, fd, next_path);
1034 1022          if (idx < 0) {
1035 1023                  // info(endtoken->pos, "File %s is const", name);
1036 1024                  return endtoken;
1037 1025          }
1038 1026  
1039 1027          begin = setup_stream(&stream, idx, fd, buffer, 0);
1040 1028          end = tokenize_stream(&stream);
1041 1029          if (endtoken)
1042 1030                  end->next = endtoken;
1043 1031          return begin;
1044 1032  }

↓ open down ↓

428 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX