1 /*
   2  * Copyright (C) Lucent Technologies 1997
   3  * All Rights Reserved
   4  *
   5  * Permission to use, copy, modify, and distribute this software and
   6  * its documentation for any purpose and without fee is hereby
   7  * granted, provided that the above copyright notice appear in all
   8  * copies and that both that the copyright notice and this
   9  * permission notice and warranty disclaimer appear in supporting
  10  * documentation, and that the name Lucent Technologies or any of
  11  * its entities not be used in advertising or publicity pertaining
  12  * to distribution of the software without specific, written prior
  13  * permission.
  14  *
  15  * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  16  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  17  * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  20  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  21  * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  22  * THIS SOFTWARE.
  23  */
  24 
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <ctype.h>
  29 #include "awk.h"
  30 #include "y.tab.h"
  31 
/* Objects declared extern here and used by the lexer below. */
extern YYSTYPE  yylval;         /* value that accompanies a returned token */
extern int      infunc;         /* tested as "inside a function" by word() */

off_t   lineno  = 1;            /* bumped by input() on '\n', undone by unput() */
int     bracecnt = 0;           /* '{' seen minus '}' seen */
int     brackcnt  = 0;          /* '[' seen minus ']' seen */
int     parencnt = 0;           /* '(' seen minus ')' seen */
  39 
/*
 * One entry of the reserved-word table searched by binsearch().
 */
typedef struct Keyword {
        const char *word;       /* spelling; the key compared with strcmp() */
        int     sub;            /* value word() stores in yylval.i */
        int     type;           /* token type word() returns */
} Keyword;
  45 
/*
 * Reserved words and built-in function names.
 *
 * word() looks identifiers up here through binsearch(), which compares
 * with strcmp(); the entries must therefore stay in strcmp() order (in
 * ASCII the upper-case names sort before the lower-case ones).
 *
 * Entries whose type is BLTIN share one token type and are told apart
 * by their sub value.
 */
Keyword keywords[] = {  /* keep sorted: binary searched */
        { "BEGIN",      XBEGIN,         XBEGIN },
        { "END",        XEND,           XEND },
        { "NF",         VARNF,          VARNF },
        { "atan2",      FATAN,          BLTIN },
        { "break",      BREAK,          BREAK },
        { "close",      CLOSE,          CLOSE },
        { "continue",   CONTINUE,       CONTINUE },
        { "cos",        FCOS,           BLTIN },
        { "delete",     DELETE,         DELETE },
        { "do",         DO,             DO },
        { "else",       ELSE,           ELSE },
        { "exit",       EXIT,           EXIT },
        { "exp",        FEXP,           BLTIN },
        { "fflush",     FFLUSH,         BLTIN },
        { "for",        FOR,            FOR },
        { "func",       FUNC,           FUNC },
        { "function",   FUNC,           FUNC },
        { "getline",    GETLINE,        GETLINE },
        { "gsub",       GSUB,           GSUB },
        { "if",         IF,             IF },
        { "in",         IN,             IN },
        { "index",      INDEX,          INDEX },
        { "int",        FINT,           BLTIN },
        { "length",     FLENGTH,        BLTIN },
        { "log",        FLOG,           BLTIN },
        { "match",      MATCHFCN,       MATCHFCN },
        { "next",       NEXT,           NEXT },
        { "nextfile",   NEXTFILE,       NEXTFILE },
        { "print",      PRINT,          PRINT },
        { "printf",     PRINTF,         PRINTF },
        { "rand",       FRAND,          BLTIN },
        { "return",     RETURN,         RETURN },
        { "sin",        FSIN,           BLTIN },
        { "split",      SPLIT,          SPLIT },
        { "sprintf",    SPRINTF,        SPRINTF },
        { "sqrt",       FSQRT,          BLTIN },
        { "srand",      FSRAND,         BLTIN },
        { "sub",        SUB,            SUB },
        { "substr",     SUBSTR,         SUBSTR },
        { "system",     FSYSTEM,        BLTIN },
        { "tolower",    FTOLOWER,       BLTIN },
        { "toupper",    FTOUPPER,       BLTIN },
        { "while",      WHILE,          WHILE },
};
  91 
  92 #define RET(x)  { if (dbg) (void) printf("lex %s\n", tokname(x)); return (x); }
  93 
  94 static int
  95 peek(void)
  96 {
  97         int c = input();
  98         unput(c);
  99         return (c);
 100 }
 101 
 102 static int
 103 gettok(uchar **pbuf, int *psz)  /* get next input token */
 104 {
 105         int c, retc;
 106         uchar *buf = *pbuf;
 107         size_t sz = *psz;
 108         uchar *bp = buf;
 109 
 110         c = input();
 111         if (c == 0)
 112                 return (0);
 113         buf[0] = c;
 114         buf[1] = 0;
 115         if (!isalnum(c) && c != '.' && c != '_')
 116                 return (c);
 117 
 118         *bp++ = c;
 119         if (isalpha(c) || c == '_') {   /* it's a varname */
 120                 for (; (c = input()) != 0; ) {
 121                         if (bp-buf >= sz)
 122                                 if (!adjbuf(&buf, &sz, bp - buf + 2, 100,
 123                                     &bp, "gettok"))
 124                                         FATAL(
 125                                 "out of space for name %.10s...", buf);
 126                         if (isalnum(c) || c == '_')
 127                                 *bp++ = c;
 128                         else {
 129                                 *bp = 0;
 130                                 unput(c);
 131                                 break;
 132                         }
 133                 }
 134                 *bp = 0;
 135                 retc = 'a';     /* alphanumeric */
 136         } else {        /* maybe it's a number, but could be . */
 137                 char *rem;
 138                 /* read input until can't be a number */
 139                 for (; (c = input()) != 0; ) {
 140                         if (bp-buf >= sz)
 141                                 if (!adjbuf(&buf, &sz, bp - buf + 2, 100,
 142                                     &bp, "gettok"))
 143                                         FATAL(
 144                                 "out of space for number %.10s...", buf);
 145                         if (isdigit(c) || c == 'e' || c == 'E' ||
 146                             c == '.' || c == '+' || c == '-')
 147                                 *bp++ = c;
 148                         else {
 149                                 unput(c);
 150                                 break;
 151                         }
 152                 }
 153                 *bp = 0;
 154                 (void) strtod((char *)buf, &rem);   /* parse the number */
 155                 /* it wasn't a valid number at all */
 156                 if (rem == (char *)buf) {
 157                         buf[1] = 0;     /* return one character as token */
 158                         retc = buf[0];  /* character is its own type */
 159                         unputstr(rem+1); /* put rest back for later */
 160                 } else {        /* some prefix was a number */
 161                         unputstr(rem);  /* put rest back for later */
 162                         rem[0] = 0;     /* truncate buf after number part */
 163                         retc = '0';     /* type is number */
 164                 }
 165         }
 166         *pbuf = buf;
 167         *psz = sz;
 168         return (retc);
 169 }
 170 
/* Prototypes for helpers that yylex() calls. */
int     word(char *);
int     string(void);
int     regexpr(void);
/* One-shot flags tested and cleared at the top of yylex(). */
int     sc      = 0;    /* 1 => return a } right now */
int     reg     = 0;    /* 1 => return a REGEXPR now */
 176 
 177 int
 178 yylex(void)
 179 {
 180         int c;
 181         static uchar *buf = 0;
 182         static int bufsize = 5; /* BUG: setting this small causes core dump! */
 183 
 184         if (buf == 0 && (buf = (uchar *)malloc(bufsize)) == NULL)
 185                 FATAL("out of space in yylex");
 186         if (sc) {
 187                 sc = 0;
 188                 RET('}');
 189         }
 190         if (reg) {
 191                 reg = 0;
 192                 return (regexpr());
 193         }
 194         for (;;) {
 195                 c = gettok(&buf, &bufsize);
 196                 if (c == 0)
 197                         return (0);
 198                 if (isalpha(c) || c == '_')
 199                         return (word((char *)buf));
 200                 if (isdigit(c)) {
 201                         yylval.cp = setsymtab(buf, tostring(buf),
 202                             atof((char *)buf), CON|NUM, symtab);
 203                         /* should this also have STR set? */
 204                         RET(NUMBER);
 205                 }
 206 
 207                 yylval.i = c;
 208                 switch (c) {
 209                 case '\n':      /* {EOL} */
 210                         RET(NL);
 211                 case '\r':      /* assume \n is coming */
 212                 case ' ':       /* {WS}+ */
 213                 case '\t':
 214                         break;
 215                 case '#':       /* #.* strip comments */
 216                         while ((c = input()) != '\n' && c != 0)
 217                                 ;
 218                         unput(c);
 219                         break;
 220                 case ';':
 221                         RET(';');
 222                 case '\\':
 223                         if (peek() == '\n') {
 224                                 (void) input();
 225                         } else if (peek() == '\r') {
 226                                 (void) input();
 227                                 (void) input(); /* \n */
 228                                 lineno++;
 229                         } else {
 230                                 RET(c);
 231                         }
 232                         break;
 233                 case '&':
 234                         if (peek() == '&') {
 235                                 (void) input(); RET(AND);
 236                         } else
 237                                 RET('&');
 238                 case '|':
 239                         if (peek() == '|') {
 240                                 (void) input(); RET(BOR);
 241                         } else
 242                                 RET('|');
 243                 case '!':
 244                         if (peek() == '=') {
 245                                 (void) input(); yylval.i = NE; RET(NE);
 246                         } else if (peek() == '~') {
 247                                 (void) input(); yylval.i = NOTMATCH;
 248                                 RET(MATCHOP);
 249                         } else
 250                                 RET(NOT);
 251                 case '~':
 252                         yylval.i = MATCH;
 253                         RET(MATCHOP);
 254                 case '<':
 255                         if (peek() == '=') {
 256                                 (void) input(); yylval.i = LE; RET(LE);
 257                         } else {
 258                                 yylval.i = LT; RET(LT);
 259                         }
 260                 case '=':
 261                         if (peek() == '=') {
 262                                 (void) input(); yylval.i = EQ; RET(EQ);
 263                         } else {
 264                                 yylval.i = ASSIGN; RET(ASGNOP);
 265                         }
 266                 case '>':
 267                         if (peek() == '=') {
 268                                 (void) input(); yylval.i = GE; RET(GE);
 269                         } else if (peek() == '>') {
 270                                 (void) input(); yylval.i = APPEND; RET(APPEND);
 271                         } else {
 272                                 yylval.i = GT; RET(GT);
 273                         }
 274                 case '+':
 275                         if (peek() == '+') {
 276                                 (void) input(); yylval.i = INCR; RET(INCR);
 277                         } else if (peek() == '=') {
 278                                 (void) input(); yylval.i = ADDEQ; RET(ASGNOP);
 279                         } else
 280                                 RET('+');
 281                 case '-':
 282                         if (peek() == '-') {
 283                                 (void) input(); yylval.i = DECR; RET(DECR);
 284                         } else if (peek() == '=') {
 285                                 (void) input(); yylval.i = SUBEQ; RET(ASGNOP);
 286                         } else
 287                                 RET('-');
 288                 case '*':
 289                         if (peek() == '=') {    /* *= */
 290                                 (void) input(); yylval.i = MULTEQ; RET(ASGNOP);
 291                         } else if (peek() == '*') {     /* ** or **= */
 292                                 (void) input(); /* eat 2nd * */
 293                                 if (peek() == '=') {
 294                                         (void) input(); yylval.i = POWEQ;
 295                                         RET(ASGNOP);
 296                                 } else {
 297                                         RET(POWER);
 298                                 }
 299                         } else
 300                                 RET('*');
 301                 case '/':
 302                         RET('/');
 303                 case '%':
 304                         if (peek() == '=') {
 305                                 (void) input(); yylval.i = MODEQ; RET(ASGNOP);
 306                         } else
 307                                 RET('%');
 308                 case '^':
 309                         if (peek() == '=') {
 310                                 (void) input(); yylval.i = POWEQ; RET(ASGNOP);
 311                         } else
 312                                 RET(POWER);
 313 
 314                 case '$':
 315                         /* BUG: awkward, if not wrong */
 316                         c = gettok(&buf, &bufsize);
 317                         if (isalpha(c)) {
 318                                 /* very special */
 319                                 if (strcmp((char *)buf, "NF") == 0) {
 320                                         unputstr("(NF)");
 321                                         RET(INDIRECT);
 322                                 }
 323                                 c = peek();
 324                                 if (c == '(' || c == '[' ||
 325                                     (infunc && isarg(buf) >= 0)) {
 326                                         unputstr((char *)buf);
 327                                         RET(INDIRECT);
 328                                 }
 329                                 yylval.cp = setsymtab(buf, (uchar *)"", 0.0,
 330                                     STR | NUM, symtab);
 331                                 RET(IVAR);
 332                         } else if (c == 0) {    /*  */
 333                                 SYNTAX("unexpected end of input after $");
 334                                 RET(';');
 335                         } else {
 336                                 unputstr((char *)buf);
 337                                 RET(INDIRECT);
 338                         }
 339 
 340                 case '}':
 341                         if (--bracecnt < 0)
 342                                 SYNTAX("extra }");
 343                         sc = 1;
 344                         RET(';');
 345                 case ']':
 346                         if (--brackcnt < 0)
 347                                 SYNTAX("extra ]");
 348                         RET(']');
 349                 case ')':
 350                         if (--parencnt < 0)
 351                                 SYNTAX("extra )");
 352                         RET(')');
 353                 case '{':
 354                         bracecnt++;
 355                         RET('{');
 356                 case '[':
 357                         brackcnt++;
 358                         RET('[');
 359                 case '(':
 360                         parencnt++;
 361                         RET('(');
 362 
 363                 case '"':
 364                         /* BUG: should be like tran.c ? */
 365                         return (string());
 366 
 367                 default:
 368                         RET(c);
 369                 }
 370         }
 371 }
 372 
 373 int
 374 string(void)
 375 {
 376         int c, n;
 377         uchar *s, *bp;
 378         static uchar *buf = NULL;
 379         static size_t bufsz = 500;
 380 
 381         if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
 382                 FATAL("out of space for strings");
 383         for (bp = buf; (c = input()) != '"'; ) {
 384                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
 385                         FATAL("out of space for string %.10s...", buf);
 386                 switch (c) {
 387                 case '\n':
 388                 case '\r':
 389                 case 0:
 390                         SYNTAX("non-terminated string %.10s...", buf);
 391                         lineno++;
 392                         if (c == 0)     /* hopeless */
 393                                 FATAL("giving up");
 394                         break;
 395                 case '\\':
 396                         c = input();
 397                         switch (c) {
 398                         case '"': *bp++ = '"'; break;
 399                         case 'n': *bp++ = '\n'; break;
 400                         case 't': *bp++ = '\t'; break;
 401                         case 'f': *bp++ = '\f'; break;
 402                         case 'r': *bp++ = '\r'; break;
 403                         case 'b': *bp++ = '\b'; break;
 404                         case 'v': *bp++ = '\v'; break;
 405                         case 'a': *bp++ = '\007'; break;
 406                         case '\\': *bp++ = '\\'; break;
 407 
 408                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
 409                         case '3': case '4': case '5': case '6': case '7':
 410                                 n = c - '0';
 411                                 if ((c = peek()) >= '0' && c < '8') {
 412                                         n = 8 * n + input() - '0';
 413                                         if ((c = peek()) >= '0' && c < '8')
 414                                                 n = 8 * n + input() - '0';
 415                                 }
 416                                 *bp++ = n;
 417                                 break;
 418 
 419                         case 'x': {     /* hex  \x0-9a-fA-F + */
 420                                 char xbuf[100], *px;
 421                 for (px = xbuf; (c = input()) != 0 && px - xbuf < 100 - 2; ) {
 422                                         if (isdigit(c) ||
 423                                             (c >= 'a' && c <= 'f') ||
 424                                             (c >= 'A' && c <= 'F'))
 425                                                 *px++ = c;
 426                                         else
 427                                                 break;
 428                                 }
 429                                 *px = 0;
 430                                 unput(c);
 431                                 (void) sscanf(xbuf, "%x", (unsigned int *)&n);
 432                                 *bp++ = n;
 433                                 break;
 434                         }
 435 
 436                         default:
 437                                 *bp++ = c;
 438                                 break;
 439                         }
 440                         break;
 441                 default:
 442                         *bp++ = c;
 443                         break;
 444                 }
 445         }
 446         *bp = 0;
 447         s = tostring(buf);
 448         *bp++ = ' '; *bp++ = 0;
 449         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
 450         RET(STRING);
 451 }
 452 
 453 
 454 int
 455 binsearch(char *w, Keyword *kp, int n)
 456 {
 457         int cond, low, mid, high;
 458 
 459         low = 0;
 460         high = n - 1;
 461         while (low <= high) {
 462                 mid = (low + high) / 2;
 463                 if ((cond = strcmp(w, kp[mid].word)) < 0)
 464                         high = mid - 1;
 465                 else if (cond > 0)
 466                         low = mid + 1;
 467                 else
 468                         return (mid);
 469         }
 470         return (-1);
 471 }
 472 
 473 int
 474 word(char *w)
 475 {
 476         Keyword *kp;
 477         int c, n;
 478 
 479         n = binsearch(w, keywords, sizeof (keywords) / sizeof (keywords[0]));
 480         /*
 481          * BUG: this ought to be inside the if;
 482          * in theory could fault (daniel barrett)
 483          */
 484         kp = keywords + n;
 485         if (n != -1) {  /* found in table */
 486                 yylval.i = kp->sub;
 487                 switch (kp->type) {  /* special handling */
 488                 case BLTIN:
 489                         if (kp->sub == FSYSTEM && safe)
 490                                 SYNTAX("system is unsafe");
 491                         RET(kp->type);
 492                 case FUNC:
 493                         if (infunc)
 494                                 SYNTAX("illegal nested function");
 495                         RET(kp->type);
 496                 case RETURN:
 497                         if (!infunc)
 498                                 SYNTAX("return not in function");
 499                         RET(kp->type);
 500                 case VARNF:
 501                         yylval.cp = setsymtab((uchar *)"NF", (uchar *)"", 0.0,
 502                             NUM, symtab);
 503                         RET(VARNF);
 504                 default:
 505                         RET(kp->type);
 506                 }
 507         }
 508         c = peek();     /* look for '(' */
 509         if (c != '(' && infunc && (n = isarg((uchar *)w)) >= 0) {
 510                 yylval.i = n;
 511                 RET(ARG);
 512         } else {
 513                 yylval.cp = setsymtab((uchar *)w, (uchar *)"", 0.0,
 514                     STR | NUM | DONTFREE, symtab);
 515                 if (c == '(') {
 516                         RET(CALL);
 517                 } else {
 518                         RET(VAR);
 519                 }
 520         }
 521 }
 522 
/*
 * Set the one-shot reg flag.  yylex() tests it before reading any input
 * and, when set, clears it and returns the result of regexpr().
 */
void
startreg(void)  /* next call to yylex will return a regular expression */
{
        reg = 1;
}
 528 
 529 int
 530 regexpr(void)
 531 {
 532         int c;
 533         static uchar *buf = NULL;
 534         static size_t bufsz = 500;
 535         uchar *bp;
 536 
 537         if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
 538                 FATAL("out of space for rex expr");
 539         bp = buf;
 540         for (; (c = input()) != '/' && c != 0; ) {
 541                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
 542                         FATAL("out of space for reg expr %.10s...", buf);
 543                 if (c == '\n') {
 544                         SYNTAX("newline in regular expression %.10s...", buf);
 545                         unput('\n');
 546                         break;
 547                 } else if (c == '\\') {
 548                         *bp++ = '\\';
 549                         *bp++ = input();
 550                 } else {
 551                         *bp++ = c;
 552                 }
 553         }
 554         *bp = 0;
 555         if (c == 0)
 556                 SYNTAX("non-terminated regular expression %.10s...", buf);
 557         yylval.s = tostring(buf);
 558         unput('/');
 559         RET(REGEXPR);
 560 }
 561 
/* low-level lexical stuff, sort of inherited from lex */

/*
 * ebuf/ep: circular record of the characters input() has delivered;
 * unput() steps ep back by one.  Presumably kept for error context;
 * nothing in this part of the file reads it back.
 */
char    ebuf[300];
char    *ep = ebuf;
char    yysbuf[100];    /* pushback buffer */
char    *yysptr = yysbuf;       /* next free slot; input() pops below it */
FILE    *yyin = 0;      /* not read by input(), which uses lexprog or pgetc() */
 569 
 570 int
 571 input(void)     /* get next lexical input character */
 572 {
 573         int c;
 574         extern uchar *lexprog;
 575 
 576         if (yysptr > yysbuf)
 577                 c = (uchar)*--yysptr;
 578         else if (lexprog != NULL) {     /* awk '...' */
 579                 if ((c = (uchar)*lexprog) != 0)
 580                         lexprog++;
 581         } else                          /* awk -f ... */
 582                 c = pgetc();
 583         if (c == '\n')
 584                 lineno++;
 585         else if (c == EOF)
 586                 c = 0;
 587         if (ep >= ebuf + sizeof (ebuf))
 588                 ep = ebuf;
 589         return (*ep++ = c);
 590 }
 591 
 592 void
 593 unput(int c)    /* put lexical character back on input */
 594 {
 595         if (c == '\n')
 596                 lineno--;
 597         if (yysptr >= yysbuf + sizeof (yysbuf))
 598                 FATAL("pushed back too much: %.20s...", yysbuf);
 599         *yysptr++ = c;
 600         if (--ep < ebuf)
 601                 ep = ebuf + sizeof (ebuf) - 1;
 602 }
 603 
 604 void
 605 unputstr(const char *s) /* put a string back on input */
 606 {
 607         int i;
 608 
 609         for (i = strlen(s)-1; i >= 0; i--)
 610                 unput(s[i]);
 611 }