1 /* 2 * Copyright (C) Lucent Technologies 1997 3 * All Rights Reserved 4 * 5 * Permission to use, copy, modify, and distribute this software and 6 * its documentation for any purpose and without fee is hereby 7 * granted, provided that the above copyright notice appear in all 8 * copies and that both that the copyright notice and this 9 * permission notice and warranty disclaimer appear in supporting 10 * documentation, and that the name Lucent Technologies or any of 11 * its entities not be used in advertising or publicity pertaining 12 * to distribution of the software without specific, written prior 13 * permission. 14 * 15 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 * THIS SOFTWARE. 23 */ 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <ctype.h> 29 #include "awk.h" 30 #include "y.tab.h" 31 32 extern YYSTYPE yylval; 33 extern int infunc; 34 35 off_t lineno = 1; 36 int bracecnt = 0; 37 int brackcnt = 0; 38 int parencnt = 0; 39 40 typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44 } Keyword; 45 46 Keyword keywords[] = { /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "atan2", FATAN, BLTIN }, 51 { "break", BREAK, BREAK }, 52 { "close", CLOSE, CLOSE }, 53 { "continue", CONTINUE, CONTINUE }, 54 { "cos", FCOS, BLTIN }, 55 { "delete", DELETE, DELETE }, 56 { "do", DO, DO }, 57 { "else", ELSE, ELSE }, 58 { "exit", EXIT, EXIT }, 59 { "exp", FEXP, BLTIN }, 60 { "fflush", FFLUSH, BLTIN }, 61 { "for", FOR, FOR }, 62 { "func", FUNC, FUNC }, 63 { "function", FUNC, FUNC }, 64 { "getline", GETLINE, GETLINE }, 65 { "gsub", GSUB, GSUB }, 66 { "if", IF, IF }, 67 { "in", IN, IN }, 68 { "index", INDEX, INDEX }, 69 { "int", FINT, BLTIN }, 70 { "length", FLENGTH, BLTIN }, 71 { "log", FLOG, BLTIN }, 72 { "match", MATCHFCN, MATCHFCN }, 73 { "next", NEXT, NEXT }, 74 { "nextfile", NEXTFILE, NEXTFILE }, 75 { "print", PRINT, PRINT }, 76 { "printf", PRINTF, PRINTF }, 77 { "rand", FRAND, BLTIN }, 78 { "return", RETURN, RETURN }, 79 { "sin", FSIN, BLTIN }, 80 { "split", SPLIT, SPLIT }, 81 { "sprintf", SPRINTF, SPRINTF }, 82 { "sqrt", FSQRT, BLTIN }, 83 { "srand", FSRAND, BLTIN }, 84 { "sub", SUB, SUB }, 85 { "substr", SUBSTR, SUBSTR }, 86 { "system", FSYSTEM, BLTIN }, 87 { "tolower", FTOLOWER, BLTIN }, 88 { "toupper", FTOUPPER, BLTIN }, 89 { "while", WHILE, WHILE }, 90 }; 91 92 #define RET(x) { if (dbg) (void) printf("lex %s\n", tokname(x)); return (x); } 93 94 static int 95 peek(void) 96 { 97 int c = input(); 98 unput(c); 99 return (c); 100 } 101 102 static int 103 gettok(uchar **pbuf, int *psz) /* get next input token */ 104 { 105 int c, retc; 106 uchar *buf = *pbuf; 107 size_t sz = *psz; 108 uchar *bp = buf; 109 110 c = input(); 111 if (c == 0) 112 return (0); 113 buf[0] = c; 114 buf[1] = 0; 115 if (!isalnum(c) && c != '.' && c != '_') 116 return (c); 117 118 *bp++ = c; 119 if (isalpha(c) || c == '_') { /* it's a varname */ 120 for (; (c = input()) != 0; ) { 121 if (bp-buf >= sz) 122 if (!adjbuf(&buf, &sz, bp - buf + 2, 100, 123 &bp, "gettok")) 124 FATAL( 125 "out of space for name %.10s...", buf); 126 if (isalnum(c) || c == '_') 127 *bp++ = c; 128 else { 129 *bp = 0; 130 unput(c); 131 break; 132 } 133 } 134 *bp = 0; 135 retc = 'a'; /* alphanumeric */ 136 } else { /* maybe it's a number, but could be . */ 137 char *rem; 138 /* read input until can't be a number */ 139 for (; (c = input()) != 0; ) { 140 if (bp-buf >= sz) 141 if (!adjbuf(&buf, &sz, bp - buf + 2, 100, 142 &bp, "gettok")) 143 FATAL( 144 "out of space for number %.10s...", buf); 145 if (isdigit(c) || c == 'e' || c == 'E' || 146 c == '.' || c == '+' || c == '-') 147 *bp++ = c; 148 else { 149 unput(c); 150 break; 151 } 152 } 153 *bp = 0; 154 (void) strtod((char *)buf, &rem); /* parse the number */ 155 /* it wasn't a valid number at all */ 156 if (rem == (char *)buf) { 157 buf[1] = 0; /* return one character as token */ 158 retc = buf[0]; /* character is its own type */ 159 unputstr(rem+1); /* put rest back for later */ 160 } else { /* some prefix was a number */ 161 unputstr(rem); /* put rest back for later */ 162 rem[0] = 0; /* truncate buf after number part */ 163 retc = '0'; /* type is number */ 164 } 165 } 166 *pbuf = buf; 167 *psz = sz; 168 return (retc); 169 } 170 171 int word(char *); 172 int string(void); 173 int regexpr(void); 174 int sc = 0; /* 1 => return a } right now */ 175 int reg = 0; /* 1 => return a REGEXPR now */ 176 177 int 178 yylex(void) 179 { 180 int c; 181 static uchar *buf = 0; 182 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 183 184 if (buf == 0 && (buf = (uchar *)malloc(bufsize)) == NULL) 185 FATAL("out of space in yylex"); 186 if (sc) { 187 sc = 0; 188 RET('}'); 189 } 190 if (reg) { 191 reg = 0; 192 return (regexpr()); 193 } 194 for (;;) { 195 c = gettok(&buf, &bufsize); 196 if (c == 0) 197 return (0); 198 if (isalpha(c) || c == '_') 199 return (word((char *)buf)); 200 if (isdigit(c)) { 201 yylval.cp = setsymtab(buf, tostring(buf), 202 atof((char *)buf), CON|NUM, symtab); 203 /* should this also have STR set? */ 204 RET(NUMBER); 205 } 206 207 yylval.i = c; 208 switch (c) { 209 case '\n': /* {EOL} */ 210 RET(NL); 211 case '\r': /* assume \n is coming */ 212 case ' ': /* {WS}+ */ 213 case '\t': 214 break; 215 case '#': /* #.* strip comments */ 216 while ((c = input()) != '\n' && c != 0) 217 ; 218 unput(c); 219 break; 220 case ';': 221 RET(';'); 222 case '\\': 223 if (peek() == '\n') { 224 (void) input(); 225 } else if (peek() == '\r') { 226 (void) input(); 227 (void) input(); /* \n */ 228 lineno++; 229 } else { 230 RET(c); 231 } 232 break; 233 case '&': 234 if (peek() == '&') { 235 (void) input(); RET(AND); 236 } else 237 RET('&'); 238 case '|': 239 if (peek() == '|') { 240 (void) input(); RET(BOR); 241 } else 242 RET('|'); 243 case '!': 244 if (peek() == '=') { 245 (void) input(); yylval.i = NE; RET(NE); 246 } else if (peek() == '~') { 247 (void) input(); yylval.i = NOTMATCH; 248 RET(MATCHOP); 249 } else 250 RET(NOT); 251 case '~': 252 yylval.i = MATCH; 253 RET(MATCHOP); 254 case '<': 255 if (peek() == '=') { 256 (void) input(); yylval.i = LE; RET(LE); 257 } else { 258 yylval.i = LT; RET(LT); 259 } 260 case '=': 261 if (peek() == '=') { 262 (void) input(); yylval.i = EQ; RET(EQ); 263 } else { 264 yylval.i = ASSIGN; RET(ASGNOP); 265 } 266 case '>': 267 if (peek() == '=') { 268 (void) input(); yylval.i = GE; RET(GE); 269 } else if (peek() == '>') { 270 (void) input(); yylval.i = APPEND; RET(APPEND); 271 } else { 272 yylval.i = GT; RET(GT); 273 } 274 case '+': 275 if (peek() == '+') { 276 (void) input(); yylval.i = INCR; RET(INCR); 277 } else if (peek() == '=') { 278 (void) input(); yylval.i = ADDEQ; RET(ASGNOP); 279 } else 280 RET('+'); 281 case '-': 282 if (peek() == '-') { 283 (void) input(); yylval.i = DECR; RET(DECR); 284 } else if (peek() == '=') { 285 (void) input(); yylval.i = SUBEQ; RET(ASGNOP); 286 } else 287 RET('-'); 288 case '*': 289 if (peek() == '=') { /* *= */ 290 (void) input(); yylval.i = MULTEQ; RET(ASGNOP); 291 } else if (peek() == '*') { /* ** or **= */ 292 (void) input(); /* eat 2nd * */ 293 if (peek() == '=') { 294 (void) input(); yylval.i = POWEQ; 295 RET(ASGNOP); 296 } else { 297 RET(POWER); 298 } 299 } else 300 RET('*'); 301 case '/': 302 RET('/'); 303 case '%': 304 if (peek() == '=') { 305 (void) input(); yylval.i = MODEQ; RET(ASGNOP); 306 } else 307 RET('%'); 308 case '^': 309 if (peek() == '=') { 310 (void) input(); yylval.i = POWEQ; RET(ASGNOP); 311 } else 312 RET(POWER); 313 314 case '$': 315 /* BUG: awkward, if not wrong */ 316 c = gettok(&buf, &bufsize); 317 if (isalpha(c)) { 318 /* very special */ 319 if (strcmp((char *)buf, "NF") == 0) { 320 unputstr("(NF)"); 321 RET(INDIRECT); 322 } 323 c = peek(); 324 if (c == '(' || c == '[' || 325 (infunc && isarg(buf) >= 0)) { 326 unputstr((char *)buf); 327 RET(INDIRECT); 328 } 329 yylval.cp = setsymtab(buf, (uchar *)"", 0.0, 330 STR | NUM, symtab); 331 RET(IVAR); 332 } else if (c == 0) { /* */ 333 SYNTAX("unexpected end of input after $"); 334 RET(';'); 335 } else { 336 unputstr((char *)buf); 337 RET(INDIRECT); 338 } 339 340 case '}': 341 if (--bracecnt < 0) 342 SYNTAX("extra }"); 343 sc = 1; 344 RET(';'); 345 case ']': 346 if (--brackcnt < 0) 347 SYNTAX("extra ]"); 348 RET(']'); 349 case ')': 350 if (--parencnt < 0) 351 SYNTAX("extra )"); 352 RET(')'); 353 case '{': 354 bracecnt++; 355 RET('{'); 356 case '[': 357 brackcnt++; 358 RET('['); 359 case '(': 360 parencnt++; 361 RET('('); 362 363 case '"': 364 /* BUG: should be like tran.c ? */ 365 return (string()); 366 367 default: 368 RET(c); 369 } 370 } 371 } 372 373 int 374 string(void) 375 { 376 int c, n; 377 uchar *s, *bp; 378 static uchar *buf = NULL; 379 static size_t bufsz = 500; 380 381 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL) 382 FATAL("out of space for strings"); 383 for (bp = buf; (c = input()) != '"'; ) { 384 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 385 FATAL("out of space for string %.10s...", buf); 386 switch (c) { 387 case '\n': 388 case '\r': 389 case 0: 390 SYNTAX("non-terminated string %.10s...", buf); 391 lineno++; 392 if (c == 0) /* hopeless */ 393 FATAL("giving up"); 394 break; 395 case '\\': 396 c = input(); 397 switch (c) { 398 case '"': *bp++ = '"'; break; 399 case 'n': *bp++ = '\n'; break; 400 case 't': *bp++ = '\t'; break; 401 case 'f': *bp++ = '\f'; break; 402 case 'r': *bp++ = '\r'; break; 403 case 'b': *bp++ = '\b'; break; 404 case 'v': *bp++ = '\v'; break; 405 case 'a': *bp++ = '\007'; break; 406 case '\\': *bp++ = '\\'; break; 407 408 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 409 case '3': case '4': case '5': case '6': case '7': 410 n = c - '0'; 411 if ((c = peek()) >= '0' && c < '8') { 412 n = 8 * n + input() - '0'; 413 if ((c = peek()) >= '0' && c < '8') 414 n = 8 * n + input() - '0'; 415 } 416 *bp++ = n; 417 break; 418 419 case 'x': { /* hex \x0-9a-fA-F + */ 420 char xbuf[100], *px; 421 for (px = xbuf; (c = input()) != 0 && px - xbuf < 100 - 2; ) { 422 if (isdigit(c) || 423 (c >= 'a' && c <= 'f') || 424 (c >= 'A' && c <= 'F')) 425 *px++ = c; 426 else 427 break; 428 } 429 *px = 0; 430 unput(c); 431 (void) sscanf(xbuf, "%x", (unsigned int *)&n); 432 *bp++ = n; 433 break; 434 } 435 436 default: 437 *bp++ = c; 438 break; 439 } 440 break; 441 default: 442 *bp++ = c; 443 break; 444 } 445 } 446 *bp = 0; 447 s = tostring(buf); 448 *bp++ = ' '; *bp++ = 0; 449 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 450 RET(STRING); 451 } 452 453 454 int 455 binsearch(char *w, Keyword *kp, int n) 456 { 457 int cond, low, mid, high; 458 459 low = 0; 460 high = n - 1; 461 while (low <= high) { 462 mid = (low + high) / 2; 463 if ((cond = strcmp(w, kp[mid].word)) < 0) 464 high = mid - 1; 465 else if (cond > 0) 466 low = mid + 1; 467 else 468 return (mid); 469 } 470 return (-1); 471 } 472 473 int 474 word(char *w) 475 { 476 Keyword *kp; 477 int c, n; 478 479 n = binsearch(w, keywords, sizeof (keywords) / sizeof (keywords[0])); 480 /* 481 * BUG: this ought to be inside the if; 482 * in theory could fault (daniel barrett) 483 */ 484 kp = keywords + n; 485 if (n != -1) { /* found in table */ 486 yylval.i = kp->sub; 487 switch (kp->type) { /* special handling */ 488 case BLTIN: 489 if (kp->sub == FSYSTEM && safe) 490 SYNTAX("system is unsafe"); 491 RET(kp->type); 492 case FUNC: 493 if (infunc) 494 SYNTAX("illegal nested function"); 495 RET(kp->type); 496 case RETURN: 497 if (!infunc) 498 SYNTAX("return not in function"); 499 RET(kp->type); 500 case VARNF: 501 yylval.cp = setsymtab((uchar *)"NF", (uchar *)"", 0.0, 502 NUM, symtab); 503 RET(VARNF); 504 default: 505 RET(kp->type); 506 } 507 } 508 c = peek(); /* look for '(' */ 509 if (c != '(' && infunc && (n = isarg((uchar *)w)) >= 0) { 510 yylval.i = n; 511 RET(ARG); 512 } else { 513 yylval.cp = setsymtab((uchar *)w, (uchar *)"", 0.0, 514 STR | NUM | DONTFREE, symtab); 515 if (c == '(') { 516 RET(CALL); 517 } else { 518 RET(VAR); 519 } 520 } 521 } 522 523 void 524 startreg(void) /* next call to yylex will return a regular expression */ 525 { 526 reg = 1; 527 } 528 529 int 530 regexpr(void) 531 { 532 int c; 533 static uchar *buf = NULL; 534 static size_t bufsz = 500; 535 uchar *bp; 536 537 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL) 538 FATAL("out of space for rex expr"); 539 bp = buf; 540 for (; (c = input()) != '/' && c != 0; ) { 541 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 542 FATAL("out of space for reg expr %.10s...", buf); 543 if (c == '\n') { 544 SYNTAX("newline in regular expression %.10s...", buf); 545 unput('\n'); 546 break; 547 } else if (c == '\\') { 548 *bp++ = '\\'; 549 *bp++ = input(); 550 } else { 551 *bp++ = c; 552 } 553 } 554 *bp = 0; 555 if (c == 0) 556 SYNTAX("non-terminated regular expression %.10s...", buf); 557 yylval.s = tostring(buf); 558 unput('/'); 559 RET(REGEXPR); 560 } 561 562 /* low-level lexical stuff, sort of inherited from lex */ 563 564 char ebuf[300]; 565 char *ep = ebuf; 566 char yysbuf[100]; /* pushback buffer */ 567 char *yysptr = yysbuf; 568 FILE *yyin = 0; 569 570 int 571 input(void) /* get next lexical input character */ 572 { 573 int c; 574 extern uchar *lexprog; 575 576 if (yysptr > yysbuf) 577 c = (uchar)*--yysptr; 578 else if (lexprog != NULL) { /* awk '...' */ 579 if ((c = (uchar)*lexprog) != 0) 580 lexprog++; 581 } else /* awk -f ... */ 582 c = pgetc(); 583 if (c == '\n') 584 lineno++; 585 else if (c == EOF) 586 c = 0; 587 if (ep >= ebuf + sizeof (ebuf)) 588 ep = ebuf; 589 return (*ep++ = c); 590 } 591 592 void 593 unput(int c) /* put lexical character back on input */ 594 { 595 if (c == '\n') 596 lineno--; 597 if (yysptr >= yysbuf + sizeof (yysbuf)) 598 FATAL("pushed back too much: %.20s...", yysbuf); 599 *yysptr++ = c; 600 if (--ep < ebuf) 601 ep = ebuf + sizeof (ebuf) - 1; 602 } 603 604 void 605 unputstr(const char *s) /* put a string back on input */ 606 { 607 int i; 608 609 for (i = strlen(s)-1; i >= 0; i--) 610 unput(s[i]); 611 }