1 /*
2 * Copyright (C) Lucent Technologies 1997
3 * All Rights Reserved
4 *
5 * Permission to use, copy, modify, and distribute this software and
6 * its documentation for any purpose and without fee is hereby
7 * granted, provided that the above copyright notice appear in all
8 * copies and that both that the copyright notice and this
9 * permission notice and warranty disclaimer appear in supporting
10 * documentation, and that the name Lucent Technologies or any of
11 * its entities not be used in advertising or publicity pertaining
12 * to distribution of the software without specific, written prior
13 * permission.
14 *
15 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 * THIS SOFTWARE.
23 */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "y.tab.h"
31
32 extern YYSTYPE yylval;
33 extern int infunc;
34
35 off_t lineno = 1;
36 int bracecnt = 0;
37 int brackcnt = 0;
38 int parencnt = 0;
39
/*
 * One row of the reserved-word table searched by word().
 */
struct Keyword {
	const char *word;	/* spelling compared against the scanned name */
	int sub;		/* value word() stores in yylval.i on a match */
	int type;		/* token code word() switches on and returns */
};
typedef struct Keyword Keyword;
45
46 Keyword keywords[] = { /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "while", WHILE, WHILE },
90 };
91
/*
 * RET(x): return token x from the enclosing function.  When dbg is set,
 * the token's name (from tokname) is printed to stdout first.  The
 * argument is expanded twice, so it must have no side effects.
 */
#define RET(x) \
	do { \
		if (dbg) \
			(void) printf("lex %s\n", tokname(x)); \
		return (x); \
	} while (0)
93
/*
 * peek: look at the next input character without consuming it.
 * The character is read with input() and immediately handed back with
 * unput(), so the following input() call returns it again.
 */
static int
peek(void)
{
	int next;

	next = input();
	unput(next);
	return (next);
}
101
102 static int
103 gettok(uchar **pbuf, int *psz) /* get next input token */
104 {
105 int c, retc;
106 uchar *buf = *pbuf;
107 size_t sz = *psz;
108 uchar *bp = buf;
109
110 c = input();
111 if (c == 0)
112 return (0);
113 buf[0] = c;
114 buf[1] = 0;
115 if (!isalnum(c) && c != '.' && c != '_')
116 return (c);
117
118 *bp++ = c;
119 if (isalpha(c) || c == '_') { /* it's a varname */
120 for (; (c = input()) != 0; ) {
121 if (bp-buf >= sz)
122 if (!adjbuf(&buf, &sz, bp - buf + 2, 100,
123 &bp, "gettok"))
124 FATAL(
125 "out of space for name %.10s...", buf);
126 if (isalnum(c) || c == '_')
127 *bp++ = c;
128 else {
129 *bp = 0;
130 unput(c);
131 break;
132 }
133 }
134 *bp = 0;
135 retc = 'a'; /* alphanumeric */
136 } else { /* maybe it's a number, but could be . */
137 char *rem;
138 /* read input until can't be a number */
139 for (; (c = input()) != 0; ) {
140 if (bp-buf >= sz)
141 if (!adjbuf(&buf, &sz, bp - buf + 2, 100,
142 &bp, "gettok"))
143 FATAL(
144 "out of space for number %.10s...", buf);
145 if (isdigit(c) || c == 'e' || c == 'E' ||
146 c == '.' || c == '+' || c == '-')
147 *bp++ = c;
148 else {
149 unput(c);
150 break;
151 }
152 }
153 *bp = 0;
154 (void) strtod((char *)buf, &rem); /* parse the number */
155 /* it wasn't a valid number at all */
156 if (rem == (char *)buf) {
157 buf[1] = 0; /* return one character as token */
158 retc = buf[0]; /* character is its own type */
159 unputstr(rem+1); /* put rest back for later */
160 } else { /* some prefix was a number */
161 unputstr(rem); /* put rest back for later */
162 rem[0] = 0; /* truncate buf after number part */
163 retc = '0'; /* type is number */
164 }
165 }
166 *pbuf = buf;
167 *psz = sz;
168 return (retc);
169 }
170
/* Scanners defined further down and called from yylex(). */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
176
177 int
178 yylex(void)
179 {
180 int c;
181 static uchar *buf = 0;
182 static int bufsize = 5; /* BUG: setting this small causes core dump! */
183
184 if (buf == 0 && (buf = (uchar *)malloc(bufsize)) == NULL)
185 FATAL("out of space in yylex");
186 if (sc) {
187 sc = 0;
188 RET('}');
189 }
190 if (reg) {
191 reg = 0;
192 return (regexpr());
193 }
194 for (;;) {
195 c = gettok(&buf, &bufsize);
196 if (c == 0)
197 return (0);
198 if (isalpha(c) || c == '_')
199 return (word((char *)buf));
200 if (isdigit(c)) {
201 yylval.cp = setsymtab(buf, tostring(buf),
202 atof((char *)buf), CON|NUM, symtab);
203 /* should this also have STR set? */
204 RET(NUMBER);
205 }
206
207 yylval.i = c;
208 switch (c) {
209 case '\n': /* {EOL} */
210 RET(NL);
211 case '\r': /* assume \n is coming */
212 case ' ': /* {WS}+ */
213 case '\t':
214 break;
215 case '#': /* #.* strip comments */
216 while ((c = input()) != '\n' && c != 0)
217 ;
218 unput(c);
219 break;
220 case ';':
221 RET(';');
222 case '\\':
223 if (peek() == '\n') {
224 (void) input();
225 } else if (peek() == '\r') {
226 (void) input();
227 (void) input(); /* \n */
228 lineno++;
229 } else {
230 RET(c);
231 }
232 break;
233 case '&':
234 if (peek() == '&') {
235 (void) input(); RET(AND);
236 } else
237 RET('&');
238 case '|':
239 if (peek() == '|') {
240 (void) input(); RET(BOR);
241 } else
242 RET('|');
243 case '!':
244 if (peek() == '=') {
245 (void) input(); yylval.i = NE; RET(NE);
246 } else if (peek() == '~') {
247 (void) input(); yylval.i = NOTMATCH;
248 RET(MATCHOP);
249 } else
250 RET(NOT);
251 case '~':
252 yylval.i = MATCH;
253 RET(MATCHOP);
254 case '<':
255 if (peek() == '=') {
256 (void) input(); yylval.i = LE; RET(LE);
257 } else {
258 yylval.i = LT; RET(LT);
259 }
260 case '=':
261 if (peek() == '=') {
262 (void) input(); yylval.i = EQ; RET(EQ);
263 } else {
264 yylval.i = ASSIGN; RET(ASGNOP);
265 }
266 case '>':
267 if (peek() == '=') {
268 (void) input(); yylval.i = GE; RET(GE);
269 } else if (peek() == '>') {
270 (void) input(); yylval.i = APPEND; RET(APPEND);
271 } else {
272 yylval.i = GT; RET(GT);
273 }
274 case '+':
275 if (peek() == '+') {
276 (void) input(); yylval.i = INCR; RET(INCR);
277 } else if (peek() == '=') {
278 (void) input(); yylval.i = ADDEQ; RET(ASGNOP);
279 } else
280 RET('+');
281 case '-':
282 if (peek() == '-') {
283 (void) input(); yylval.i = DECR; RET(DECR);
284 } else if (peek() == '=') {
285 (void) input(); yylval.i = SUBEQ; RET(ASGNOP);
286 } else
287 RET('-');
288 case '*':
289 if (peek() == '=') { /* *= */
290 (void) input(); yylval.i = MULTEQ; RET(ASGNOP);
291 } else if (peek() == '*') { /* ** or **= */
292 (void) input(); /* eat 2nd * */
293 if (peek() == '=') {
294 (void) input(); yylval.i = POWEQ;
295 RET(ASGNOP);
296 } else {
297 RET(POWER);
298 }
299 } else
300 RET('*');
301 case '/':
302 RET('/');
303 case '%':
304 if (peek() == '=') {
305 (void) input(); yylval.i = MODEQ; RET(ASGNOP);
306 } else
307 RET('%');
308 case '^':
309 if (peek() == '=') {
310 (void) input(); yylval.i = POWEQ; RET(ASGNOP);
311 } else
312 RET(POWER);
313
314 case '$':
315 /* BUG: awkward, if not wrong */
316 c = gettok(&buf, &bufsize);
317 if (isalpha(c)) {
318 /* very special */
319 if (strcmp((char *)buf, "NF") == 0) {
320 unputstr("(NF)");
321 RET(INDIRECT);
322 }
323 c = peek();
324 if (c == '(' || c == '[' ||
325 (infunc && isarg(buf) >= 0)) {
326 unputstr((char *)buf);
327 RET(INDIRECT);
328 }
329 yylval.cp = setsymtab(buf, (uchar *)"", 0.0,
330 STR | NUM, symtab);
331 RET(IVAR);
332 } else if (c == 0) { /* */
333 SYNTAX("unexpected end of input after $");
334 RET(';');
335 } else {
336 unputstr((char *)buf);
337 RET(INDIRECT);
338 }
339
340 case '}':
341 if (--bracecnt < 0)
342 SYNTAX("extra }");
343 sc = 1;
344 RET(';');
345 case ']':
346 if (--brackcnt < 0)
347 SYNTAX("extra ]");
348 RET(']');
349 case ')':
350 if (--parencnt < 0)
351 SYNTAX("extra )");
352 RET(')');
353 case '{':
354 bracecnt++;
355 RET('{');
356 case '[':
357 brackcnt++;
358 RET('[');
359 case '(':
360 parencnt++;
361 RET('(');
362
363 case '"':
364 /* BUG: should be like tran.c ? */
365 return (string());
366
367 default:
368 RET(c);
369 }
370 }
371 }
372
373 int
374 string(void)
375 {
376 int c, n;
377 uchar *s, *bp;
378 static uchar *buf = NULL;
379 static size_t bufsz = 500;
380
381 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
382 FATAL("out of space for strings");
383 for (bp = buf; (c = input()) != '"'; ) {
384 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
385 FATAL("out of space for string %.10s...", buf);
386 switch (c) {
387 case '\n':
388 case '\r':
389 case 0:
390 SYNTAX("non-terminated string %.10s...", buf);
391 lineno++;
392 if (c == 0) /* hopeless */
393 FATAL("giving up");
394 break;
395 case '\\':
396 c = input();
397 switch (c) {
398 case '"': *bp++ = '"'; break;
399 case 'n': *bp++ = '\n'; break;
400 case 't': *bp++ = '\t'; break;
401 case 'f': *bp++ = '\f'; break;
402 case 'r': *bp++ = '\r'; break;
403 case 'b': *bp++ = '\b'; break;
404 case 'v': *bp++ = '\v'; break;
405 case 'a': *bp++ = '\007'; break;
406 case '\\': *bp++ = '\\'; break;
407
408 case '0': case '1': case '2': /* octal: \d \dd \ddd */
409 case '3': case '4': case '5': case '6': case '7':
410 n = c - '0';
411 if ((c = peek()) >= '0' && c < '8') {
412 n = 8 * n + input() - '0';
413 if ((c = peek()) >= '0' && c < '8')
414 n = 8 * n + input() - '0';
415 }
416 *bp++ = n;
417 break;
418
419 case 'x': { /* hex \x0-9a-fA-F + */
420 char xbuf[100], *px;
421 for (px = xbuf; (c = input()) != 0 && px - xbuf < 100 - 2; ) {
422 if (isdigit(c) ||
423 (c >= 'a' && c <= 'f') ||
424 (c >= 'A' && c <= 'F'))
425 *px++ = c;
426 else
427 break;
428 }
429 *px = 0;
430 unput(c);
431 (void) sscanf(xbuf, "%x", (unsigned int *)&n);
432 *bp++ = n;
433 break;
434 }
435
436 default:
437 *bp++ = c;
438 break;
439 }
440 break;
441 default:
442 *bp++ = c;
443 break;
444 }
445 }
446 *bp = 0;
447 s = tostring(buf);
448 *bp++ = ' '; *bp++ = 0;
449 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
450 RET(STRING);
451 }
452
453
454 int
455 binsearch(char *w, Keyword *kp, int n)
456 {
457 int cond, low, mid, high;
458
459 low = 0;
460 high = n - 1;
461 while (low <= high) {
462 mid = (low + high) / 2;
463 if ((cond = strcmp(w, kp[mid].word)) < 0)
464 high = mid - 1;
465 else if (cond > 0)
466 low = mid + 1;
467 else
468 return (mid);
469 }
470 return (-1);
471 }
472
473 int
474 word(char *w)
475 {
476 Keyword *kp;
477 int c, n;
478
479 n = binsearch(w, keywords, sizeof (keywords) / sizeof (keywords[0]));
480 /*
481 * BUG: this ought to be inside the if;
482 * in theory could fault (daniel barrett)
483 */
484 kp = keywords + n;
485 if (n != -1) { /* found in table */
486 yylval.i = kp->sub;
487 switch (kp->type) { /* special handling */
488 case BLTIN:
489 if (kp->sub == FSYSTEM && safe)
490 SYNTAX("system is unsafe");
491 RET(kp->type);
492 case FUNC:
493 if (infunc)
494 SYNTAX("illegal nested function");
495 RET(kp->type);
496 case RETURN:
497 if (!infunc)
498 SYNTAX("return not in function");
499 RET(kp->type);
500 case VARNF:
501 yylval.cp = setsymtab((uchar *)"NF", (uchar *)"", 0.0,
502 NUM, symtab);
503 RET(VARNF);
504 default:
505 RET(kp->type);
506 }
507 }
508 c = peek(); /* look for '(' */
509 if (c != '(' && infunc && (n = isarg((uchar *)w)) >= 0) {
510 yylval.i = n;
511 RET(ARG);
512 } else {
513 yylval.cp = setsymtab((uchar *)w, (uchar *)"", 0.0,
514 STR | NUM | DONTFREE, symtab);
515 if (c == '(') {
516 RET(CALL);
517 } else {
518 RET(VAR);
519 }
520 }
521 }
522
523 void
524 startreg(void) /* next call to yylex will return a regular expression */
525 {
526 reg = 1;
527 }
528
529 int
530 regexpr(void)
531 {
532 int c;
533 static uchar *buf = NULL;
534 static size_t bufsz = 500;
535 uchar *bp;
536
537 if (buf == 0 && (buf = (uchar *)malloc(bufsz)) == NULL)
538 FATAL("out of space for rex expr");
539 bp = buf;
540 for (; (c = input()) != '/' && c != 0; ) {
541 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
542 FATAL("out of space for reg expr %.10s...", buf);
543 if (c == '\n') {
544 SYNTAX("newline in regular expression %.10s...", buf);
545 unput('\n');
546 break;
547 } else if (c == '\\') {
548 *bp++ = '\\';
549 *bp++ = input();
550 } else {
551 *bp++ = c;
552 }
553 }
554 *bp = 0;
555 if (c == 0)
556 SYNTAX("non-terminated regular expression %.10s...", buf);
557 yylval.s = tostring(buf);
558 unput('/');
559 RET(REGEXPR);
560 }
561
562 /* low-level lexical stuff, sort of inherited from lex */
563
/*
 * Circular record of the most recent characters returned by input();
 * ep is the next slot to write.  unput() steps ep back by one.
 */
char	ebuf[300];
char	*ep = ebuf;

/*
 * Pushback stack filled by unput() and drained by input();
 * yysptr points one past the top element.
 */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;

FILE	*yyin = 0;
569
570 int
571 input(void) /* get next lexical input character */
572 {
573 int c;
574 extern uchar *lexprog;
575
576 if (yysptr > yysbuf)
577 c = (uchar)*--yysptr;
578 else if (lexprog != NULL) { /* awk '...' */
579 if ((c = (uchar)*lexprog) != 0)
580 lexprog++;
581 } else /* awk -f ... */
582 c = pgetc();
583 if (c == '\n')
584 lineno++;
585 else if (c == EOF)
586 c = 0;
587 if (ep >= ebuf + sizeof (ebuf))
588 ep = ebuf;
589 return (*ep++ = c);
590 }
591
592 void
593 unput(int c) /* put lexical character back on input */
594 {
595 if (c == '\n')
596 lineno--;
597 if (yysptr >= yysbuf + sizeof (yysbuf))
598 FATAL("pushed back too much: %.20s...", yysbuf);
599 *yysptr++ = c;
600 if (--ep < ebuf)
601 ep = ebuf + sizeof (ebuf) - 1;
602 }
603
/*
 * unputstr: put the whole string s back on the input.
 * The characters are pushed last-to-first, so subsequent input() calls
 * return them in their original order.
 */
void
unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}