1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
14 */
15
16 /*
17 * This file contains the "scanner", which tokenizes charmap files
18 * for iconv for processing by the higher level grammar processor.
19 */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <string.h>
26 #include <widec.h>
27 #include <sys/types.h>
28 #include <assert.h>
29 #include "charmap.h"
30 #include "parser.tab.h"
31
/* Comment character: yylex() discards from it to the end of the line. */
int com_char = '#';
/* Escape character: starts escapes and, before a newline, joins lines. */
int esc_char = '\\';
/*
 * Multibyte length limits.  get_wide() rejects byte sequences longer than
 * mb_cur_max; mb_cur_min is not consulted by the scanner code in this file.
 */
int mb_cur_min = 1;
int mb_cur_max = 1;
/* Line number used in diagnostics; scanc() updates it on every read. */
int lineno = 1;
/* Number of warnings issued so far through warn(). */
int warnings = 0;
/* Line number of the next character to be read (bumped after each '\n'). */
static int nextline;
/* Stream currently being scanned and the name reported for it. */
static FILE *input = stdin;
static const char *filename = "<stdin>";
/* Nonzero while yylex() is between an opening and closing double quote. */
static int instring = 0;
/* Nonzero when the previous character read was an unescaped esc_char. */
static int escaped = 0;
43
/*
 * Token space ... grows on demand.
 *
 * token is a NUL-terminated heap buffer that add_tok() enlarges in
 * 64-byte steps; tokidx is the number of characters stored so far and
 * toksz is the allocated size.  hadtok records whether anything has been
 * seen on the current line, which yylex() uses to decide whether a
 * newline produces T_NL or is silently skipped.
 */
static char *token = NULL;
static int tokidx;
static int toksz = 0;
static int hadtok = 0;
51
/*
 * The last keyword seen.  It is recorded by consume_token() and
 * get_symbol() whenever they return a keyword.
 * NOTE(review): the original comment says this triggers special lexer
 * rules for "copy" and for collating symbols and elements; nothing in
 * this file reads last_kw, so those rules presumably live elsewhere (or
 * the remark is inherited from another scanner) -- confirm.
 *
 * category is the top-level section currently open (see categories[]),
 * or T_END when no section is open.  get_symbol() only recognizes the
 * symwords[] keywords while category is T_END.
 */
int last_kw = 0;
static int category = T_END;
58
/*
 * Reserved bare words and the token id returned for each.
 * consume_token() searches this table linearly; the entry with a NULL
 * name terminates it.
 */
static struct token {
	int id;
	const char *name;
} keywords[] = {
	{ T_COM_CHAR, "comment_char" },
	{ T_ESC_CHAR, "escape_char" },
	{ T_END, "END" },

	/*
	 * These are keywords used in the charmap file.  Note that
	 * Solaris originally used angle brackets to wrap some of them,
	 * but we removed that to simplify our parser.  The first of these
	 * items are "global items."
	 */
	{ T_CHARMAP, "CHARMAP" },
	{ T_WIDTH, "WIDTH" },
	{ T_WIDTH_DEFAULT, "WIDTH_DEFAULT" },

	{ -1, NULL },
};
79
/*
 * These special words are only used in a charmap file, enclosed in <>.
 * get_symbol() checks a completed <name> against this table, but only
 * while no top-level category is open (category == T_END); any other
 * name is returned as T_SYMBOL.  The entry with a NULL name ends the list.
 */
static struct token symwords[] = {
	{ T_COM_CHAR, "comment_char" },
	{ T_ESC_CHAR, "escape_char" },
	{ T_CODE_SET, "code_set_name" },
	{ T_MB_CUR_MAX, "mb_cur_max" },
	{ T_MB_CUR_MIN, "mb_cur_min" },
	{ -1, NULL },
};
91
/*
 * Keyword ids that open a top-level category.  consume_token() sets
 * category to the keyword when it matches one of these; the list is
 * terminated by 0.
 */
static int categories[] = {
	T_CHARMAP,
	0
};
96
97 void
98 reset_scanner(const char *fname)
99 {
100 if (fname == NULL) {
101 filename = "<stdin>";
102 input = stdin;
103 } else {
104 if (input != stdin)
105 (void) fclose(input);
106 if ((input = fopen(fname, "r")) == NULL) {
107 perror(fname);
108 exit(1);
109 }
110 filename = fname;
111 }
112 com_char = '#';
113 esc_char = '\\';
114 instring = 0;
115 escaped = 0;
116 lineno = 1;
117 nextline = 1;
118 tokidx = 0;
119 last_kw = 0;
120 category = T_END;
121 }
122
/*
 * hex(x): numeric value (0-15) of the hexadecimal digit character x.
 * The caller must already have validated x with isxdigit().
 * isodigit(x): true when x is one of the characters '0' through '7'.
 *
 * Every use of the argument is parenthesized so that expressions such as
 * (c | 0x20) or a conditional expand correctly.  The argument is still
 * evaluated more than once, so it must not have side effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
126
127 static int
128 scanc(void)
129 {
130 int c;
131
132 c = getc(input);
133 lineno = nextline;
134 if (c == '\n') {
135 nextline++;
136 }
137 return (c);
138 }
139
140 static void
141 unscanc(int c)
142 {
143 if (c == '\n') {
144 nextline--;
145 }
146 if (ungetc(c, input) < 0) {
147 yyerror(_("ungetc failed"));
148 }
149 }
150
/*
 * Read exactly two hexadecimal digits and return the byte they encode
 * (high nibble first).  A non-hex character in either position is
 * reported through yyerror(); the second character is not read when the
 * first one is already bad.
 */
static int
scan_hex_byte(void)
{
	int hi, lo;

	hi = scanc();
	if (isxdigit(hi)) {
		lo = scanc();
		if (isxdigit(lo))
			return ((hex(hi) << 4) | hex(lo));
	}

	yyerror(_("malformed hex digit"));
	return (0);
}
170
/*
 * Read a decimal byte constant of two or three digits and return its
 * value.  The first two digits are mandatory; a third is consumed only
 * if it is a digit, otherwise that character is pushed back.
 *
 * The result is stored into a single byte by the caller, so values above
 * UCHAR_MAX (e.g. "999") are rejected instead of being silently
 * truncated.  All errors are reported through yyerror().
 */
static int
scan_dec_byte(void)
{
	int c1, c2, c3;
	int b;

	c1 = scanc();
	if (!isdigit(c1)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = c1 - '0';

	c2 = scanc();
	if (!isdigit(c2)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = (b * 10) + (c2 - '0');

	c3 = scanc();
	if (!isdigit(c3)) {
		unscanc(c3);
	} else {
		b = (b * 10) + (c3 - '0');
	}

	if (b > UCHAR_MAX) {
		yyerror(_("decimal value out of range"));
		return (0);
	}
	return (b);
}
199
/*
 * Read an octal byte constant of two or three digits and return its
 * value.  The first two digits are mandatory; a third is consumed only
 * if it is an octal digit, otherwise that character is pushed back.
 *
 * The result is stored into a single byte by the caller, so values above
 * UCHAR_MAX (e.g. "777") are rejected instead of being silently
 * truncated.  All errors are reported through yyerror().
 */
static int
scan_oct_byte(void)
{
	int c1, c2, c3;
	int b;

	c1 = scanc();
	if (!isodigit(c1)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = c1 - '0';

	c2 = scanc();
	if (!isodigit(c2)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = (b * 8) + (c2 - '0');

	c3 = scanc();
	if (!isodigit(c3)) {
		unscanc(c3);
	} else {
		b = (b * 8) + (c3 - '0');
	}

	if (b > UCHAR_MAX) {
		yyerror(_("octal value out of range"));
		return (0);
	}
	return (b);
}
230
231 void
232 add_tok(int c)
233 {
234 if ((tokidx + 1) >= toksz) {
235 toksz += 64;
236 if ((token = realloc(token, toksz)) == NULL) {
237 yyerror(_("out of memory"));
238 tokidx = 0;
239 toksz = 0;
240 return;
241 }
242 }
243
244 token[tokidx++] = (char)c;
245 token[tokidx] = 0;
246 }
247
248 static int
249 get_byte(void)
250 {
251 int c;
252
253 if ((c = scanc()) != esc_char) {
254 unscanc(c);
255 return (EOF);
256 }
257 c = scanc();
258
259 switch (c) {
260 case 'd':
261 case 'D':
262 return (scan_dec_byte());
263 case 'x':
264 case 'X':
265 return (scan_hex_byte());
266 case '0':
267 case '1':
268 case '2':
269 case '3':
270 case '4':
271 case '5':
272 case '6':
273 case '7':
274 /* put the character back so we can get it */
275 unscanc(c);
276 return (scan_oct_byte());
277 default:
278 unscanc(c);
279 unscanc(esc_char);
280 return (EOF);
281 }
282 }
283
/*
 * Translate the character following an escape into the character it
 * stands for: n, r, t, f, v, b and a map to the corresponding control
 * characters; any other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		int	letter;
		int	value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' }
	};
	unsigned int i;

	for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
		if (escapes[i].letter == c)
			return (escapes[i].value);
	}
	return (c);
}
306
307 int
308 get_wide(void)
309 {
310 /* NB: yylval.mbs[0] is the length */
311 char *mbs = &yylval.mbs[1];
312 int mbi = 0;
313 int c;
314
315 mbs[mbi] = 0;
316 if (mb_cur_max > MB_LEN_MAX) {
317 yyerror(_("max multibyte character size too big"));
318 return (T_NULL);
319 }
320 for (;;) {
321 if ((c = get_byte()) == EOF)
322 break;
323 if (mbi == mb_cur_max) {
324 unscanc(c);
325 yyerror(_("length > mb_cur_max"));
326 return (T_NULL);
327 }
328 mbs[mbi++] = c;
329 mbs[mbi] = 0;
330 }
331
332 /* result in yylval.mbs */
333 mbs[-1] = mbi;
334 return (T_CHAR);
335 }
336
337 int
338 get_symbol(void)
339 {
340 int c;
341
342 while ((c = scanc()) != EOF) {
343 if (escaped) {
344 escaped = 0;
345 if (c == '\n')
346 continue;
347 add_tok(get_escaped(c));
348 continue;
349 }
350 if (c == esc_char) {
351 escaped = 1;
352 continue;
353 }
354 if (c == '\n') { /* well that's strange! */
355 yyerror(_("unterminated symbolic name"));
356 continue;
357 }
358 if (c == '>') { /* end of symbol */
359
360 /*
361 * This restarts the token from the beginning
362 * the next time we scan a character. (This
363 * token is complete.)
364 */
365
366 if (token == NULL) {
367 yyerror(_("missing symbolic name"));
368 return (T_NULL);
369 }
370 tokidx = 0;
371
372 /*
373 * A few symbols are handled as keywords outside
374 * of the normal categories.
375 */
376 if (category == T_END) {
377 int i;
378 for (i = 0; symwords[i].name != 0; i++) {
379 if (strcmp(token, symwords[i].name) ==
380 0) {
381 last_kw = symwords[i].id;
382 return (last_kw);
383 }
384 }
385 }
386 /* its an undefined symbol */
387 yylval.token = strdup(token);
388 token = NULL;
389 toksz = 0;
390 tokidx = 0;
391 return (T_SYMBOL);
392 }
393 add_tok(c);
394 }
395
396 yyerror(_("unterminated symbolic name"));
397 return (EOF);
398 }
399
400
401 static int
402 consume_token(void)
403 {
404 int len = tokidx;
405 int i;
406
407 tokidx = 0;
408 if (token == NULL)
409 return (T_NULL);
410
411 /*
412 * this one is special, because we don't want it to alter the
413 * last_kw field.
414 */
415 if (strcmp(token, "...") == 0) {
416 return (T_ELLIPSIS);
417 }
418
419 /* search for reserved words first */
420 for (i = 0; keywords[i].name; i++) {
421 int j;
422 if (strcmp(keywords[i].name, token) != 0) {
423 continue;
424 }
425
426 last_kw = keywords[i].id;
427
428 /* clear the top level category if we're done with it */
429 if (last_kw == T_END) {
430 category = T_END;
431 }
432
433 /* set the top level category if we're changing */
434 for (j = 0; categories[j]; j++) {
435 if (categories[j] != last_kw)
436 continue;
437 category = last_kw;
438 }
439
440 return (keywords[i].id);
441 }
442
443 /* maybe its a numeric constant? */
444 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
445 char *eptr;
446 yylval.num = strtol(token, &eptr, 10);
447 if (*eptr != 0)
448 yyerror(_("malformed number"));
449 return (T_NUMBER);
450 }
451
452 /*
453 * A single lone character is treated as a character literal.
454 * To avoid duplication of effort, we stick in the charmap.
455 */
456 if (len == 1) {
457 yylval.mbs[0] = 1; /* length */
458 yylval.mbs[1] = token[0];
459 yylval.mbs[2] = '\0';
460 return (T_CHAR);
461 }
462
463 /* anything else is treated as a symbolic name */
464 yylval.token = strdup(token);
465 token = NULL;
466 toksz = 0;
467 tokidx = 0;
468 return (T_NAME);
469 }
470
/*
 * Discard input up to and including the next newline.  Reaching end of
 * input first is reported through errf() as a missing newline.
 */
void
scan_to_eol(void)
{
	int ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n')
			break;
		if (ch == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}
	assert(ch == '\n');
}
484
/*
 * Return the next token from the current input, or EOF at end of input.
 *
 * The scanner runs in one of two modes.  Inside a quoted string
 * (instring != 0) each character is returned individually as T_CHAR in
 * yylval.mbs (length in mbs[0], data from mbs[1]), apart from <symbols>,
 * '>' and the closing quote.  Outside a string, characters are
 * accumulated with add_tok() until a delimiter is seen, at which point
 * the delimiter is pushed back and consume_token() classifies the word;
 * the delimiter itself is then returned on the following call.
 *
 * T_NL is only returned for lines on which something was seen (hadtok);
 * blank and comment-only lines are skipped.  A word still being
 * accumulated when the input ends is dropped, since the loop exits
 * without calling consume_token().
 *
 * NOTE(review): the numeric-escape tests below list "xXd" but not 'D',
 * while get_byte() accepts both 'd' and 'D'; an escaped 'D' is therefore
 * handled as a plain escaped character here -- confirm that is intended.
 * NOTE(review): the paired unscanc() calls push back two characters;
 * ISO C only guarantees one character of pushback -- confirm the target
 * libc supports more.
 */
int
yylex(void)
{
	int c;

	while ((c = scanc()) != EOF) {

		/* special handling for quoted string */
		if (instring) {
			if (escaped) {
				escaped = 0;

				/* if newline, just eat and forget it */
				if (c == '\n')
					continue;

				/*
				 * numeric escape: restore it in full so
				 * that get_wide() can read the whole run
				 */
				if (strchr("xXd01234567", c)) {
					unscanc(c);
					unscanc(esc_char);
					return (get_wide());
				}
				yylval.mbs[0] = 1; /* length */
				yylval.mbs[1] = get_escaped(c);
				yylval.mbs[2] = '\0';
				return (T_CHAR);
			}
			if (c == esc_char) {
				escaped = 1;
				continue;
			}
			switch (c) {
			case '<':
				return (get_symbol());
			case '>':
				/* oops! should generate syntax error */
				return (T_GT);
			case '"':
				instring = 0;
				return (T_QUOTE);
			default:
				yylval.mbs[0] = 1; /* length */
				yylval.mbs[1] = c;
				yylval.mbs[2] = '\0';
				return (T_CHAR);
			}
		}

		/* escaped characters first */
		if (escaped) {
			escaped = 0;
			if (c == '\n') {
				/* eat the newline */
				continue;
			}
			hadtok = 1;
			if (tokidx) {
				/* an escape mid-token is nonsense */
				return (T_NULL);
			}

			/* numeric escapes are treated as wide characters */
			if (strchr("xXd01234567", c)) {
				unscanc(c);
				unscanc(esc_char);
				return (get_wide());
			}

			add_tok(get_escaped(c));
			continue;
		}

		/* if it is the escape character itself, note it */
		if (c == esc_char) {
			escaped = 1;
			continue;
		}

		/* remove from the comment char to end of line */
		if (c == com_char) {
			while (c != '\n') {
				if ((c = scanc()) == EOF) {
					/* end of file without newline! */
					return (EOF);
				}
			}
			assert(c == '\n');
			if (!hadtok) {
				/*
				 * If there were no tokens on this line,
				 * then just pretend it didn't exist at all.
				 */
				continue;
			}
			hadtok = 0;
			return (T_NL);
		}

		if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
			/*
			 * These are all token delimiters.  If there
			 * is a token already in progress, we need to
			 * process it.  The delimiter is pushed back so
			 * that it is handled on the next call.
			 */
			unscanc(c);
			return (consume_token());
		}

		switch (c) {
		case '\n':
			if (!hadtok) {
				/*
				 * If the line was completely devoid of tokens,
				 * then just ignore it.
				 */
				continue;
			}
			/* we're starting a new line, reset the token state */
			hadtok = 0;
			return (T_NL);
		case ',':
			hadtok = 1;
			return (T_COMMA);
		case ';':
			hadtok = 1;
			return (T_SEMI);
		case '(':
			hadtok = 1;
			return (T_LPAREN);
		case ')':
			hadtok = 1;
			return (T_RPAREN);
		case '>':
			hadtok = 1;
			return (T_GT);
		case '<':
			/* symbol start! */
			hadtok = 1;
			return (get_symbol());
		case ' ':
		case '\t':
			/* whitespace, just ignore it */
			continue;
		case '"':
			hadtok = 1;
			instring = 1;
			return (T_QUOTE);
		default:
			/* ordinary character: part of a word being built */
			hadtok = 1;
			add_tok(c);
			continue;
		}
	}
	return (EOF);
}
639
640 void
641 yyerror(const char *msg)
642 {
643 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
644 filename, lineno, msg);
645 exit(1);
646 }
647
648 void
649 errf(const char *fmt, ...)
650 {
651 char *msg;
652
653 va_list va;
654 va_start(va, fmt);
655 (void) vasprintf(&msg, fmt, va);
656 va_end(va);
657
658 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
659 filename, lineno, msg);
660 free(msg);
661 exit(1);
662 }
663
664 void
665 warn(const char *fmt, ...)
666 {
667 char *msg;
668
669 va_list va;
670 va_start(va, fmt);
671 (void) vasprintf(&msg, fmt, va);
672 va_end(va);
673
674 (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
675 filename, lineno, msg);
676 free(msg);
677 warnings++;
678 }