1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2013 David Hoeppner. All rights reserved.
15 */
16
/*
 * Lexical scanner (tokenizer) for charmap files.
 */
20
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <widec.h>
25
26 #include "iconv.h"
27 #include "parser.tab.h"
28
29 /*
30 * Helper macros.
31 */
/*
 * hex(x): numeric value (0-15) of the hexadecimal digit character x.
 * The caller is expected to have validated x with isxdigit() first;
 * any other input yields a meaningless value.
 *
 * isodigit(x): non-zero when x is an octal digit ('0' through '7').
 *
 * Both macros evaluate their argument more than once, so never pass an
 * expression with side effects.  Every use of the argument is
 * parenthesized so that low-precedence operands such as (c | 0x20)
 * expand correctly.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))

#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
36
/*
 * Charmap parameters, initialised to their defaults.
 */
int com_char = '#';	/* character that starts a comment */
int esc_char = '\\';	/* character that starts an escape sequence */
int mb_cur_min = 1;
int mb_cur_max = 1;	/* bytes get_wide() collects per character */

/* Line number printed in diagnostics. */
int lineno = 1;
46 static FILE *input = stdin;
47 static const char *filename = "<stdin>";
48 static int escaped = 0;
49 static int instring = 0;
50 static int nextline;
51
/*
 * Token accumulator: a growable, NUL-terminated byte buffer filled by
 * add_tok().  tokidx is the number of bytes stored so far and toksz the
 * allocated size.  hadtok records whether the current input line has
 * produced a token yet.
 */
static char *token;
static int tokidx = 0;
static int toksz;
static int hadtok;

/*
 * Wide string accumulator filled by add_wcs() and handed out by
 * get_wcs().  wideidx counts stored characters, widesz allocated ones.
 */
static wchar_t *widestr;
static int wideidx;
static int widesz;
66
67 /*
68 * Keywords related.
69 */
70 int last_kw = 0;
71 static int category = T_END;
72
73 static struct token {
74 int id;
75 const char *name;
76 } keywords[] = {
77 { T_COM_CHAR, "comment_char" },
78 { T_ESC_CHAR, "escape_char" },
79 { T_END, "END" },
80 { T_CHARMAP, "CHARMAP" },
81 { T_WIDTH, "WIDTH" },
82 { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" },
83 { -1, NULL },
84 };
85
86 /*
87 * Charmap reserved keywords.
88 */
89 static struct token symwords[] = {
90 { T_COM_CHAR, "comment_char" },
91 { T_ESC_CHAR, "escape_char" },
92 { T_CODE_SET, "code_set_name" },
93 { T_MB_CUR_MAX, "mb_cur_max" },
94 { T_MB_CUR_MIN, "mb_cur_min" },
95 { -1, NULL },
96 };
97
98 static int categories[] = {
99 T_CHARMAP,
100 T_WIDTH,
101 0,
102 };
103
/*
 * Convert a wide string to a multibyte string.
 *
 * Not implemented yet: the argument is ignored and NULL is always
 * returned.
 */
char *
to_mb_string(const wchar_t *wcs)
{
	(void) wcs;	/* placeholder, nothing is converted */

	return (NULL);
}
109
/*
 * Select the wide character encoding by name.
 *
 * Not implemented yet: the call has no effect.
 */
void
set_wide_encoding(const char *encoding)
{
	(void) encoding;	/* placeholder, the name is not used */
}
114
115 /*
116 * Reset the scanner variables and open the supplied charmap file.
117 */
118 void
119 reset_scanner(const char *fname)
120 {
121 input = fopen(fname, "r");
122 if (input == NULL) {
123 perror("fopen");
124 exit(4);
125 }
126
127 filename = fname;
128 com_char = '#';
129 esc_char = '\\';
130 instring = 0;
131 escaped = 0;
132 lineno = 1;
133 nextline = 1;
134 tokidx = 0;
135 wideidx = 0;
136 }
137
138 static int
139 scanc(void)
140 {
141 int c;
142
143 c = getc(input);
144 lineno = nextline;
145 if (c == '\n') {
146 nextline++;
147 }
148
149 return (c);
150 }
151
152 static void
153 unscanc(int c)
154 {
155 if (c == '\n') {
156 nextline--;
157 }
158
159 if (ungetc(c, input) < 0) {
160 yyerror(_("ungetc failed"));
161 }
162 }
163
/*
 * Scan exactly two hexadecimal digits and return the value they encode.
 * A non-hex character is reported through yyerror().
 */
static int
scan_hex_byte(void)
{
	int value = 0;
	int i;

	for (i = 0; i < 2; i++) {
		int digit = scanc();

		if (!isxdigit(digit)) {
			yyerror(_("malformed hex digit"));
			return (0);
		}
		value = (value << 4) | hex(digit);
	}

	return (value);
}
183
/*
 * Scan a decimal byte constant: two mandatory digits followed by an
 * optional third.  Returns the value, which is guaranteed to fit in a
 * byte; a malformed or out-of-range constant is reported through
 * yyerror().
 */
static int
scan_dec_byte(void)
{
	int c;
	int b = 0;
	int i;

	/* The first two digits are required. */
	for (i = 0; i < 2; i++) {
		c = scanc();
		if (!isdigit(c)) {
			yyerror(_("malformed decimal digit"));
			return (0);
		}
		b = (b * 10) + (c - '0');
	}

	/* The third digit is optional; hand back anything else. */
	c = scanc();
	if (isdigit(c)) {
		b = (b * 10) + (c - '0');
	} else if (c != EOF) {
		/* At end of file there is nothing to push back. */
		unscanc(c);
	}

	/* Three digits can encode up to 999, but a byte only holds 255. */
	if (b > UCHAR_MAX) {
		yyerror(_("decimal value out of byte range"));
		return (0);
	}

	return (b);
}
212
/*
 * Scan an octal byte constant: two mandatory digits followed by an
 * optional third.  Returns the value, which is guaranteed to fit in a
 * byte; a malformed or out-of-range constant is reported through
 * yyerror().
 */
static int
scan_oct_byte(void)
{
	int c;
	int b = 0;
	int i;

	/* The first two digits are required. */
	for (i = 0; i < 2; i++) {
		c = scanc();
		if (!isodigit(c)) {
			yyerror(_("malformed octal digit"));
			return (0);
		}
		b = (b * 8) + (c - '0');
	}

	/* The third digit is optional; hand back anything else. */
	c = scanc();
	if (isodigit(c)) {
		b = (b * 8) + (c - '0');
	} else if (c != EOF) {
		/* At end of file there is nothing to push back. */
		unscanc(c);
	}

	/* Three octal digits reach 0777 (511); a byte only holds 0377. */
	if (b > UCHAR_MAX) {
		yyerror(_("octal value out of byte range"));
		return (0);
	}

	return (b);
}
243
244 void
245 add_tok(int c)
246 {
247 if ((tokidx + 1) >= toksz) {
248 toksz += 64;
249
250 if ((token = realloc(token, toksz)) == NULL) {
251 yyerror(_("out of memory"));
252 tokidx = 0;
253 toksz = 0;
254 return;
255 }
256 }
257
258 token[tokidx++] = (char)c;
259 token[tokidx] = 0;
260 }
261
262 void
263 add_wcs(wchar_t c)
264 {
265 if ((wideidx + 1) >= widesz) {
266 widesz += 64;
267 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
268 if (widestr == NULL) {
269 yyerror(_("out of memory"));
270 wideidx = 0;
271 widesz = 0;
272 return;
273 }
274 }
275
276 widestr[wideidx++] = c;
277 widestr[wideidx] = 0;
278 }
279
280 wchar_t *
281 get_wcs(void)
282 {
283 wchar_t *ws = widestr;
284
285 wideidx = 0;
286 widestr = NULL;
287 widesz = 0;
288
289 if (ws == NULL) {
290 if ((ws = wsdup(L"")) == NULL) {
291 yyerror(_("out of memory"));
292 }
293 }
294
295 return (ws);
296 }
297
298 static int
299 get_byte(void)
300 {
301 int c;
302
303 if ((c = scanc()) != esc_char) {
304 unscanc(c);
305 return (EOF);
306 }
307
308 c = scanc();
309
310 switch (c) {
311 case 'd':
312 case 'D':
313 return (scan_dec_byte());
314 case 'x':
315 case 'X':
316 return (scan_hex_byte());
317 case '0' ... '7':
318 /* Put the character back so we can get it */
319 unscanc(c);
320 return (scan_oct_byte());
321 default:
322 unscanc(c);
323 unscanc(esc_char);
324 return (EOF);
325 }
326 }
327
/*
 * Translate the character that follows an escape character.
 *
 * The letters n, r, t, f, v, b and a map to the matching C control
 * character; every other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		int	letter;
		int	value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	unsigned int i;

	for (i = 0; i < (sizeof (escapes) / sizeof (escapes[0])); i++) {
		if (escapes[i].letter == c) {
			return (escapes[i].value);
		}
	}

	return (c);
}
350
351 int
352 get_wide(void)
353 {
354 char mbs[MB_LEN_MAX + 1] = "";
355 int mbi = 0;
356 int c;
357 wchar_t wc;
358
359 if (mb_cur_max >= sizeof (mbs)) {
360 yyerror(_("max multibyte character size too big"));
361 mbi = 0;
362 return (T_NULL);
363 }
364
365 for (;;) {
366 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
367 /*
368 * End of the byte sequence reached, but no
369 * valid wide decoding. Fatal error.
370 */
371 mbi = 0;
372 yyerror(_("not a valid character encoding"));
373 return (T_NULL);
374 }
375
376 mbs[mbi++] = c;
377 mbs[mbi] = 0;
378
379 if (mbi == mb_cur_max) {
380 break;
381 }
382 }
383
384 mbi = 0;
385 /* XXX */
386 yylval.wc = (uint8_t)*mbs;
387
388 return (T_CHAR);
389 }
390
391 int
392 get_symbol(void)
393 {
394 int c;
395
396 while ((c = scanc()) != EOF) {
397 if (escaped == 1) {
398 escaped = 0;
399 if (c == '\n') {
400 continue;
401 }
402
403 add_tok(get_escaped(c));
404 continue;
405 }
406
407 if (c == esc_char) {
408 escaped = 1;
409 continue;
410 }
411
412 if (c == '\n') { /* Well that's strange! */
413 yyerror(_("unterminated symbolic name"));
414 continue;
415 }
416
417 if (c == '>') { /* End of symbol */
418 /*
419 * This restarts the token from the beginning
420 * the next time we scan a character. (This
421 * token is complete.)
422 */
423 if (token == NULL) {
424 yyerror(_("missing symbolic name"));
425 return (T_NULL);
426 }
427
428 tokidx = 0;
429
430 /*
431 * A few symbols are handled as keywords outside
432 * of the normal categories.
433 */
434 if (category == T_END) {
435 int i;
436
437 for (i = 0; symwords[i].name != 0; i++) {
438 if (strcmp(token, symwords[i].name) ==
439 0) {
440 last_kw = symwords[i].id;
441 return (last_kw);
442 }
443 }
444 }
445
446 /* XXX */
447
448 /* Its an undefined symbol */
449 yylval.token = strdup(token);
450 token = NULL;
451 toksz = 0;
452 tokidx = 0;
453 printf("returning SYMBOL %s\n", yylval.token);
454 return (T_SYMBOL);
455 }
456
457 add_tok(c);
458 }
459
460 yyerror(_("unterminated symbolic name"));
461
462 return (EOF);
463 }
464
465 static int
466 consume_token(void)
467 {
468 int len = tokidx;
469 int i;
470
471 tokidx = 0;
472 if (token == NULL) {
473 return (T_NULL);
474 }
475
476 /*
477 * This one is special, because we don't want it to alter the
478 * last_kw field.
479 */
480 if (strcmp(token, "...") == 0) {
481 return (T_ELLIPSIS);
482 }
483
484 /* Search for reserved words first */
485 for (i = 0; keywords[i].name; i++) {
486 int j;
487
488 if (strcmp(keywords[i].name, token)) {
489 continue;
490 }
491
492 last_kw = keywords[i].id;
493
494 /* Clear the top level category if we're done with it */
495 if (last_kw == T_END) {
496 category = T_END;
497 }
498
499 /* Set the top level category if we're changing */
500 for (j = 0; categories[j]; j++) {
501 if (categories[j] != last_kw) {
502 continue;
503 }
504 category = last_kw;
505 }
506
507 return (keywords[i].id);
508 }
509
510 /* Maybe its a numeric constant? */
511 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
512 char *eptr;
513
514 yylval.num = strtol(token, &eptr, 10);
515 if (*eptr != 0) {
516 yyerror(_("malformed number"));
517 }
518
519 return (T_NUMBER);
520 }
521
522 /*
523 * A single lone character is treated as a character literal.
524 * To avoid duplication of effort, we stick in the charmap.
525 */
526 if (len == 1) {
527 yylval.wc = token[0];
528 return (T_CHAR);
529 }
530
531 /* Anything else is treated as a symbolic name */
532 yylval.token = strdup(token);
533 token = NULL;
534 toksz = 0;
535 tokidx = 0;
536
537 return (T_NAME);
538 }
539
/*
 * Discard input up to and including the next newline.  Reaching end of
 * file first is reported through errf().
 */
void
scan_to_eol(void)
{
	int ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n') {
			break;
		}
		if (ch == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}

	assert(ch == '\n');
}
555
556 int
557 yylex(void)
558 {
559 int c;
560
561 while ((c = scanc()) != EOF) {
562 printf("--- yylex --%c--\n", c);
563
564 /* Special handling for quoted strings */
565 if (instring == 1) {
566 if (escaped == 1) {
567 escaped = 0;
568
569 /* If newline, just eat and forget it */
570 if (c == '\n') {
571 continue;
572 }
573
574 if (strchr("xd01234567", c)) {
575 unscanc(c);
576 unscanc(esc_char);
577 return (get_wide());
578 }
579
580 yylval.wc = get_escaped(c);
581 return (T_CHAR);
582 }
583
584 if (c == esc_char) {
585 escaped = 1;
586 continue;
587 }
588
589 switch (c) {
590 case '<':
591 return (get_symbol());
592 case '>':
593 /* Opps! Should generate syntax error */
594 return (T_GT);
595 case '"':
596 instring = 0;
597 return (T_QUOTE);
598 default:
599 yylval.wc = c;
600 return (T_CHAR);
601 }
602 }
603
604 /* Escaped characters first */
605 if (escaped == 1) {
606 escaped = 0;
607 if (c == '\n') {
608 /* Eat the newline */
609 continue;
610 }
611 hadtok = 1;
612 if (tokidx != 0) {
613 /* An escape mid-token is nonsense */
614 return (T_NULL);
615 }
616
617 /* Numeric escapes are treated as wide characters */
618 if (strchr("xXd01234567", c)) {
619 unscanc(c);
620 unscanc(esc_char);
621 return (get_wide());
622 }
623
624 add_tok(get_escaped(c));
625 continue;
626 }
627
628 /* If it is the escape character itself note it */
629 if (c == esc_char) {
630 escaped = 1;
631 continue;
632 }
633
634 /* Remove from the comment character to end of line */
635 if (c == com_char) {
636 while (c != '\n') {
637 if ((c = scanc()) == EOF) {
638 /* End of file without newline */
639 return (EOF);
640 }
641 }
642
643 assert(c == '\n');
644
645 if (hadtok == 0) {
646 /*
647 * If there were no tokens on this line,
648 * then just pretend it didn't exist at all.
649 */
650 continue;
651 }
652
653 hadtok = 0;
654 return (T_NL);
655 }
656
657 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
658 /*
659 * These are all token delimiters. If there
660 * is a token already in progress, we need to
661 * process it.
662 */
663 unscanc(c);
664 return (consume_token());
665 }
666
667 switch (c) {
668 case '\n':
669 if (hadtok == 0) {
670 /*
671 * If the line was completely devoid of tokens,
672 * then just ignore it.
673 */
674 continue;
675 }
676
677 /* We're starting a new line, reset the token state */
678 hadtok = 0;
679 return (T_NL);
680 case '>':
681 hadtok = 1;
682 return (T_GT);
683 case '<':
684 /* Symbol start! */
685 hadtok = 1;
686 return (get_symbol());
687 case ' ':
688 case '\t':
689 /* Whitespace, just ignore */
690 continue;
691 case '"':
692 hadtok = 1;
693 instring = 1;
694 return (T_QUOTE);
695 default:
696 //printf("--- adding %c to token\n", c);
697 hadtok = 1;
698 add_tok(c);
699 continue;
700 }
701 }
702
703 return (EOF);
704 }
705
706 void
707 yyerror(const char *msg)
708 {
709 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
710 filename, lineno, msg);
711 exit(4);
712 }
713
714 void
715 errf(const char *fmt, ...)
716 {
717 char *msg;
718 va_list va;
719
720 va_start(va, fmt);
721 (void) vasprintf(&msg, fmt, va);
722 va_end(va);
723
724 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
725 filename, lineno, msg);
726 free(msg);
727 exit(4);
728 }