Print this page
9083 replace regex implementation with tre
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/awk_xpg4/awk1.c
+++ new/usr/src/cmd/awk_xpg4/awk1.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * Copyright 1986, 1994 by Mortice Kern Systems Inc. All rights reserved.
28 28 */
29 29
30 30 /*
31 31 * awk -- mainline, yylex, etc.
32 32 *
33 33 * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
34 34 */
35 35
36 36 #include "awk.h"
37 37 #include "y.tab.h"
38 38 #include <stdarg.h>
39 39 #include <unistd.h>
40 40 #include <locale.h>
41 41 #include <search.h>
42 42
/*
 * File-scope lexer state and forward declarations.
 */
43 43 static char *progfiles[NPFILE]; /* Programme file names collected from -f, read by lexgetc() */
44 44 static char **progfilep = &progfiles[0]; /* One past the last stored file name */
45 45 static wchar_t *progptr; /* In-memory programme (or string being unescaped) */
46 46 static int proglen; /* Characters still unread at progptr */
47 47 static wchar_t context[NCONTEXT]; /* Circular buffer of recently read characters, dumped by awkierr() */
48 48 static wchar_t *conptr = &context[0]; /* Next slot to write in context[] */
49 49 static FILE *progfp; /* Stdio stream for the programme file being read */
50 50 static char *filename; /* Name of the current programme source, used in diagnostics */
51 51 #ifdef DEBUG
52 52 static int dflag; /* Set by -d: yylex() prints each token value */
53 53 #endif
54 54
/* NOTE(review): neither macro is referenced in the visible part of this file. */
55 55 #define AWK_EXEC_MAGIC "<MKS AWKC>"
56 56 #define LEN_EXEC_MAGIC 10
57 57
/* Format handed to awkerr() by yylex() for unmatched (), {} and []. */
58 58 static char unbal[] = "unbalanced E char";
59 59
60 60 static void awkarginit(int c, char **av);
61 61 static int lexid(wint_t c);
62 62 static int lexnumber(wint_t c);
63 63 static int lexstring(wint_t endc);
64 64 static int lexregexp(wint_t endc);
65 65
66 66 static void awkvarinit(void);
67 67 static wint_t lexgetc(void);
68 68 static void lexungetc(wint_t c);
69 69 static size_t lexescape(wint_t endc, int regx, int cmd_line_operand);
70 70 static void awkierr(int perr, char *fmt, va_list ap);
71 71 static int usage(void);
72 72 void strescape(wchar_t *str);
73 73 static const char *toprint(wint_t);
74 74 char *_cmdname; /* argv[0]; prefix of every diagnostic printed by awkierr() */
75 75 static wchar_t *mbconvert(char *str);
76 76
77 77 extern int isclvar(wchar_t *arg);
78 78
79 79 /*
80 80 * mainline for awk
81 81 */
82 82 int
83 83 main(int argc, char *argv[])
84 84 {
85 85 wchar_t *ap;
86 86 char *cmd;
87 87
88 88 cmd = argv[0];
89 89 _cmdname = cmd;
90 90
91 91 linebuf = emalloc(NLINE * sizeof (wchar_t));
92 92
93 93 /*
94 94 * At this point only messaging should be internationalized.
95 95 * numbers are still scanned as in the Posix locale.
96 96 */
97 97 (void) setlocale(LC_ALL, "");
98 98 (void) setlocale(LC_NUMERIC, "C");
99 99 #if !defined(TEXT_DOMAIN)
100 100 #define TEXT_DOMAIN "SYS_TEST"
101 101 #endif
102 102 (void) textdomain(TEXT_DOMAIN);
103 103
104 104 awkvarinit();
105 105 /* running = 1; */
106 106 while (argc > 1 && *argv[1] == '-') {
107 107 void *save_ptr = NULL;
108 108 ap = mbstowcsdup(&argv[1][1]);
109 109 if (ap == NULL)
110 110 break;
111 111 if (*ap == '\0') {
112 112 free(ap);
113 113 break;
114 114 }
115 115 save_ptr = (void *) ap;
116 116 ++argv;
117 117 --argc;
118 118 if (*ap == '-' && ap[1] == '\0')
119 119 break;
120 120 for (; *ap != '\0'; ++ap) {
121 121 switch (*ap) {
122 122 #ifdef DEBUG
123 123 case 'd':
124 124 dflag = 1;
125 125 continue;
126 126
127 127 #endif
128 128 case 'f':
129 129 if (argc < 2) {
130 130 (void) fprintf(stderr,
131 131 gettext("Missing script file\n"));
132 132 return (1);
133 133 }
134 134 *progfilep++ = argv[1];
135 135 --argc;
136 136 ++argv;
137 137 continue;
138 138
139 139 case 'F':
140 140 if (ap[1] == '\0') {
141 141 if (argc < 2) {
142 142 (void) fprintf(stderr,
143 143 gettext("Missing field separator\n"));
144 144 return (1);
145 145 }
146 146 ap = mbstowcsdup(argv[1]);
147 147 --argc;
148 148 ++argv;
149 149 } else
150 150 ++ap;
151 151 strescape(ap);
152 152 strassign(varFS, linebuf, FALLOC,
153 153 wcslen(linebuf));
154 154 break;
155 155
156 156 case 'v': {
157 157 wchar_t *vp;
158 158 wchar_t *arg;
159 159
160 160 if (argc < 2) {
161 161 (void) fprintf(stderr,
162 162 gettext("Missing variable assignment\n"));
163 163 return (1);
164 164 }
165 165 arg = mbconvert(argv[1]);
166 166 /*
167 167 * Ensure the variable expression
168 168 * is valid (correct form).
169 169 */
170 170 if (((vp = wcschr(arg, '=')) != NULL) &&
171 171 isclvar(arg)) {
172 172 *vp = '\0';
173 173 strescape(vp+1);
174 174 strassign(vlook(arg), linebuf,
175 175 FALLOC|FSENSE,
176 176 wcslen(linebuf));
177 177 *vp = '=';
178 178 } else {
179 179 (void) fprintf(stderr, gettext(
180 180 "Invalid form for variable "
181 181 "assignment: %S\n"), arg);
182 182 return (1);
183 183 }
184 184 --argc;
185 185 ++argv;
186 186 continue;
187 187 }
188 188
189 189 default:
190 190 (void) fprintf(stderr,
191 191 gettext("Unknown option \"-%S\"\n"), ap);
192 192 return (usage());
193 193 }
194 194 break;
195 195 }
196 196 if (save_ptr)
197 197 free(save_ptr);
198 198 }
199 199 if (progfilep == &progfiles[0]) {
200 200 if (argc < 2)
201 201 return (usage());
202 202 filename = "[command line]"; /* BUG: NEEDS TRANSLATION */
203 203 progptr = mbstowcsdup(argv[1]);
204 204 proglen = wcslen(progptr);
205 205 --argc;
206 206 ++argv;
207 207 }
208 208
209 209 argv[0] = cmd;
210 210
211 211 awkarginit(argc, argv);
212 212
213 213 /* running = 0; */
214 214 (void) yyparse();
215 215
216 216 lineno = 0;
217 217 /*
218 218 * Ok, done parsing, so now activate the rest of the nls stuff, set
219 219 * the radix character.
220 220 */
221 221 (void) setlocale(LC_ALL, "");
222 222 radixpoint = *localeconv()->decimal_point;
223 223 awk();
224 224 /* NOTREACHED */
225 225 return (0);
226 226 }
227 227
228 228 /*
229 229 * Do initial setup of buffers, etc.
230 230 * This must be called before most processing
231 231 * and especially before lexical analysis.
232 232 * Variables initialised here will be overruled by command
233 233 * line parameter initialisation.
234 234 */
235 235 static void
236 236 awkvarinit()
237 237 {
238 238 NODE *np;
239 239
240 240 (void) setvbuf(stderr, NULL, _IONBF, 0);
241 241
242 242 if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
243 243 (void) fprintf(stderr,
244 244 gettext("not enough available file descriptors"));
245 245 exit(1);
246 246 }
247 247 ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
248 248 #ifdef A_ZERO_POINTERS
249 249 (void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
250 250 #else
251 251 {
252 252 /* initialize file descriptor table */
253 253 OFILE *fp;
254 254 for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
255 255 fp->f_fp = FNULL;
256 256 fp->f_mode = 0;
257 257 fp->f_name = (char *)0;
258 258 }
259 259 }
260 260 #endif
261 261 constant = intnode((INT)0);
262 262
263 263 const0 = intnode((INT)0);
264 264 const1 = intnode((INT)1);
265 265 constundef = emptynode(CONSTANT, 0);
266 266 constundef->n_flags = FSTRING|FVINT;
267 267 constundef->n_string = _null;
268 268 constundef->n_strlen = 0;
269 269 inc_oper = emptynode(ADD, 0);
270 270 inc_oper->n_right = const1;
271 271 asn_oper = emptynode(ADD, 0);
272 272 field0 = node(FIELD, const0, NNULL);
273 273
274 274 {
275 275 RESFUNC near*rp;
276 276
277 277 for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
278 278 np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
279 279 }
280 280 }
281 281 {
282 282 RESERVED near*rp;
283 283
284 284 for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
285 285 switch (rp->r_type) {
286 286 case SVAR:
287 287 case VAR:
288 288 running = 1;
289 289 np = vlook(rp->r_name);
290 290 if (rp->r_type == SVAR)
291 291 np->n_flags |= FSPECIAL;
292 292 if (rp->r_svalue != NULL)
293 293 strassign(np, rp->r_svalue, FSTATIC,
294 294 (size_t)rp->r_ivalue);
295 295 else {
296 296 constant->n_int = rp->r_ivalue;
297 297 (void) assign(np, constant);
298 298 }
299 299 running = 0;
300 300 break;
301 301
302 302 case KEYWORD:
303 303 kinstall(rp->r_name, (int)rp->r_ivalue);
304 304 break;
305 305 }
306 306 }
307 307 }
308 308
309 309 varNR = vlook(s_NR);
310 310 varFNR = vlook(s_FNR);
311 311 varNF = vlook(s_NF);
312 312 varOFMT = vlook(s_OFMT);
313 313 varCONVFMT = vlook(s_CONVFMT);
314 314 varOFS = vlook(s_OFS);
315 315 varORS = vlook(s_ORS);
316 316 varRS = vlook(s_RS);
317 317 varFS = vlook(s_FS);
318 318 varARGC = vlook(s_ARGC);
319 319 varSUBSEP = vlook(s_SUBSEP);
320 320 varENVIRON = vlook(s_ENVIRON);
321 321 varFILENAME = vlook(s_FILENAME);
322 322 varSYMTAB = vlook(s_SYMTAB);
323 323 incNR = node(ASG, varNR, node(ADD, varNR, const1));
324 324 incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
325 325 clrFNR = node(ASG, varFNR, const0);
326 326 }
327 327
328 328 /*
329 329 * Initialise awk ARGC, ARGV variables.
330 330 */
331 331 static void
332 332 awkarginit(int ac, char **av)
333 333 {
334 334 int i;
335 335 wchar_t *cp;
336 336
337 337 ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
338 338 running = 1;
339 339 constant->n_int = ac;
340 340 (void) assign(varARGC, constant);
341 341 for (i = 0; i < ac; ++i) {
342 342 cp = mbstowcsdup(av[i]);
343 343 constant->n_int = i;
344 344 strassign(exprreduce(ARGVsubi), cp,
345 345 FSTATIC|FSENSE, wcslen(cp));
346 346 }
347 347 running = 0;
348 348 }
349 349
350 350 /*
351 351 * Clean up when done parsing a function.
352 352 * All formal parameters, because of a deal (funparm) in
353 353 * yylex, get put into the symbol table in front of any
354 354 * global variable of the same name. When the entire
355 355 * function is parsed, remove these formal dummy nodes
356 356 * from the symbol table but retain the nodes because
357 357 * the generated tree points at them.
358 358 */
359 359 void
360 360 uexit(NODE *np)
361 361 {
362 362 NODE *formal;
363 363
364 364 while ((formal = getlist(&np)) != NNULL)
365 365 delsymtab(formal, 0);
366 366 }
367 367
368 368 /*
 * The lexical analyzer.
 *
 * Returns the next token of the awk programme to the parser.  Characters
 * come from lexgetc(); identifiers, numbers, strings and regular
 * expressions are delegated to lexid(), lexnumber(), lexstring() and
 * lexregexp(), which leave their value in yylval.
 *
 * Besides recognising multi-character operators the function
 *   - returns a token queued in 'savetoken' by the previous call first;
 *   - turns a newline into ';' unless the previous token (lexlast)
 *     allows the statement to continue on the next line;
 *   - inserts an implicit CONCAT token between two adjacent operands,
 *     tracked with 'catterm';
 *   - maps single-character tokens to symbolic names through ctosym[].
 */
371 371 int
372 372 yylex()
373 373 {
374 374 wint_t c, c1;
375 375 int i;
/* Token held back by the previous call; returned before reading more input. */
376 376 static int savetoken = 0;
/* Set to 2 when a CONSTANT follows '$', decremented on every call. */
377 377 static int wasfield;
/* Set after DEFFUNC so the VAR that follows is not treated as an operand. */
378 378 static int isfuncdef;
/* Nesting depth of (), {} and []. */
379 379 static int nbrace, nparen, nbracket;
/* Single characters and the token names they are finally mapped to. */
380 380 static struct ctosymstruct {
381 381 wint_t c, sym;
382 382 } ctosym[] = {
383 383 { '|', BAR }, { '^', CARAT },
384 384 { '~', TILDE }, { '<', LANGLE },
385 385 { '>', RANGLE }, { '+', PLUSC },
386 386 { '-', HYPHEN }, { '*', STAR },
387 387 { '/', SLASH }, { '%', PERCENT },
388 388 { '!', EXCLAMATION }, { '$', DOLLAR },
389 389 { '[', LSQUARE }, { ']', RSQUARE },
390 390 { '(', LPAREN }, { ')', RPAREN },
391 391 { ';', SEMI }, { '{', LBRACE },
392 392 { '}', RBRACE }, { 0, 0 }
393 393 };
394 394
395 395 if (savetoken) {
396 396 c = savetoken;
397 397 savetoken = 0;
398 398 } else if (redelim != '\0') {
/*
 * redelim is set outside this function (presumably by the grammar when
 * a regular expression is expected -- confirm).  Read the expression up
 * to that delimiter; the delimiter itself is queued as the next token.
 */
399 399 c = redelim;
400 400 redelim = 0;
401 401 catterm = 0;
402 402 savetoken = c;
403 403 c = lexlast = lexregexp(c);
404 404 goto out;
405 405 } else while ((c = lexgetc()) != WEOF) {
/* 'continue' inside this loop discards the character and reads another. */
406 406 if (iswalpha(c) || c == '_') {
407 407 c = lexid(c);
408 408 } else if (iswdigit(c) || c == '.') {
409 409 c = lexnumber(c);
410 410 } else if (isWblank(c)) {
411 411 continue;
412 412 } else switch (c) {
413 413 #if DOS || OS2
414 414 case 032: /* ^Z */
415 415 continue;
416 416 #endif
417 417
418 418 case '"':
419 419 c = lexstring(c);
420 420 break;
421 421
422 422 case '#':
/* Comment: skip to end of line, leaving the newline to be read next. */
423 423 while ((c = lexgetc()) != '\n' && c != WEOF)
424 424 ;
425 425 lexungetc(c);
426 426 continue;
427 427
428 428 case '+':
429 429 if ((c1 = lexgetc()) == '+')
430 430 c = INC;
431 431 else if (c1 == '=')
432 432 c = AADD;
433 433 else
434 434 lexungetc(c1);
435 435 break;
436 436
437 437 case '-':
438 438 if ((c1 = lexgetc()) == '-')
439 439 c = DEC;
440 440 else if (c1 == '=')
441 441 c = ASUB;
442 442 else
443 443 lexungetc(c1);
444 444 break;
445 445
446 446 case '*':
/* "*=", "**=", "**" or plain '*'. */
447 447 if ((c1 = lexgetc()) == '=')
448 448 c = AMUL;
449 449 else if (c1 == '*') {
450 450 if ((c1 = lexgetc()) == '=')
451 451 c = AEXP;
452 452 else {
453 453 c = EXP;
454 454 lexungetc(c1);
455 455 }
456 456 } else
457 457 lexungetc(c1);
458 458 break;
459 459
460 460 case '^':
461 461 if ((c1 = lexgetc()) == '=') {
462 462 c = AEXP;
463 463 } else {
464 464 c = EXP;
465 465 lexungetc(c1);
466 466 }
467 467 break;
468 468
469 469 case '/':
/*
 * "/=" is the divide-assign operator only when the previous token is
 * not one of those listed below; otherwise the '=' is pushed back and
 * '/' is returned on its own.
 */
470 470 if ((c1 = lexgetc()) == '=' &&
471 471 lexlast != RE && lexlast != NRE &&
472 472 lexlast != ';' && lexlast != '\n' &&
473 473 lexlast != ',' && lexlast != '(')
474 474 c = ADIV;
475 475 else
476 476 lexungetc(c1);
477 477 break;
478 478
479 479 case '%':
480 480 if ((c1 = lexgetc()) == '=')
481 481 c = AREM;
482 482 else
483 483 lexungetc(c1);
484 484 break;
485 485
486 486 case '&':
487 487 if ((c1 = lexgetc()) == '&')
488 488 c = AND;
489 489 else
490 490 lexungetc(c1);
491 491 break;
492 492
493 493 case '|':
/* A single '|' inside a print statement is the PIPE redirection. */
494 494 if ((c1 = lexgetc()) == '|')
495 495 c = OR;
496 496 else {
497 497 lexungetc(c1);
498 498 if (inprint)
499 499 c = PIPE;
500 500 }
501 501 break;
502 502
503 503 case '>':
/* An unparenthesised '>' inside a print statement is the WRITE redirection. */
504 504 if ((c1 = lexgetc()) == '=')
505 505 c = GE;
506 506 else if (c1 == '>')
507 507 c = APPEND;
508 508 else {
509 509 lexungetc(c1);
510 510 if (nparen == 0 && inprint)
511 511 c = WRITE;
512 512 }
513 513 break;
514 514
515 515 case '<':
516 516 if ((c1 = lexgetc()) == '=')
517 517 c = LE;
518 518 else
519 519 lexungetc(c1);
520 520 break;
521 521
522 522 case '!':
523 523 if ((c1 = lexgetc()) == '=')
524 524 c = NE;
525 525 else if (c1 == '~')
526 526 c = NRE;
527 527 else
528 528 lexungetc(c1);
529 529 break;
530 530
531 531 case '=':
532 532 if ((c1 = lexgetc()) == '=')
533 533 c = EQ;
534 534 else {
535 535 lexungetc(c1);
536 536 c = ASG;
537 537 }
538 538 break;
539 539
540 540 case '\n':
/*
 * A newline ends the statement (becomes ';') unless the previous token
 * shows that the statement continues, in which case it is skipped.
 */
541 541 switch (lexlast) {
542 542 case ')':
543 543 if (catterm || inprint) {
544 544 c = ';';
545 545 break;
546 546 }
547 547 /*FALLTHRU*/
548 548 case AND:
549 549 case OR:
550 550 case COMMA:
551 551 case '{':
552 552 case ELSE:
553 553 case ';':
554 554 case DO:
555 555 continue;
556 556
557 557 case '}':
558 558 if (nbrace != 0)
559 559 continue;
560 560
/*FALLTHRU*/
561 561 default:
562 562 c = ';';
563 563 break;
564 564 }
565 565 break;
566 566
567 567 case ELSE:
/*
 * NOTE(review): this switch is on a raw input character, so this label
 * matches only a character whose code equals the ELSE token value --
 * confirm whether the case is reachable.
 */
568 568 if (lexlast != ';') {
569 569 savetoken = ELSE;
570 570 c = ';';
571 571 }
572 572 break;
573 573
574 574 case '(':
575 575 ++nparen;
576 576 break;
577 577
578 578 case ')':
579 579 if (--nparen < 0)
580 580 awkerr(unbal, "()");
581 581 break;
582 582
583 583 case '{':
584 584 nbrace++;
585 585 break;
586 586
587 587 case '}':
588 588 if (--nbrace < 0) {
589 589 char brk[3];
590 590
591 591 brk[0] = '{';
592 592 brk[1] = '}';
593 593 brk[2] = '\0';
594 594 awkerr(unbal, brk);
595 595 }
/* Supply the ';' missing before '}' and return the brace on the next call. */
596 596 if (lexlast != ';') {
597 597 savetoken = c;
598 598 c = ';';
599 599 }
600 600 break;
601 601
602 602 case '[':
603 603 ++nbracket;
604 604 break;
605 605
606 606 case ']':
607 607 if (--nbracket < 0) {
608 608 char brk[3];
609 609
610 610 brk[0] = '[';
611 611 brk[1] = ']';
612 612 brk[2] = '\0';
613 613 awkerr(unbal, brk);
614 614 }
615 615 break;
616 616
617 617 case '\\':
/* Backslash-newline is a line continuation and is dropped. */
618 618 if ((c1 = lexgetc()) == '\n')
619 619 continue;
620 620 lexungetc(c1);
621 621 break;
622 622
623 623 case ',':
624 624 c = COMMA;
625 625 break;
626 626
627 627 case '?':
628 628 c = QUEST;
629 629 break;
630 630
631 631 case ':':
632 632 c = COLON;
633 633 break;
634 634
635 635 default:
636 636 if (!iswprint(c))
637 637 awkerr(
638 638 gettext("invalid character \"%s\""),
639 639 toprint(c));
640 640 break;
641 641 }
/* A token has been recognised: leave the read loop. */
642 642 break;
643 643 }
644 644
/*
 * Implicit concatenation.  catterm is non-zero when the previous token
 * ended an operand.  If the current token starts another operand while
 * catterm is set, CONCAT is returned now and the current token is
 * queued in savetoken for the next call.
 */
645 645 switch (c) {
646 646 case ']':
647 647 ++catterm;
648 648 break;
649 649
650 650 case VAR:
651 651 if (catterm) {
652 652 savetoken = c;
653 653 c = CONCAT;
654 654 catterm = 0;
655 655 } else if (!isfuncdef) {
/* A name followed by '(' does not end an operand. */
656 656 if ((c1 = lexgetc()) != '(')
657 657 ++catterm;
658 658 lexungetc(c1);
659 659 }
660 660 isfuncdef = 0;
661 661 break;
662 662
663 663 case PARM:
664 664 case CONSTANT:
665 665 if (catterm) {
666 666 savetoken = c;
667 667 c = CONCAT;
668 668 catterm = 0;
669 669 } else {
670 670 if (lexlast == '$')
671 671 wasfield = 2;
672 672 ++catterm;
673 673 }
674 674 break;
675 675
676 676 case INC:
677 677 case DEC:
/*
 * ++/-- is handled like an operand start (below) only directly after a
 * CONSTANT that was not a field number.
 */
678 678 if (!catterm || lexlast != CONSTANT || wasfield)
679 679 break;
680 680
681 681 /*FALLTHRU*/
682 682 case UFUNC:
683 683 case FUNC:
684 684 case GETLINE:
685 685 case '!':
686 686 case '$':
687 687 case '(':
688 688 if (catterm) {
689 689 savetoken = c;
690 690 c = CONCAT;
691 691 catterm = 0;
692 692 }
693 693 break;
694 694
695 695 case '}':
/* Closing the outermost brace queues a ';' for the next call. */
696 696 if (nbrace == 0)
697 697 savetoken = ';';
698 698 /*FALLTHRU*/
699 699 case ';':
700 700 inprint = 0;
701 701 /*FALLTHRU*/
702 702 default:
703 703 if (c == DEFFUNC)
704 704 isfuncdef = 1;
705 705 catterm = 0;
706 706 }
707 707 lexlast = c;
708 708 if (wasfield)
709 709 wasfield--;
710 710 /*
711 711 * Map character constants to symbolic names.
712 712 */
713 713 for (i = 0; ctosym[i].c != 0; i++)
714 714 if (c == ctosym[i].c) {
715 715 c = ctosym[i].sym;
716 716 break;
717 717 }
/* The regular-expression path jumps here, bypassing the bookkeeping above. */
718 718 out:
719 719 #ifdef DEBUG
720 720 if (dflag)
721 721 (void) printf("%d\n", (int)c);
722 722 #endif
723 723 return ((int)c);
724 724 }
725 725
726 726 /*
727 727 * Read a number for the lexical analyzer.
728 728 * Input is the first character of the number.
729 729 * Return value is the lexical type.
730 730 */
731 731 static int
732 732 lexnumber(wint_t c)
733 733 {
734 734 wchar_t *cp;
735 735 int dotfound = 0;
736 736 int efound = 0;
737 737 INT number;
738 738
739 739 cp = linebuf;
740 740 do {
741 741 if (iswdigit(c))
742 742 ;
743 743 else if (c == '.') {
744 744 if (dotfound++)
745 745 break;
746 746 } else if (c == 'e' || c == 'E') {
747 747 if ((c = lexgetc()) != '-' && c != '+') {
748 748 lexungetc(c);
749 749 c = 'e';
750 750 } else
751 751 *cp++ = 'e';
752 752 if (efound++)
753 753 break;
754 754 } else
755 755 break;
756 756 *cp++ = c;
757 757 } while ((c = lexgetc()) != WEOF);
758 758 *cp = '\0';
759 759 if (dotfound && cp == linebuf+1)
760 760 return (DOT);
761 761 lexungetc(c);
762 762 errno = 0;
763 763 if (!dotfound && !efound &&
764 764 ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
765 765 yylval.node = intnode(number);
766 766 else
767 767 yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
768 768 return (CONSTANT);
769 769 }
770 770
771 771 /*
772 772 * Read an identifier.
773 773 * Input is first character of identifier.
774 774 * Return VAR.
775 775 */
776 776 static int
777 777 lexid(wint_t c)
778 778 {
779 779 wchar_t *cp;
780 780 size_t i;
781 781 NODE *np;
782 782
783 783 cp = linebuf;
784 784 do {
785 785 *cp++ = c;
786 786 c = lexgetc();
787 787 } while (iswalpha(c) || iswdigit(c) || c == '_');
788 788 *cp = '\0';
789 789 lexungetc(c);
790 790 yylval.node = np = vlook(linebuf);
791 791
792 792 switch (np->n_type) {
793 793 case KEYWORD:
794 794 switch (np->n_keywtype) {
795 795 case PRINT:
796 796 case PRINTF:
797 797 ++inprint;
798 798 default:
799 799 return ((int)np->n_keywtype);
800 800 }
801 801 /* NOTREACHED */
802 802
803 803 case ARRAY:
804 804 case VAR:
805 805 /*
806 806 * If reading the argument list, create a dummy node
807 807 * for the duration of that function. These variables
808 808 * can be removed from the symbol table at function end
809 809 * but they must still exist because the execution tree
810 810 * knows about them.
811 811 */
812 812 if (funparm) {
813 813 do_funparm:
814 814 np = emptynode(PARM, i = (cp-linebuf));
815 815 np->n_flags = FSTRING;
816 816 np->n_string = _null;
817 817 np->n_strlen = 0;
818 818 (void) memcpy(np->n_name, linebuf,
819 819 (i+1) * sizeof (wchar_t));
820 820 addsymtab(np);
821 821 yylval.node = np;
822 822 } else if (np == varNF || (np == varFS &&
823 823 (!doing_begin || begin_getline))) {
824 824 /*
825 825 * If the user program references NF or sets
826 826 * FS either outside of a begin block or
827 827 * in a begin block after a getline then the
828 828 * input line will be split immediately upon read
829 829 * rather than when a field is first referenced.
830 830 */
831 831 needsplit = 1;
832 832 } else if (np == varENVIRON)
833 833 needenviron = 1;
834 834 /*FALLTHRU*/
835 835 case PARM:
836 836 return (VAR);
837 837
838 838 case UFUNC:
839 839 /*
840 840 * It is ok to redefine functions as parameters
841 841 */
842 842 if (funparm) goto do_funparm;
843 843 /*FALLTHRU*/
844 844 case FUNC:
845 845 case GETLINE:
846 846 /*
847 847 * When a getline is encountered, clear the 'doing_begin' flag.
848 848 * This will force the 'needsplit' flag to be set, even inside
849 849 * a begin block, if FS is altered. (See VAR case above)
850 850 */
851 851 if (doing_begin)
852 852 begin_getline = 1;
853 853 return (np->n_type);
854 854 }
855 855 /* NOTREACHED */
856 856 return (0);
857 857 }
858 858
859 859 /*
860 860 * Read a string for the lexical analyzer.
861 861 * `endc' terminates the string.
862 862 */
863 863 static int
864 864 lexstring(wint_t endc)
865 865 {
866 866 size_t length = lexescape(endc, 0, 0);
867 867
868 868 yylval.node = stringnode(linebuf, FALLOC, length);
869 869 return (CONSTANT);
870 870 }
871 871
872 872 /*
873 873 * Read a regular expression.
874 874 */
875 875 static int
876 876 lexregexp(wint_t endc)
877 877 {
878 878 (void) lexescape(endc, 1, 0);
879 879 yylval.node = renode(linebuf);
880 880 return (URE);
881 881 }
882 882
883 883 /*
884 884 * Process a string, converting the escape characters as required by
885 885 * 1003.2. The processed string ends up in the global linebuf[]. This
886 886 * routine also changes the value of 'progfd' - the program file
887 887 * descriptor, so it should be used with some care. It is presently used to
888 888 * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
889 889 */
890 890 void
891 891 strescape(wchar_t *str)
892 892 {
893 893 progptr = str;
894 894 proglen = wcslen(str) + 1; /* Include \0 */
895 895 (void) lexescape('\0', 0, 1);
896 896 progptr = NULL;
897 897 }
898 898
899 899 /*
900 900 * Read a string or regular expression, terminated by ``endc'',
901 901 * for lexical analyzer, processing escape sequences.
902 902 * Return string length.
903 903 */
904 904 static size_t
905 905 lexescape(wint_t endc, int regx, int cmd_line_operand)
906 906 {
907 907 static char nlre[256];
908 908 static char nlstr[256];
909 909 static char eofre[256];
910 910 static char eofstr[256];
911 911 int first_time = 1;
912 912 wint_t c;
913 913 wchar_t *cp;
914 914 int n, max;
915 915
916 916 if (first_time == 1) {
917 917 (void) strcpy(nlre, gettext("Newline in regular expression\n"));
918 918 (void) strcpy(nlstr, gettext("Newline in string\n"));
919 919 (void) strcpy(eofre, gettext("EOF in regular expression\n"));
920 920 (void) strcpy(eofstr, gettext("EOF in string\n"));
921 921 first_time = 0;
922 922 }
923 923
924 924 cp = linebuf;
925 925 while ((c = lexgetc()) != endc) {
926 926 if (c == '\n')
927 927 awkerr(regx ? nlre : nlstr);
928 928 if (c == '\\') {
929 929 switch (c = lexgetc(), c) {
930 930 case '\\':
931 931 if (regx)
932 932 *cp++ = '\\';
933 933 break;
934 934
935 935 case '/':
936 936 c = '/';
937 937 break;
938 938
939 939 case 'n':
940 940 c = '\n';
941 941 break;
942 942
943 943 case 'b':
944 944 c = '\b';
945 945 break;
946 946
947 947 case 't':
948 948 c = '\t';
949 949 break;
950 950
951 951 case 'r':
952 952 c = '\r';
953 953 break;
954 954
955 955 case 'f':
956 956 c = '\f';
957 957 break;
958 958
959 959 case 'v':
960 960 c = '\v';
961 961 break;
962 962
963 963 case 'a':
964 964 c = (char)0x07;
965 965 break;
966 966
967 967 case 'x':
968 968 n = 0;
969 969 while (iswxdigit(c = lexgetc())) {
970 970 if (iswdigit(c))
971 971 c -= '0';
972 972 else if (iswupper(c))
973 973 c -= 'A'-10;
974 974 else
975 975 c -= 'a'-10;
976 976 n = (n<<4) + c;
977 977 }
978 978 lexungetc(c);
979 979 c = n;
980 980 break;
981 981
982 982 case '0':
983 983 case '1':
984 984 case '2':
985 985 case '3':
986 986 case '4':
987 987 case '5':
988 988 case '6':
989 989 case '7':
990 990 #if 0
991 991 /*
992 992 * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
993 993 * requires processing of the octal escapes both in strings and
994 994 * regular expressions. The following code is disabled instead of
995 995 * removed as back-referencing may be reintroduced in a future draft
996 996 * of the standard.
997 997 */
998 998 /*
999 999 * For regular expressions, we disallow
1000 1000 * \ooo to mean octal character, in favour
1001 1001 * of back referencing.
1002 1002 */
1003 1003 if (regx) {
1004 1004 *cp++ = '\\';
1005 1005 break;
1006 1006 }
1007 1007 #endif
1008 1008 max = 3;
1009 1009 n = 0;
1010 1010 do {
1011 1011 n = (n<<3) + c-'0';
1012 1012 if ((c = lexgetc()) > '7' || c < '0')
1013 1013 break;
1014 1014 } while (--max);
1015 1015 lexungetc(c);
1016 1016 /*
1017 1017 * an octal escape sequence must have at least
1018 1018 * 2 digits after the backslash, otherwise
1019 1019 * it gets passed straight thru for possible
1020 1020 * use in backreferencing.
1021 1021 */
1022 1022 if (max == 3) {
1023 1023 *cp++ = '\\';
1024 1024 n += '0';
1025 1025 }
1026 1026 c = n;
1027 1027 break;
1028 1028
1029 1029 case '\n':
1030 1030 continue;
1031 1031
1032 1032 default:
1033 1033 if (c != endc || cmd_line_operand) {
1034 1034 *cp++ = '\\';
1035 1035 if (c == endc)
1036 1036 lexungetc(c);
1037 1037 }
1038 1038 }
1039 1039 }
1040 1040 if (c == WEOF)
1041 1041 awkerr(regx ? eofre : eofstr);
1042 1042 *cp++ = c;
1043 1043 }
1044 1044 *cp = '\0';
1045 1045 return (cp - linebuf);
1046 1046 }
1047 1047
1048 1048 /*
1049 1049 * Build a regular expression NODE.
1050 1050 * Argument is the string holding the expression.
1051 1051 */
1052 1052 NODE *
1053 1053 renode(wchar_t *s)
1054 1054 {
1055 1055 NODE *np;
1056 1056 int n;
1057 1057
1058 1058 np = emptynode(RE, 0);
1059 1059 np->n_left = np->n_right = NNULL;
1060 1060 if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
1061 1061 int m;
1062 1062 char *p;
1063 1063
1064 1064 m = REGWERROR(n, np->n_regexp, NULL, 0);
1065 1065 p = (char *)emalloc(m);
1066 1066 REGWERROR(n, np->n_regexp, p, m);
1067 1067 awkerr("/%S/: %s", s, p);
1068 1068 }
1069 1069 return (np);
1070 1070 }
1071 1071 /*
1072 1072 * Get a character for the lexical analyser routine.
1073 1073 */
1074 1074 static wint_t
1075 1075 lexgetc()
1076 1076 {
1077 1077 wint_t c;
1078 1078 static char **files = &progfiles[0];
1079 1079
1080 1080 if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
1081 1081 ;
1082 1082 else {
1083 1083 if (progptr != NULL) {
1084 1084 if (proglen-- <= 0)
1085 1085 c = WEOF;
1086 1086 else
1087 1087 c = *progptr++;
1088 1088 } else {
1089 1089 if (progfp != FNULL) {
1090 1090 if (progfp != stdin)
1091 1091 (void) fclose(progfp);
1092 1092 else
1093 1093 clearerr(progfp);
1094 1094 progfp = FNULL;
1095 1095 }
1096 1096 if (files < progfilep) {
1097 1097 filename = *files++;
1098 1098 lineno = 1;
1099 1099 if (filename[0] == '-' && filename[1] == '\0')
1100 1100 progfp = stdin;
1101 1101 else if ((progfp = fopen(filename, r))
1102 1102 == FNULL) {
1103 1103 (void) fprintf(stderr,
1104 1104 gettext("script file \"%s\""), filename);
1105 1105 exit(1);
1106 1106 }
1107 1107 c = fgetwc(progfp);
1108 1108 }
1109 1109 }
1110 1110 }
1111 1111 if (c == '\n')
1112 1112 ++lineno;
1113 1113 if (conptr >= &context[NCONTEXT])
1114 1114 conptr = &context[0];
1115 1115 if (c != WEOF)
1116 1116 *conptr++ = c;
1117 1117 return (c);
1118 1118 }
1119 1119
1120 1120 /*
1121 1121 * Return a character for lexical analyser.
1122 1122 * Only one returned character is (not enforced) legitimite.
1123 1123 */
1124 1124 static void
1125 1125 lexungetc(wint_t c)
1126 1126 {
1127 1127 if (c == '\n')
1128 1128 --lineno;
1129 1129 if (c != WEOF) {
1130 1130 if (conptr == &context[0])
1131 1131 conptr = &context[NCONTEXT];
1132 1132 *--conptr = '\0';
1133 1133 }
1134 1134 if (progfp != FNULL) {
1135 1135 (void) ungetwc(c, progfp);
1136 1136 return;
1137 1137 }
1138 1138 if (c == WEOF)
1139 1139 return;
1140 1140 *--progptr = c;
1141 1141 proglen++;
1142 1142 }
1143 1143
1144 1144 /*
1145 1145 * Syntax errors during parsing.
1146 1146 */
1147 1147 void
1148 1148 yyerror(char *s, ...)
1149 1149 {
1150 1150 if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
1151 1151 if (lexlast == KEYWORD)
1152 1152 awkerr(gettext("inadmissible use of reserved keyword"));
1153 1153 else
1154 1154 awkerr(gettext("attempt to redefine builtin function"));
1155 1155 awkerr(s);
1156 1156 }
1157 1157
/*
 * General error routine for awk: printf-style format and arguments,
 * forwarded to awkierr() with no errno text requested.
 */
/* ARGSUSED */
void
awkerr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(0, fmt, ap);
	va_end(ap);
}
1171 1171
/*
 * Same as "awkerr", but asks awkierr() to append the strerror() text
 * for the errno value in effect when the error was raised.
 */
/* ARGSUSED */
void
awkperr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(1, fmt, ap);
	va_end(ap);
}
1186 1186
1187 1187 /*
1188 1188 * Common internal routine for awkerr, awkperr
1189 1189 */
1190 1190 static void
1191 1191 awkierr(int perr, char *fmt, va_list ap)
1192 1192 {
1193 1193 static char sep1[] = "\n>>>\t";
1194 1194 static char sep2[] = "\t<<<";
1195 1195 int saveerr = errno;
1196 1196
1197 1197 (void) fprintf(stderr, "%s: ", _cmdname);
1198 1198 if (running) {
1199 1199 (void) fprintf(stderr, gettext("line %u ("),
1200 1200 curnode == NNULL ? 0 : curnode->n_lineno);
1201 1201 if (phase == 0)
1202 1202 (void) fprintf(stderr, "NR=%lld): ",
1203 1203 (INT)exprint(varNR));
1204 1204 else
1205 1205 (void) fprintf(stderr, "%s): ",
1206 1206 phase == BEGIN ? s_BEGIN : s_END);
1207 1207 } else if (lineno != 0) {
1208 1208 (void) fprintf(stderr, gettext("file \"%s\": "), filename);
1209 1209 (void) fprintf(stderr, gettext("line %u: "), lineno);
1210 1210 }
1211 1211 (void) vfprintf(stderr, gettext(fmt), ap);
1212 1212 if (perr == 1)
1213 1213 (void) fprintf(stderr, ": %s", strerror(saveerr));
1214 1214 if (perr != 2 && !running) {
1215 1215 wchar_t *cp;
1216 1216 int n;
1217 1217 int c;
1218 1218
1219 1219 (void) fprintf(stderr, gettext(" Context is:%s"), sep1);
1220 1220 cp = conptr;
1221 1221 n = NCONTEXT;
1222 1222 do {
1223 1223 if (cp >= &context[NCONTEXT])
1224 1224 cp = &context[0];
1225 1225 if ((c = *cp++) != '\0')
1226 1226 (void) fputs(c == '\n' ? sep1 : toprint(c),
1227 1227 stderr);
1228 1228 } while (--n != 0);
1229 1229 (void) fputs(sep2, stderr);
1230 1230 }
1231 1231 (void) fprintf(stderr, "\n");
1232 1232 exit(1);
1233 1233 }
1234 1234
1235 1235 wchar_t *
1236 1236 emalloc(unsigned n)
1237 1237 {
1238 1238 wchar_t *cp;
1239 1239
1240 1240 if ((cp = malloc(n)) == NULL)
1241 1241 awkerr(nomem);
1242 1242 return (cp);
1243 1243 }
1244 1244
1245 1245 wchar_t *
1246 1246 erealloc(wchar_t *p, unsigned n)
1247 1247 {
1248 1248 wchar_t *cp;
1249 1249
1250 1250 if ((cp = realloc(p, n)) == NULL)
1251 1251 awkerr(nomem);
1252 1252 return (cp);
1253 1253 }
1254 1254
1255 1255
/*
 * Print the awk usage synopsis on stderr.
 * Returns 2, which main() uses as its exit status.
 */
static int
usage(void)
{
	const char *text = gettext(
"Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
" awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n");

	(void) fprintf(stderr, text);
	return (2);
}
1267 1267
1268 1268
/*
 * Convert a multibyte string to a wide string via mbstowcsdup().
 * The result is held in a single static slot: each call releases the
 * string returned by the previous call, so the caller must not free the
 * result and must not keep it across calls.
 */
static wchar_t *
mbconvert(char *str)
{
	static wchar_t *last = NULL;

	free(last);		/* free(NULL) is a no-op on the first call */
	last = mbstowcsdup(str);
	return (last);
}
1278 1278
/*
 * Convert a wide string to a multibyte string via wcstombsdup().
 * The result is held in a single static slot: each call releases the
 * string returned by the previous call, so the caller must not free the
 * result and must not keep it across calls.
 */
char *
mbunconvert(wchar_t *str)
{
	static char *last = NULL;

	free(last);		/* free(NULL) is a no-op on the first call */
	last = wcstombsdup(str);
	return (last);
}
1288 1288
1289 1289 /*
1290 1290 * Solaris port - following functions are typical MKS functions written
1291 1291 * to work for Solaris.
1292 1292 */
1293 1293
/*
 * Return a newly malloc'ed wide-character copy of the multibyte string s.
 *
 * The buffer is sized for the worst case of one wide character per input
 * byte, plus the terminator.  Returns NULL if memory cannot be obtained
 * or if s contains an invalid multibyte sequence; in the latter case the
 * buffer is released and the errno set by mbstowcs() is preserved.
 * The caller owns the returned string and must free() it.
 */
wchar_t *
mbstowcsdup(char *s)
{
	size_t n;
	wchar_t *w;

	n = strlen(s) + 1;
	if ((w = malloc(n * sizeof (wchar_t))) == NULL)
		return (NULL);

	if (mbstowcs(w, s, n) == (size_t)-1) {
		int saverr = errno;

		/* Do not leak the buffer on a conversion failure. */
		free(w);
		errno = saverr;
		return (NULL);
	}
	return (w);
}
1309 1309
/*
 * Return a newly malloc'ed multibyte copy of the wide string w.
 *
 * A worst-case buffer (MB_CUR_MAX bytes per wide character, terminator
 * included) is allocated, filled by wcstombs() and then trimmed to the
 * actual length.  Returns NULL if memory cannot be obtained or if w holds
 * a character that cannot be converted; in the latter case errno from
 * wcstombs() is preserved.  If only the trimming realloc() fails, the
 * untrimmed (still valid and complete) buffer is returned rather than
 * being leaked.  The caller owns the returned string and must free() it.
 */
char *
wcstombsdup(wchar_t *w)
{
	size_t n;
	char *mb;
	char *trimmed;

	/* Fetch memory for worst case string length */
	n = (wcslen(w) + 1) * MB_CUR_MAX;
	if ((mb = malloc(n)) == NULL)
		return (NULL);

	/* Convert the string */
	if (wcstombs(mb, w, n) == (size_t)-1) {
		int saverr = errno;

		free(mb);
		errno = saverr;
		return (NULL);
	}

	/* Shrink the string down; keep the original block if that fails */
	if ((trimmed = realloc(mb, strlen(mb) + 1)) == NULL)
		return (mb);
	return (trimmed);
}
1338 1338
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus
 * the entries for the remaining character values are at indices 1..32.
 */
static const char *const upe_ctrls[] =
{
	"^?",
	"^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
	"^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
	"^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
	"^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.  If
 * the character is printable, simply return it as the string.  If it is in
 * the range specified by table 5-101 in the UPE, return the corresponding
 * string.  Otherwise, return an octal escape sequence, one \ooo group per
 * byte of the character's multibyte encoding.
 *
 * The result points either into upe_ctrls[] or at one of two static
 * buffers, so it is only valid until the next call.
 */
static const char *
toprint(wchar_t c)
{
	int n, len;
	unsigned char *ptr;
	static char mbch[MB_LEN_MAX+1];
	static char buf[5 * MB_LEN_MAX + 1];

	if ((n = wctomb(mbch, c)) == -1) {
		/*
		 * Should never happen.  The cast makes the argument match
		 * %x whatever the underlying type of wchar_t is.
		 */
		(void) snprintf(buf, sizeof (buf), "\\%x", (unsigned int)c);
		return (buf);
	}
	mbch[n] = '\0';
	if (iswprint(c))
		return (mbch);
	if (c == 127)
		return (upe_ctrls[0]);
	/*
	 * Print as in Table 5-101 in the UPE.  The unsigned comparison
	 * keeps a negative value from indexing before the table.
	 */
	if ((unsigned long)c < 32)
		return (upe_ctrls[c + 1]);

	/* Print as an octal escape sequence */
	len = 0;
	for (ptr = (unsigned char *)mbch; 0 < n; --n, ++ptr) {
		len += snprintf(buf + len, sizeof (buf) - (size_t)len,
		    "\\%03o", *ptr);
	}
	return (buf);
}
1388 1388
/*
 * Translate a byte offset into the multibyte encoding of astring into the
 * corresponding wide-character index.
 *
 * Wide characters are consumed from the start of astring, summing the
 * number of bytes wctomb() reports for each, until at least off bytes are
 * accounted for.  A character wctomb() cannot convert is counted as one
 * byte.  The walk also stops at the terminating null wide character, so
 * an offset beyond the end of the string yields the string's length
 * instead of reading past the terminator.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
	const wchar_t *s = astring;
	int c = 0;
	char mb[MB_LEN_MAX];

	while (c < off && *s != L'\0') {
		int n;

		if ((n = wctomb(mb, *s)) == 0)
			break;
		if (n == -1)
			n = 1;
		c += n;
		s++;
	}

	return ((int)(s - astring));
}
1408 1408
1409 1409 #define NREGHASH 64
1410 1410 #define NREGHOLD 1024 /* max number unused entries */
1411 1411
1412 1412 static int nregunref;
1413 1413
1414 1414 struct reghashq {
1415 1415 struct qelem hq;
1416 1416 struct regcache *regcachep;
1417 1417 };
1418 1418
1419 1419 struct regcache {
1420 1420 struct qelem lq;
1421 1421 wchar_t *pattern;
1422 1422 regex_t re;
1423 1423 int refcnt;
1424 1424 struct reghashq hash;
1425 1425 };
1426 1426
1427 1427 static struct qelem reghash[NREGHASH], reglink;
1428 1428
1429 1429 /*
1430 1430 * Generate a hash value of the given wchar string.
1431 1431 * The hashing method is similar to what Java does for strings.
1432 1432 */
1433 1433 static uint_t
1434 1434 regtxthash(const wchar_t *str)
1435 1435 {
1436 1436 int k = 0;
1437 1437
1438 1438 while (*str != L'\0')
1439 1439 k = (31 * k) + *str++;
1440 1440
1441 1441 k += ~(k << 9);
1442 1442 k ^= (k >> 14);
1443 1443 k += (k << 4);
1444 1444 k ^= (k >> 10);
1445 1445
1446 1446 return (k % NREGHASH);
1447 1447 }
1448 1448
1449 1449 int
1450 1450 int_regwcomp(REGEXP *r, const wchar_t *pattern)
1451 1451 {
1452 1452 regex_t re;
1453 1453 char *mbpattern;
1454 1454 int ret;
1455 1455 uint_t key;
1456 1456 struct qelem *qp;
1457 1457 struct regcache *rcp;
1458 1458
1459 1459 key = regtxthash(pattern);
1460 1460 for (qp = reghash[key].q_forw; qp != NULL; qp = qp->q_forw) {
1461 1461 rcp = ((struct reghashq *)qp)->regcachep;
1462 1462 if (*rcp->pattern == *pattern &&
1463 1463 wcscmp(rcp->pattern, pattern) == 0)
1464 1464 break;
1465 1465 }
1466 1466 if (qp != NULL) {
1467 1467 /* update link. put this one at the beginning */
1468 1468 if (rcp != (struct regcache *)reglink.q_forw) {
1469 1469 remque(&rcp->lq);
1470 1470 insque(&rcp->lq, ®link);
1471 1471 }
1472 1472 if (rcp->refcnt == 0)
1473 1473 nregunref--; /* no longer unref'ed */
1474 1474 rcp->refcnt++;
1475 1475 *(struct regcache **)r = rcp;
1476 1476 return (REG_OK);
1477 1477 }
1478 1478
1479 1479 if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL)
1480 1480 return (REG_ESPACE);
1481 1481
1482 1482 ret = regcomp(&re, mbpattern, REG_EXTENDED);
1483 1483
1484 1484 free(mbpattern);
1485 1485
1486 1486 if (ret != REG_OK)
1487 1487 return (ret);
1488 1488
1489 1489 if ((rcp = malloc(sizeof (struct regcache))) == NULL)
1490 1490 return (REG_ESPACE);
1491 1491 rcp->re = re;
1492 1492 if ((rcp->pattern = wsdup(pattern)) == NULL) {
1493 1493 regfree(&re);
1494 1494 free(rcp);
1495 1495 return (REG_ESPACE);
1496 1496 }
1497 1497 rcp->refcnt = 1;
1498 1498 insque(&rcp->lq, ®link);
1499 1499 insque(&rcp->hash.hq, ®hash[key]);
1500 1500 rcp->hash.regcachep = rcp;
1501 1501
1502 1502 *(struct regcache **)r = rcp;
1503 1503 return (ret);
1504 1504 }
1505 1505
1506 1506 void
1507 1507 int_regwfree(REGEXP r)
1508 1508 {
1509 1509 int cnt;
1510 1510 struct qelem *qp, *nqp;
1511 1511 struct regcache *rcp;
1512 1512
1513 1513 rcp = (struct regcache *)r;
1514 1514
1515 1515 if (--rcp->refcnt != 0)
1516 1516 return;
1517 1517
1518 1518 /* this cache has no reference */
1519 1519 if (++nregunref < NREGHOLD)
1520 1520 return;
1521 1521
1522 1522 /*
1523 1523 * We've got too much unref'ed regex. Free half of least
1524 1524 * used regex.
1525 1525 */
1526 1526 cnt = 0;
1527 1527 for (qp = reglink.q_forw; qp != NULL; qp = nqp) {
1528 1528 nqp = qp->q_forw;
1529 1529 rcp = (struct regcache *)qp;
1530 1530 if (rcp->refcnt != 0)
1531 1531 continue;
1532 1532
1533 1533 /* free half of them */
1534 1534 if (++cnt < (NREGHOLD / 2))
1535 1535 continue;
1536 1536
1537 1537 /* detach and free */
1538 1538 remque(&rcp->lq);
1539 1539 remque(&rcp->hash.hq);
1540 1540
1541 1541 /* free up */
1542 1542 free(rcp->pattern);
1543 1543 regfree(&rcp->re);
1544 1544 free(rcp);
1545 1545
1546 1546 nregunref--;
1547 1547 }
1548 1548 }
1549 1549
1550 1550 size_t
1551 1551 int_regwerror(int errcode, REGEXP r, char *errbuf, size_t bufsiz)
1552 1552 {
1553 1553 struct regcache *rcp;
1554 1554
1555 1555 rcp = (struct regcache *)r;
1556 1556 return (regerror(errcode, &rcp->re, errbuf, bufsiz));
1557 1557 }
1558 1558
1559 1559 int
1560 1560 int_regwexec(REGEXP r, /* compiled RE */
1561 1561 const wchar_t *astring, /* subject string */
1562 1562 size_t nsub, /* number of subexpressions */
1563 1563 int_regwmatch_t *sub, /* subexpression pointers */
1564 1564 int flags)
1565 1565 {
1566 1566 char *mbs;
1567 1567 regmatch_t *mbsub = NULL;
1568 1568 int i;
1569 1569 struct regcache *rcp;
1570 1570
1571 1571 if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
1572 1572 return (REG_ESPACE);
1573 1573
1574 1574 if (nsub > 0 && sub) {
1575 1575 if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1576 1576 return (REG_ESPACE);
1577 1577 }
1578 1578
1579 1579 rcp = (struct regcache *)r;
1580 1580
1581 1581 i = regexec(&rcp->re, mbs, nsub, mbsub, flags);
1582 1582
1583 1583 /* Now, adjust the pointers/counts in sub */
1584 1584 if (i == REG_OK && nsub > 0 && mbsub) {
1585 1585 int j, k;
1586 1586
1587 1587 for (j = 0; j < nsub; j++) {
1588 1588 regmatch_t *ms = &mbsub[j];
1589 1589 int_regwmatch_t *ws = &sub[j];
1590 1590
1591 1591 if ((k = ms->rm_so) >= 0) {
1592 1592 ws->rm_so = wcoff(astring, k);
1593 1593 ws->rm_sp = astring + ws->rm_so;
1594 1594 }
1595 1595 if ((k = ms->rm_eo) >= 0) {
1596 1596 ws->rm_eo = wcoff(astring, k);
1597 1597 ws->rm_ep = astring + ws->rm_eo;
1598 1598 }
1599 1599 }
1600 1600 }
1601 1601
1602 1602 free(mbs);
1603 1603 if (mbsub)
1604 1604 free(mbsub);
1605 1605 return (i);
1606 1606 }
1607 1607
1608 1608 int
1609 1609 int_regwdosuba(REGEXP rp, /* compiled RE: Pattern */
1610 1610 const wchar_t *rpl, /* replacement string: /rpl/ */
1611 1611 const wchar_t *src, /* source string */
1612 1612 wchar_t **dstp, /* destination string */
1613 1613 int len, /* destination length */
1614 1614 int *globp) /* IN: occurence, 0 for all; OUT: substitutions */
1615 1615 {
1616 1616 wchar_t *dst, *odst;
1617 1617 const wchar_t *ip, *xp;
1618 1618 wchar_t *op;
1619 1619 int i;
1620 1620 wchar_t c;
1621 1621 int glob, iglob = *globp, oglob = 0;
1622 1622 #define NSUB 10
1623 1623 int_regwmatch_t rm[NSUB], *rmp;
1624 1624 int flags;
1625 1625 wchar_t *end;
1626 1626 int regerr;
1627 1627
1628 1628 /* handle overflow of dst. we need "i" more bytes */
1629 1629 #ifdef OVERFLOW
1630 1630 #undef OVERFLOW
1631 1631 #define OVERFLOW(i) { \
1632 1632 int pos = op - dst; \
1633 1633 dst = (wchar_t *)realloc(odst = dst, \
1634 1634 (len += len + i) * sizeof (wchar_t)); \
1635 1635 if (dst == NULL) \
1636 1636 goto nospace; \
↓ open down ↓ |
1636 lines elided |
↑ open up ↑ |
1637 1637 op = dst + pos; \
1638 1638 end = dst + len; \
1639 1639 }
1640 1640 #endif
1641 1641
1642 1642 *dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
1643 1643 if (dst == NULL)
1644 1644 return (REG_ESPACE);
1645 1645
1646 1646 if (rp == NULL || rpl == NULL || src == NULL || dst == NULL)
1647 - return (REG_EFATAL);
1647 + return (REG_BADPAT);
1648 1648
1649 1649 glob = 0; /* match count */
1650 1650 ip = src; /* source position */
1651 1651 op = dst; /* destination position */
1652 1652 end = dst + len;
1653 1653
1654 1654 flags = 0;
1655 1655 while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1656 1656 /* Copy text preceding match */
1657 1657 if (op + (i = rm[0].rm_sp - ip) >= end)
1658 1658 OVERFLOW(i)
1659 1659 while (i--)
1660 1660 *op++ = *ip++;
1661 1661
1662 1662 if (iglob == 0 || ++glob == iglob) {
1663 1663 oglob++;
1664 1664 xp = rpl; /* do substitute */
1665 1665 } else
1666 1666 xp = L"&"; /* preserve text */
1667 1667
1668 1668 /* Perform replacement of matched substing */
1669 1669 while ((c = *xp++) != '\0') {
1670 1670 rmp = NULL;
1671 1671 if (c == '&')
1672 1672 rmp = &rm[0];
1673 1673 else if (c == '\\') {
1674 1674 if ('0' <= *xp && *xp <= '9')
1675 1675 rmp = &rm[*xp++ - '0'];
1676 1676 else if (*xp != '\0')
1677 1677 c = *xp++;
1678 1678 }
1679 1679
1680 1680 if (rmp == NULL) { /* Ordinary character. */
1681 1681 *op++ = c;
1682 1682 if (op >= end)
1683 1683 OVERFLOW(1)
1684 1684 } else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1685 1685 ip = rmp->rm_sp;
1686 1686 if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1687 1687 OVERFLOW(i)
1688 1688 while (i--)
1689 1689 *op++ = *ip++;
1690 1690 }
1691 1691 }
1692 1692
1693 1693 ip = rm[0].rm_ep;
1694 1694 if (*ip == '\0') /* If at end break */
1695 1695 break;
1696 1696 else if (rm[0].rm_sp == rm[0].rm_ep) {
1697 1697 /* If empty match copy next char */
1698 1698 *op++ = *ip++;
1699 1699 if (op >= end)
1700 1700 OVERFLOW(1)
1701 1701 }
1702 1702 flags = REG_NOTBOL;
1703 1703 }
1704 1704
1705 1705 if (regerr != REG_OK && regerr != REG_NOMATCH)
1706 1706 return (regerr);
1707 1707
1708 1708 /* Copy rest of text */
1709 1709 if (op + (i = wcslen(ip)) >= end)
1710 1710 OVERFLOW(i)
1711 1711 while (i--)
1712 1712 *op++ = *ip++;
1713 1713 *op++ = '\0';
1714 1714
1715 1715 if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
1716 1716 sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1717 1717 nospace:
1718 1718 free(odst);
1719 1719 return (REG_ESPACE);
1720 1720 }
1721 1721
1722 1722 *globp = oglob;
1723 1723
1724 1724 return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1725 1725 }
↓ open down ↓ |
68 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX