1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
  28  */
  29 
  30 /*
  31  * awk -- mainline, yylex, etc.
  32  *
  33  * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
  34  */
  35 
  36 #include "awk.h"
  37 #include "y.tab.h"
  38 #include <stdarg.h>
  39 #include <unistd.h>
  40 #include <locale.h>
  41 #include <search.h>
  42 
/*
 * Lexer input state.  The programme text comes either from the files
 * named with -f (progfiles) or from an in-memory wide string (progptr).
 */
static char     *progfiles[NPFILE];     /* Programme files named with -f */
static char     **progfilep = &progfiles[0]; /* Next free progfiles slot */
static wchar_t  *progptr;               /* In-memory programme */
static int      proglen;                /* Length of progptr */
static wchar_t  context[NCONTEXT];      /* Circular buffer of context */
static wchar_t  *conptr = &context[0];      /* context ptr */
static FILE     *progfp;                /* Stdio stream for programme */
static char     *filename;              /* Name of the programme source */
#ifdef  DEBUG
static int      dflag;                  /* -d given: yylex prints tokens */
#endif

/* NOTE(review): neither macro is referenced in the visible code. */
#define AWK_EXEC_MAGIC  "<MKS AWKC>"
#define LEN_EXEC_MAGIC  10

/*
 * Message handed to awkerr() together with the offending bracket pair.
 * NOTE(review): the text contains no conversion specification although
 * callers pass a string argument -- confirm how awkerr() consumes it.
 */
static char     unbal[] = "unbalanced E char";

/* Forward declarations. */
static void     awkarginit(int c, char **av);
static int      lexid(wint_t c);
static int      lexnumber(wint_t c);
static int      lexstring(wint_t endc);
static int      lexregexp(wint_t endc);

static void     awkvarinit(void);
static wint_t   lexgetc(void);
static void     lexungetc(wint_t c);
static size_t   lexescape(wint_t endc, int regx, int cmd_line_operand);
static void     awkierr(int perr, char *fmt, va_list ap);
static int      usage(void);
void            strescape(wchar_t *str);
static const char       *toprint(wint_t);
char *_cmdname;                         /* argv[0], set by main() */
static wchar_t *mbconvert(char *str);

extern int      isclvar(wchar_t *arg);
  78 
  79 /*
  80  * mainline for awk
  81  */
  82 int
  83 main(int argc, char *argv[])
  84 {
  85         wchar_t *ap;
  86         char *cmd;
  87 
  88         cmd = argv[0];
  89         _cmdname = cmd;
  90 
  91         linebuf = emalloc(NLINE * sizeof (wchar_t));
  92 
  93         /*
  94          * At this point only messaging should be internationalized.
  95          * numbers are still scanned as in the Posix locale.
  96          */
  97         (void) setlocale(LC_ALL, "");
  98         (void) setlocale(LC_NUMERIC, "C");
  99 #if !defined(TEXT_DOMAIN)
 100 #define TEXT_DOMAIN     "SYS_TEST"
 101 #endif
 102         (void) textdomain(TEXT_DOMAIN);
 103 
 104         awkvarinit();
 105         /* running = 1; */
 106         while (argc > 1 && *argv[1] == '-') {
 107                 void *save_ptr = NULL;
 108                 ap = mbstowcsdup(&argv[1][1]);
 109                 if (ap == NULL)
 110                         break;
 111                 if (*ap == '\0') {
 112                         free(ap);
 113                         break;
 114                 }
 115                 save_ptr = (void *) ap;
 116                 ++argv;
 117                 --argc;
 118                 if (*ap == '-' && ap[1] == '\0')
 119                         break;
 120                 for (; *ap != '\0'; ++ap) {
 121                         switch (*ap) {
 122 #ifdef DEBUG
 123                         case 'd':
 124                                 dflag = 1;
 125                                 continue;
 126 
 127 #endif
 128                         case 'f':
 129                                 if (argc < 2) {
 130                                         (void) fprintf(stderr,
 131                                 gettext("Missing script file\n"));
 132                                         return (1);
 133                                 }
 134                                 *progfilep++ = argv[1];
 135                                 --argc;
 136                                 ++argv;
 137                                 continue;
 138 
 139                         case 'F':
 140                                 if (ap[1] == '\0') {
 141                                         if (argc < 2) {
 142                                                 (void) fprintf(stderr,
 143                                 gettext("Missing field separator\n"));
 144                                                 return (1);
 145                                         }
 146                                         ap = mbstowcsdup(argv[1]);
 147                                         --argc;
 148                                         ++argv;
 149                                 } else
 150                                         ++ap;
 151                                 strescape(ap);
 152                                 strassign(varFS, linebuf, FALLOC,
 153                                     wcslen(linebuf));
 154                                 break;
 155 
 156                         case 'v': {
 157                                 wchar_t *vp;
 158                                 wchar_t *arg;
 159 
 160                                 if (argc < 2) {
 161                                         (void) fprintf(stderr,
 162                 gettext("Missing variable assignment\n"));
 163                                         return (1);
 164                                 }
 165                                 arg = mbconvert(argv[1]);
 166                                 /*
 167                                  * Ensure the variable expression
 168                                  * is valid (correct form).
 169                                  */
 170                                 if (((vp = wcschr(arg, '=')) != NULL) &&
 171                                     isclvar(arg)) {
 172                                         *vp = '\0';
 173                                         strescape(vp+1);
 174                                         strassign(vlook(arg), linebuf,
 175                                             FALLOC|FSENSE,
 176                                             wcslen(linebuf));
 177                                         *vp = '=';
 178                                 } else {
 179                                         (void) fprintf(stderr, gettext(
 180                                             "Invalid form for variable "
 181                                             "assignment: %S\n"), arg);
 182                                         return (1);
 183                                 }
 184                                 --argc;
 185                                 ++argv;
 186                                 continue;
 187                         }
 188 
 189                         default:
 190                                 (void) fprintf(stderr,
 191                                 gettext("Unknown option \"-%S\"\n"), ap);
 192                                 return (usage());
 193                         }
 194                         break;
 195                 }
 196                 if (save_ptr)
 197                         free(save_ptr);
 198         }
 199         if (progfilep == &progfiles[0]) {
 200                 if (argc < 2)
 201                         return (usage());
 202                 filename = "[command line]";    /* BUG: NEEDS TRANSLATION */
 203                 progptr = mbstowcsdup(argv[1]);
 204                 proglen = wcslen(progptr);
 205                 --argc;
 206                 ++argv;
 207         }
 208 
 209         argv[0] = cmd;
 210 
 211         awkarginit(argc, argv);
 212 
 213         /* running = 0; */
 214         (void) yyparse();
 215 
 216         lineno = 0;
 217         /*
 218          * Ok, done parsing, so now activate the rest of the nls stuff, set
 219          * the radix character.
 220          */
 221         (void) setlocale(LC_ALL, "");
 222         radixpoint = *localeconv()->decimal_point;
 223         awk();
 224         /* NOTREACHED */
 225         return (0);
 226 }
 227 
 228 /*
 229  * Do initial setup of buffers, etc.
 230  * This must be called before most processing
 231  * and especially before lexical analysis.
 232  * Variables initialised here will be overruled by command
 233  * line parameter initialisation.
 234  */
 235 static void
 236 awkvarinit()
 237 {
 238         NODE *np;
 239 
 240         (void) setvbuf(stderr, NULL, _IONBF, 0);
 241 
 242         if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
 243                 (void) fprintf(stderr,
 244         gettext("not enough available file descriptors"));
 245                 exit(1);
 246         }
 247         ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
 248 #ifdef A_ZERO_POINTERS
 249         (void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
 250 #else
 251         {
 252                 /* initialize file descriptor table */
 253                 OFILE *fp;
 254                 for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
 255                         fp->f_fp = FNULL;
 256                                         fp->f_mode = 0;
 257                                         fp->f_name = (char *)0;
 258                 }
 259         }
 260 #endif
 261         constant = intnode((INT)0);
 262 
 263         const0 = intnode((INT)0);
 264         const1 = intnode((INT)1);
 265         constundef = emptynode(CONSTANT, 0);
 266         constundef->n_flags = FSTRING|FVINT;
 267         constundef->n_string = _null;
 268         constundef->n_strlen = 0;
 269         inc_oper = emptynode(ADD, 0);
 270         inc_oper->n_right = const1;
 271         asn_oper = emptynode(ADD, 0);
 272         field0 = node(FIELD, const0, NNULL);
 273 
 274         {
 275                 RESFUNC near*rp;
 276 
 277                 for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
 278                         np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
 279                 }
 280         }
 281         {
 282                 RESERVED near*rp;
 283 
 284                 for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
 285                         switch (rp->r_type) {
 286                         case SVAR:
 287                         case VAR:
 288                                 running = 1;
 289                                 np = vlook(rp->r_name);
 290                                 if (rp->r_type == SVAR)
 291                                         np->n_flags |= FSPECIAL;
 292                                 if (rp->r_svalue != NULL)
 293                                         strassign(np, rp->r_svalue, FSTATIC,
 294                                             (size_t)rp->r_ivalue);
 295                                 else {
 296                                         constant->n_int = rp->r_ivalue;
 297                                         (void) assign(np, constant);
 298                                 }
 299                                 running = 0;
 300                                 break;
 301 
 302                         case KEYWORD:
 303                                 kinstall(rp->r_name, (int)rp->r_ivalue);
 304                                 break;
 305                         }
 306                 }
 307         }
 308 
 309         varNR = vlook(s_NR);
 310         varFNR = vlook(s_FNR);
 311         varNF = vlook(s_NF);
 312         varOFMT = vlook(s_OFMT);
 313         varCONVFMT = vlook(s_CONVFMT);
 314         varOFS = vlook(s_OFS);
 315         varORS = vlook(s_ORS);
 316         varRS = vlook(s_RS);
 317         varFS = vlook(s_FS);
 318         varARGC = vlook(s_ARGC);
 319         varSUBSEP = vlook(s_SUBSEP);
 320         varENVIRON = vlook(s_ENVIRON);
 321         varFILENAME = vlook(s_FILENAME);
 322         varSYMTAB = vlook(s_SYMTAB);
 323         incNR = node(ASG, varNR, node(ADD, varNR, const1));
 324         incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
 325         clrFNR = node(ASG, varFNR, const0);
 326 }
 327 
 328 /*
 329  * Initialise awk ARGC, ARGV variables.
 330  */
 331 static void
 332 awkarginit(int ac, char **av)
 333 {
 334         int i;
 335         wchar_t *cp;
 336 
 337         ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
 338         running = 1;
 339         constant->n_int = ac;
 340         (void) assign(varARGC, constant);
 341         for (i = 0; i < ac; ++i) {
 342                 cp = mbstowcsdup(av[i]);
 343                 constant->n_int = i;
 344                 strassign(exprreduce(ARGVsubi), cp,
 345                     FSTATIC|FSENSE, wcslen(cp));
 346         }
 347         running = 0;
 348 }
 349 
 350 /*
 351  * Clean up when done parsing a function.
 352  * All formal parameters, because of a deal (funparm) in
 353  * yylex, get put into the symbol table in front of any
 354  * global variable of the same name.  When the entire
 355  * function is parsed, remove these formal dummy nodes
 356  * from the symbol table but retain the nodes because
 357  * the generated tree points at them.
 358  */
 359 void
 360 uexit(NODE *np)
 361 {
 362         NODE *formal;
 363 
 364         while ((formal = getlist(&np)) != NNULL)
 365                 delsymtab(formal, 0);
 366 }
 367 
/*
 * The lexical analyzer.
 *
 * Returns the next token of the programme to the parser.  Characters are
 * obtained with lexgetc(); identifiers, numbers and strings are handed to
 * lexid(), lexnumber() and lexstring(), which leave their value in yylval.
 * Apart from recognising multi-character operators the routine
 *   - returns a token parked in `savetoken' by the previous call,
 *   - reads a regular expression with lexregexp() when `redelim' is set,
 *   - turns a newline into ';' or drops it, depending on the last token,
 *   - supplies a ';' in front of '}' when the programme did not,
 *   - inserts CONCAT between two adjacent terms (tracked by `catterm'),
 *   - maps single-character tokens to symbolic names through ctosym[].
 */
int
yylex()
{
        wint_t c, c1;
        int i;
        static int savetoken = 0;       /* token to return on the next call */
        static int wasfield;            /* > 0 shortly after "$constant" */
        static int isfuncdef;           /* previous token was DEFFUNC */
        static int nbrace, nparen, nbracket;    /* open-bracket counts */
        /* single characters and the symbolic token each is returned as */
        static struct ctosymstruct {
                wint_t c, sym;
        } ctosym[] = {
                { '|', BAR },           { '^', CARAT },
                { '~', TILDE },         { '<', LANGLE },
                { '>', RANGLE },     { '+', PLUSC },
                { '-', HYPHEN },        { '*', STAR },
                { '/', SLASH },         { '%', PERCENT },
                { '!', EXCLAMATION },   { '$', DOLLAR },
                { '[', LSQUARE },       { ']', RSQUARE },
                { '(', LPAREN },        { ')', RPAREN },
                { ';', SEMI },          { '{', LBRACE },
                { '}', RBRACE },        {   0, 0 }
        };

        if (savetoken) {
                /* A token was deferred by the previous call. */
                c = savetoken;
                savetoken = 0;
        } else if (redelim != '\0') {
                /*
                 * A regular expression delimited by redelim follows.
                 * Read it now and return the delimiter itself as the
                 * token of the next call.
                 */
                c = redelim;
                redelim = 0;
                catterm = 0;
                savetoken = c;
                c = lexlast = lexregexp(c);
                goto out;
        } else while ((c = lexgetc()) != WEOF) {
                /*
                 * `continue' skips the character and reads another one;
                 * falling out of the if/switch reaches the `break' at the
                 * bottom of the loop with the token in c.
                 */
                if (iswalpha(c) || c == '_') {
                        c = lexid(c);
                } else if (iswdigit(c) || c == '.') {
                        c = lexnumber(c);
                } else if (isWblank(c)) {
                        continue;
                } else switch (c) {
#if DOS || OS2
                case 032:               /* ^Z */
                        continue;
#endif

                case '"':
                        c = lexstring(c);
                        break;

                case '#':
                        /* Comment: discard up to, not including, newline. */
                        while ((c = lexgetc()) != '\n' && c != WEOF)
                                ;
                        lexungetc(c);
                        continue;

                case '+':
                        if ((c1 = lexgetc()) == '+')
                                c = INC;
                        else if (c1 == '=')
                                c = AADD;
                        else
                                lexungetc(c1);
                        break;

                case '-':
                        if ((c1 = lexgetc()) == '-')
                                c = DEC;
                        else if (c1 == '=')
                                c = ASUB;
                        else
                                lexungetc(c1);
                        break;

                case '*':
                        /* "*=", "**=", "**" or plain "*" */
                        if ((c1 = lexgetc()) == '=')
                                c = AMUL;
                        else if (c1 == '*') {
                                if ((c1 = lexgetc()) == '=')
                                        c = AEXP;
                                else {
                                        c = EXP;
                                        lexungetc(c1);
                                }
                        } else
                                lexungetc(c1);
                        break;

                case '^':
                        if ((c1 = lexgetc()) == '=') {
                                c = AEXP;
                        } else {
                                c = EXP;
                                lexungetc(c1);
                        }
                        break;

                case '/':
                        /*
                         * "/=" is the divide-assign operator unless the
                         * previous token is one of those listed, in which
                         * case the '/' is returned alone and the '=' is
                         * pushed back.
                         */
                        if ((c1 = lexgetc()) == '=' &&
                            lexlast != RE && lexlast != NRE &&
                            lexlast != ';' && lexlast != '\n' &&
                            lexlast != ',' && lexlast != '(')
                                c = ADIV;
                        else
                                lexungetc(c1);
                        break;

                case '%':
                        if ((c1 = lexgetc()) == '=')
                                c = AREM;
                        else
                                lexungetc(c1);
                        break;

                case '&':
                        if ((c1 = lexgetc()) == '&')
                                c = AND;
                        else
                                lexungetc(c1);
                        break;

                case '|':
                        if ((c1 = lexgetc()) == '|')
                                c = OR;
                        else {
                                lexungetc(c1);
                                /* inside a print statement '|' is PIPE */
                                if (inprint)
                                        c = PIPE;
                        }
                        break;

                case '>':
                        if ((c1 = lexgetc()) == '=')
                                c = GE;
                        else if (c1 == '>')
                                c = APPEND;
                        else {
                                lexungetc(c1);
                                /*
                                 * Unparenthesised '>' inside a print
                                 * statement is WRITE, not a comparison.
                                 */
                                if (nparen == 0 && inprint)
                                        c = WRITE;
                        }
                        break;

                case '<':
                        if ((c1 = lexgetc()) == '=')
                                c = LE;
                        else
                                lexungetc(c1);
                        break;

                case '!':
                        if ((c1 = lexgetc()) == '=')
                                c = NE;
                        else if (c1 == '~')
                                c = NRE;
                        else
                                lexungetc(c1);
                        break;

                case '=':
                        if ((c1 = lexgetc()) == '=')
                                c = EQ;
                        else {
                                lexungetc(c1);
                                c = ASG;
                        }
                        break;

                case '\n':
                        /*
                         * A newline either ends the statement (returned
                         * as ';') or is ignored, depending on the token
                         * that preceded it.
                         */
                        switch (lexlast) {
                        case ')':
                                if (catterm || inprint) {
                                        c = ';';
                                        break;
                                }
                        /*FALLTHRU*/
                        case AND:
                        case OR:
                        case COMMA:
                        case '{':
                        case ELSE:
                        case ';':
                        case DO:
                                continue;

                        case '}':
                                if (nbrace != 0)
                                        continue;
                        /* falls into default when nbrace is 0 */

                        default:
                                c = ';';
                                break;
                        }
                        break;

                /*
                 * NOTE(review): c is a raw character here; tokens produced
                 * by lexid() bypass this switch, so it is unclear from
                 * this function when the case below can match.
                 */
                case ELSE:
                        if (lexlast != ';') {
                                savetoken = ELSE;
                                c = ';';
                        }
                        break;

                case '(':
                        ++nparen;
                        break;

                case ')':
                        if (--nparen < 0)
                                awkerr(unbal, "()");
                        break;

                case '{':
                        nbrace++;
                        break;

                case '}':
                        if (--nbrace < 0) {
                                char brk[3];

                                brk[0] = '{';
                                brk[1] = '}';
                                brk[2] = '\0';
                                awkerr(unbal, brk);
                        }
                        /* Return ';' first and the '}' on the next call. */
                        if (lexlast != ';') {
                                savetoken = c;
                                c = ';';
                        }
                        break;

                case '[':
                        ++nbracket;
                        break;

                case ']':
                        if (--nbracket < 0) {
                                char brk[3];

                                brk[0] = '[';
                                brk[1] = ']';
                                brk[2] = '\0';
                                awkerr(unbal, brk);
                        }
                        break;

                case '\\':
                        /* backslash-newline is a line continuation */
                        if ((c1 = lexgetc()) == '\n')
                                continue;
                        lexungetc(c1);
                        break;

                case ',':
                        c = COMMA;
                        break;

                case '?':
                        c = QUEST;
                        break;

                case ':':
                        c = COLON;
                        break;

                default:
                        if (!iswprint(c))
                                awkerr(
                                    gettext("invalid character \"%s\""),
                                    toprint(c));
                        break;
                }
                break;
        }

        /*
         * Concatenation has no operator character.  catterm is non-zero
         * when the token just returned can end a term; if the current
         * token can begin one, CONCAT is returned now and the current
         * token is parked in savetoken for the next call.
         */
        switch (c) {
        case ']':
                ++catterm;
                break;

        case VAR:
                if (catterm) {
                        savetoken = c;
                        c = CONCAT;
                        catterm = 0;
                } else if (!isfuncdef) {
                        /* Peek: a VAR followed by '(' does not end a term. */
                        if ((c1 = lexgetc()) != '(')
                                ++catterm;
                        lexungetc(c1);
                }
                isfuncdef = 0;
                break;

        case PARM:
        case CONSTANT:
                if (catterm) {
                        savetoken = c;
                        c = CONCAT;
                        catterm = 0;
                } else {
                        /* decremented below, so it covers the next token */
                        if (lexlast == '$')
                                wasfield = 2;
                        ++catterm;
                }
                break;

        case INC:
        case DEC:
                /*
                 * ++/-- is treated as the start of a new term only when it
                 * follows a CONSTANT that was not the operand of '$'.
                 */
                if (!catterm || lexlast != CONSTANT || wasfield)
                        break;

        /*FALLTHRU*/
        case UFUNC:
        case FUNC:
        case GETLINE:
        case '!':
        case '$':
        case '(':
                if (catterm) {
                        savetoken = c;
                        c = CONCAT;
                        catterm = 0;
                }
                break;

        case '}':
                /* closing the outermost brace: follow it with a ';' */
                if (nbrace == 0)
                        savetoken = ';';
        /*FALLTHRU*/
        case ';':
                inprint = 0;
        /*FALLTHRU*/
        default:
                if (c == DEFFUNC)
                        isfuncdef = 1;
                catterm = 0;
        }
        lexlast = c;
        if (wasfield)
                wasfield--;
        /*
         * Map character constants to symbolic names.
         */
        for (i = 0; ctosym[i].c != 0; i++)
                if (c == ctosym[i].c) {
                        c = ctosym[i].sym;
                        break;
                }
out:
#ifdef DEBUG
        if (dflag)
                (void) printf("%d\n", (int)c);
#endif
        return ((int)c);
}
 725 
 726 /*
 727  * Read a number for the lexical analyzer.
 728  * Input is the first character of the number.
 729  * Return value is the lexical type.
 730  */
 731 static int
 732 lexnumber(wint_t c)
 733 {
 734         wchar_t *cp;
 735         int dotfound = 0;
 736         int efound = 0;
 737         INT number;
 738 
 739         cp = linebuf;
 740         do {
 741                 if (iswdigit(c))
 742                         ;
 743                 else if (c == '.') {
 744                         if (dotfound++)
 745                                 break;
 746                 } else if (c == 'e' || c == 'E') {
 747                         if ((c = lexgetc()) != '-' && c != '+') {
 748                                 lexungetc(c);
 749                                 c = 'e';
 750                         } else
 751                                 *cp++ = 'e';
 752                         if (efound++)
 753                                 break;
 754                 } else
 755                         break;
 756                 *cp++ = c;
 757         } while ((c = lexgetc()) != WEOF);
 758         *cp = '\0';
 759         if (dotfound && cp == linebuf+1)
 760                 return (DOT);
 761         lexungetc(c);
 762         errno = 0;
 763         if (!dotfound && !efound &&
 764             ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
 765                 yylval.node = intnode(number);
 766         else
 767                 yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
 768         return (CONSTANT);
 769 }
 770 
 771 /*
 772  * Read an identifier.
 773  * Input is first character of identifier.
 774  * Return VAR.
 775  */
 776 static int
 777 lexid(wint_t c)
 778 {
 779         wchar_t *cp;
 780         size_t i;
 781         NODE *np;
 782 
 783         cp = linebuf;
 784         do {
 785                 *cp++ = c;
 786                 c = lexgetc();
 787         } while (iswalpha(c) || iswdigit(c) || c == '_');
 788         *cp = '\0';
 789         lexungetc(c);
 790         yylval.node = np = vlook(linebuf);
 791 
 792         switch (np->n_type) {
 793         case KEYWORD:
 794                 switch (np->n_keywtype) {
 795                 case PRINT:
 796                 case PRINTF:
 797                         ++inprint;
 798                 default:
 799                         return ((int)np->n_keywtype);
 800                 }
 801                 /* NOTREACHED */
 802 
 803         case ARRAY:
 804         case VAR:
 805                 /*
 806                  * If reading the argument list, create a dummy node
 807                  * for the duration of that function. These variables
 808                  * can be removed from the symbol table at function end
 809                  * but they must still exist because the execution tree
 810                  * knows about them.
 811                  */
 812                 if (funparm) {
 813 do_funparm:
 814                         np = emptynode(PARM, i = (cp-linebuf));
 815                         np->n_flags = FSTRING;
 816                         np->n_string = _null;
 817                         np->n_strlen = 0;
 818                         (void) memcpy(np->n_name, linebuf,
 819                             (i+1) * sizeof (wchar_t));
 820                         addsymtab(np);
 821                         yylval.node = np;
 822                 } else if (np == varNF || (np == varFS &&
 823                     (!doing_begin || begin_getline))) {
 824                         /*
 825                          * If the user program references NF or sets
 826                          * FS either outside of a begin block or
 827                          * in a begin block after a getline then the
 828                          * input line will be split immediately upon read
 829                          * rather than when a field is first referenced.
 830                          */
 831                         needsplit = 1;
 832                 } else if (np == varENVIRON)
 833                         needenviron = 1;
 834         /*FALLTHRU*/
 835         case PARM:
 836                 return (VAR);
 837 
 838         case UFUNC:
 839                 /*
 840                  * It is ok to redefine functions as parameters
 841                  */
 842                 if (funparm) goto do_funparm;
 843         /*FALLTHRU*/
 844         case FUNC:
 845         case GETLINE:
 846                 /*
 847                  * When a getline is encountered, clear the 'doing_begin' flag.
 848                  * This will force the 'needsplit' flag to be set, even inside
 849                  * a begin block, if FS is altered. (See VAR case above)
 850                  */
 851                 if (doing_begin)
 852                         begin_getline = 1;
 853                 return (np->n_type);
 854         }
 855         /* NOTREACHED */
 856         return (0);
 857 }
 858 
 859 /*
 860  * Read a string for the lexical analyzer.
 861  * `endc' terminates the string.
 862  */
 863 static int
 864 lexstring(wint_t endc)
 865 {
 866         size_t length = lexescape(endc, 0, 0);
 867 
 868         yylval.node = stringnode(linebuf, FALLOC, length);
 869         return (CONSTANT);
 870 }
 871 
 872 /*
 873  * Read a regular expression.
 874  */
 875 static int
 876 lexregexp(wint_t endc)
 877 {
 878         (void) lexescape(endc, 1, 0);
 879         yylval.node = renode(linebuf);
 880         return (URE);
 881 }
 882 
 883 /*
 884  * Process a string, converting the escape characters as required by
 885  * 1003.2. The processed string ends up in the global linebuf[]. This
 886  * routine also changes the value of 'progfd' - the program file
 887  * descriptor, so it should be used with some care. It is presently used to
 888  * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
 889  */
 890 void
 891 strescape(wchar_t *str)
 892 {
 893         progptr = str;
 894         proglen = wcslen(str) + 1;      /* Include \0 */
 895         (void) lexescape('\0', 0, 1);
 896         progptr = NULL;
 897 }
 898 
 899 /*
 900  * Read a string or regular expression, terminated by ``endc'',
 901  * for lexical analyzer, processing escape sequences.
 902  * Return string length.
 903  */
 904 static size_t
 905 lexescape(wint_t endc, int regx, int cmd_line_operand)
 906 {
 907         static char nlre[256];
 908         static char nlstr[256];
 909         static char eofre[256];
 910         static char eofstr[256];
 911         int first_time = 1;
 912         wint_t c;
 913         wchar_t *cp;
 914         int n, max;
 915 
 916         if (first_time == 1) {
 917                 (void) strcpy(nlre, gettext("Newline in regular expression\n"));
 918                 (void) strcpy(nlstr, gettext("Newline in string\n"));
 919                 (void) strcpy(eofre, gettext("EOF in regular expression\n"));
 920                 (void) strcpy(eofstr, gettext("EOF in string\n"));
 921                 first_time = 0;
 922         }
 923 
 924         cp = linebuf;
 925         while ((c = lexgetc()) != endc) {
 926                 if (c == '\n')
 927                         awkerr(regx ? nlre : nlstr);
 928                 if (c == '\\') {
 929                         switch (c = lexgetc(), c) {
 930                         case '\\':
 931                                 if (regx)
 932                                         *cp++ = '\\';
 933                                 break;
 934 
 935                         case '/':
 936                                 c = '/';
 937                                 break;
 938 
 939                         case 'n':
 940                                 c = '\n';
 941                                 break;
 942 
 943                         case 'b':
 944                                 c = '\b';
 945                                 break;
 946 
 947                         case 't':
 948                                 c = '\t';
 949                                 break;
 950 
 951                         case 'r':
 952                                 c = '\r';
 953                                 break;
 954 
 955                         case 'f':
 956                                 c = '\f';
 957                                 break;
 958 
 959                         case 'v':
 960                                 c = '\v';
 961                                 break;
 962 
 963                         case 'a':
 964                                 c = (char)0x07;
 965                                 break;
 966 
 967                         case 'x':
 968                                 n = 0;
 969                                 while (iswxdigit(c = lexgetc())) {
 970                                         if (iswdigit(c))
 971                                                 c -= '0';
 972                                         else if (iswupper(c))
 973                                                 c -= 'A'-10;
 974                                         else
 975                                                 c -= 'a'-10;
 976                                         n = (n<<4) + c;
 977                                 }
 978                                 lexungetc(c);
 979                                 c = n;
 980                                 break;
 981 
 982                         case '0':
 983                         case '1':
 984                         case '2':
 985                         case '3':
 986                         case '4':
 987                         case '5':
 988                         case '6':
 989                         case '7':
 990 #if 0
 991 /*
 992  * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
 993  * requires processing of the octal escapes both in strings and
 994  * regular expressions. The following code is disabled instead of
 995  * removed as back-referencing may be reintroduced in a future draft
 996  * of the standard.
 997  */
 998                                 /*
 999                                  * For regular expressions, we disallow
1000                                  * \ooo to mean octal character, in favour
1001                                  * of back referencing.
1002                                  */
1003                                 if (regx) {
1004                                         *cp++ = '\\';
1005                                         break;
1006                                 }
1007 #endif
1008                                 max = 3;
1009                                 n = 0;
1010                                 do {
1011                                         n = (n<<3) + c-'0';
1012                                         if ((c = lexgetc()) > '7' || c < '0')
1013                                                 break;
1014                                 } while (--max);
1015                                 lexungetc(c);
1016                                 /*
1017                                  * an octal escape sequence must have at least
1018                                  * 2 digits after the backslash, otherwise
1019                                  * it gets passed straight thru for possible
1020                                  * use in backreferencing.
1021                                  */
1022                                 if (max == 3) {
1023                                         *cp++ = '\\';
1024                                         n += '0';
1025                                 }
1026                                 c = n;
1027                                 break;
1028 
1029                         case '\n':
1030                                 continue;
1031 
1032                         default:
1033                                 if (c != endc || cmd_line_operand) {
1034                                         *cp++ = '\\';
1035                                         if (c == endc)
1036                                                 lexungetc(c);
1037                                 }
1038                         }
1039                 }
1040                 if (c == WEOF)
1041                         awkerr(regx ? eofre : eofstr);
1042                 *cp++ = c;
1043         }
1044         *cp = '\0';
1045         return (cp - linebuf);
1046 }
1047 
1048 /*
1049  * Build a regular expression NODE.
1050  * Argument is the string holding the expression.
1051  */
1052 NODE *
1053 renode(wchar_t *s)
1054 {
1055         NODE *np;
1056         int n;
1057 
1058         np = emptynode(RE, 0);
1059         np->n_left = np->n_right = NNULL;
1060         if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
1061                 int m;
1062                 char *p;
1063 
1064                 m = REGWERROR(n, np->n_regexp, NULL, 0);
1065                 p = (char *)emalloc(m);
1066                 REGWERROR(n, np->n_regexp, p, m);
1067                 awkerr("/%S/: %s", s, p);
1068         }
1069         return (np);
1070 }
1071 /*
1072  * Get a character for the lexical analyser routine.
1073  */
1074 static wint_t
1075 lexgetc()
1076 {
1077         wint_t c;
1078         static char **files = &progfiles[0];
1079 
1080         if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
1081                 ;
1082         else {
1083                 if (progptr != NULL) {
1084                         if (proglen-- <= 0)
1085                                 c = WEOF;
1086                         else
1087                                 c = *progptr++;
1088                 } else {
1089                         if (progfp != FNULL) {
1090                                 if (progfp != stdin)
1091                                         (void) fclose(progfp);
1092                                 else
1093                                         clearerr(progfp);
1094                                 progfp = FNULL;
1095                         }
1096                         if (files < progfilep) {
1097                                 filename = *files++;
1098                                 lineno = 1;
1099                                 if (filename[0] == '-' && filename[1] == '\0')
1100                                         progfp = stdin;
1101                                 else if ((progfp = fopen(filename, r))
1102                                     == FNULL) {
1103                                         (void) fprintf(stderr,
1104                                 gettext("script file \"%s\""), filename);
1105                                         exit(1);
1106                                 }
1107                                 c = fgetwc(progfp);
1108                         }
1109                 }
1110         }
1111         if (c == '\n')
1112                 ++lineno;
1113         if (conptr >= &context[NCONTEXT])
1114                 conptr = &context[0];
1115         if (c != WEOF)
1116                 *conptr++ = c;
1117         return (c);
1118 }
1119 
1120 /*
1121  * Return a character for lexical analyser.
1122  * Only one returned character is (not enforced) legitimite.
1123  */
1124 static void
1125 lexungetc(wint_t c)
1126 {
1127         if (c == '\n')
1128                 --lineno;
1129         if (c != WEOF) {
1130                 if (conptr == &context[0])
1131                         conptr = &context[NCONTEXT];
1132                 *--conptr = '\0';
1133         }
1134         if (progfp != FNULL) {
1135                 (void) ungetwc(c, progfp);
1136                 return;
1137         }
1138         if (c == WEOF)
1139                 return;
1140         *--progptr = c;
1141         proglen++;
1142 }
1143 
1144 /*
1145  * Syntax errors during parsing.
1146  */
1147 void
1148 yyerror(char *s, ...)
1149 {
1150         if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
1151                 if (lexlast == KEYWORD)
1152                         awkerr(gettext("inadmissible use of reserved keyword"));
1153                 else
1154                         awkerr(gettext("attempt to redefine builtin function"));
1155         awkerr(s);
1156 }
1157 
/*
 * Error routine for all awk errors.
 *
 * `fmt' and the arguments after it are a printf-style message.  The work
 * is done by awkierr() with the errno indication turned off.
 */
/* ARGSUSED */
void
awkerr(char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        awkierr(0, fmt, ap);
        va_end(ap);
}
1171 
/*
 * Error routine like "awkerr", except that the message is followed by
 * an errno-specific indication.
 *
 * `fmt' and the arguments after it are a printf-style message.  The work
 * is done by awkierr() with the errno indication turned on.
 */
/* ARGSUSED */
void
awkperr(char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        awkierr(1, fmt, ap);
        va_end(ap);
}
1186 
1187 /*
1188  * Common internal routine for awkerr, awkperr
1189  */
1190 static void
1191 awkierr(int perr, char *fmt, va_list ap)
1192 {
1193         static char sep1[] = "\n>>>\t";
1194         static char sep2[] = "\t<<<";
1195         int saveerr = errno;
1196 
1197         (void) fprintf(stderr, "%s: ", _cmdname);
1198         if (running) {
1199                 (void) fprintf(stderr, gettext("line %u ("),
1200                     curnode == NNULL ? 0 : curnode->n_lineno);
1201                 if (phase == 0)
1202                         (void) fprintf(stderr, "NR=%lld): ",
1203                             (INT)exprint(varNR));
1204                 else
1205                         (void) fprintf(stderr, "%s): ",
1206                             phase == BEGIN ? s_BEGIN : s_END);
1207         } else if (lineno != 0) {
1208                 (void) fprintf(stderr, gettext("file \"%s\": "), filename);
1209                 (void) fprintf(stderr, gettext("line %u: "), lineno);
1210         }
1211         (void) vfprintf(stderr, gettext(fmt), ap);
1212         if (perr == 1)
1213                 (void) fprintf(stderr, ": %s", strerror(saveerr));
1214         if (perr != 2 && !running) {
1215                 wchar_t *cp;
1216                 int n;
1217                 int c;
1218 
1219                 (void) fprintf(stderr, gettext("  Context is:%s"), sep1);
1220                 cp = conptr;
1221                 n = NCONTEXT;
1222                 do {
1223                         if (cp >= &context[NCONTEXT])
1224                                 cp = &context[0];
1225                         if ((c = *cp++) != '\0')
1226                                 (void) fputs(c == '\n' ? sep1 : toprint(c),
1227                                     stderr);
1228                 } while (--n != 0);
1229                 (void) fputs(sep2, stderr);
1230         }
1231         (void) fprintf(stderr, "\n");
1232         exit(1);
1233 }
1234 
1235 wchar_t *
1236 emalloc(unsigned n)
1237 {
1238         wchar_t *cp;
1239 
1240         if ((cp = malloc(n)) == NULL)
1241                 awkerr(nomem);
1242         return (cp);
1243 }
1244 
1245 wchar_t *
1246 erealloc(wchar_t *p, unsigned n)
1247 {
1248         wchar_t *cp;
1249 
1250         if ((cp = realloc(p, n)) == NULL)
1251                 awkerr(nomem);
1252         return (cp);
1253 }
1254 
1255 
/*
 * Usage message for awk.
 *
 * Prints the (translated) synopsis on stderr and returns 2.
 */
static int
usage()
{
        const char *synopsis;

        synopsis = gettext(
"Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"       awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n");
        (void) fprintf(stderr, synopsis);

        return (2);
}
1267 
1268 
/*
 * Convert a multibyte string to a wide string held in a buffer owned by
 * this routine.  The result of the previous call is freed on every call,
 * so the returned pointer is only good until the next call and must not
 * be freed by the caller.  Returns whatever mbstowcsdup() returns.
 */
static wchar_t *
mbconvert(char *str)
{
        static wchar_t *last = NULL;

        free(last);             /* no-op on the first call */
        last = mbstowcsdup(str);
        return (last);
}
1278 
/*
 * Convert a wide string to a multibyte string held in a buffer owned by
 * this routine.  The result of the previous call is freed on every call,
 * so the returned pointer is only good until the next call and must not
 * be freed by the caller.  Returns whatever wcstombsdup() returns.
 */
char *
mbunconvert(wchar_t *str)
{
        static char *last = NULL;

        free(last);             /* no-op on the first call */
        last = wcstombsdup(str);
        return (last);
}
1288 
1289 /*
1290  * Solaris port - following functions are typical MKS functions written
1291  * to work for Solaris.
1292  */
1293 
/*
 * Return a newly allocated wide character copy of the multibyte
 * string `s'.
 *
 * The caller owns the result and releases it with free().  NULL is
 * returned when memory cannot be obtained or when `s' is not a valid
 * multibyte string; in the latter case the buffer is released and the
 * errno value left by mbstowcs() is preserved.
 */
wchar_t *
mbstowcsdup(char *s)
{
        size_t n;
        wchar_t *w;

        /* a string never holds more characters than it holds bytes */
        n = strlen(s) + 1;
        if ((w = (wchar_t *)malloc(n * sizeof (wchar_t))) == NULL)
                return (NULL);

        if (mbstowcs(w, s, n) == (size_t)-1) {
                int saverr = errno;

                free(w);
                errno = saverr;
                return (NULL);
        }
        return (w);
}
1309 
/*
 * Return a newly allocated multibyte copy of the wide string `w'.
 *
 * The caller owns the result and releases it with free().  NULL is
 * returned when memory cannot be obtained or when `w' holds a character
 * that cannot be converted; in the latter case the errno value left by
 * wcstombs() is preserved.
 */
char *
wcstombsdup(wchar_t *w)
{
        size_t n;
        char *mb;
        char *shrunk;

        /* Fetch memory for worst case string length */
        n = (wcslen(w) + 1) * MB_CUR_MAX;
        if ((mb = (char *)malloc(n)) == NULL) {
                return (NULL);
        }

        /* Convert the string */
        if (wcstombs(mb, w, n) == (size_t)-1) {
                int saverr = errno;

                free(mb);
                errno = saverr;
                return (NULL);
        }

        /*
         * Shrink the string down.  Should the shrink fail, the original
         * buffer is still valid, so it is returned rather than leaked.
         */
        if ((shrunk = (char *)realloc(mb, strlen(mb) + 1)) != NULL)
                mb = shrunk;
        return (mb);
}
1338 
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  Index 0 holds the entry for value 127;
 * the entry for a character value c in 0..31 is found at index c + 1.
 */
static const char *const upe_ctrls[] =
{
        "^?",                           /* 127 */
        "^@", "^A", "^B", "^C",         /*  0 ..  3 */
        "^D", "^E", "^F", "^G",         /*  4 ..  7 */
        "^H", "^I", "^J", "^K",         /*  8 .. 11 */
        "^L", "^M", "^N", "^O",         /* 12 .. 15 */
        "^P", "^Q", "^R", "^S",         /* 16 .. 19 */
        "^T", "^U", "^V", "^W",         /* 20 .. 23 */
        "^X", "^Y", "^Z", "^[",         /* 24 .. 27 */
        "^\\", "^]", "^^", "^_"         /* 28 .. 31 */
};
1352 
1353 
1354 /*
1355  * Return a printable string corresponding to the given character value.  If
1356  * the character is printable, simply return it as the string.  If it is in
1357  * the range specified by table 5-101 in the UPE, return the corresponding
1358  * string.  Otherwise, return an octal escape sequence.
1359  */
1360 static const char *
1361 toprint(wchar_t c)
1362 {
1363         int n, len;
1364         unsigned char *ptr;
1365         static char mbch[MB_LEN_MAX+1];
1366         static char buf[5 * MB_LEN_MAX + 1];
1367 
1368         if ((n = wctomb(mbch, c)) == -1) {
1369                 /* Should never happen */
1370                 (void) sprintf(buf, "\\%x", c);
1371                 return (buf);
1372         }
1373         mbch[n] = '\0';
1374         if (iswprint(c)) {
1375                 return (mbch);
1376         } else if (c == 127) {
1377                 return (upe_ctrls[0]);
1378         } else if (c < 32) {
1379                 /* Print as in Table 5-101 in the UPE */
1380                 return (upe_ctrls[c+1]);
1381         } else {
1382                 /* Print as an octal escape sequence */
1383                 for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr)
1384                         len += sprintf(buf+len, "\\%03o", *ptr);
1385         }
1386         return (buf);
1387 }
1388 
/*
 * Translate a byte offset into the multibyte form of `astring' into the
 * corresponding offset counted in wide characters.
 *
 * The wide string is walked from its start, adding up the number of bytes
 * wctomb() produces for each character, until at least `off' bytes are
 * covered.  A character wctomb() cannot convert is counted as one byte.
 * The walk never goes beyond the terminating null character, so an offset
 * larger than the converted string yields the length of the string.
 *
 * Returns the wide character offset.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
        const wchar_t *s = astring;
        int c = 0;
        char mb[MB_LEN_MAX];

        /*
         * wctomb() returns 1, not 0, for the null character, so the end
         * of the string has to be tested for explicitly.
         */
        while (c < off && *s != L'\0') {
                int n;

                if ((n = wctomb(mb, *s)) == 0)
                        break;
                if (n == -1)
                        n = 1;
                c += n;
                s++;
        }

        return (s - astring);
}
1408 
/*
 * Cache of compiled regular expressions, keyed by pattern text.
 */
#define NREGHASH        64      /* number of hash chains in reghash[] */
#define NREGHOLD        1024    /* max number unused entries */

/* number of cache entries whose refcnt is currently zero */
static int      nregunref;

/*
 * Hash chain link.  `hq' has to stay the first member: int_regwcomp()
 * casts the struct qelem pointers found on a chain to struct reghashq.
 */
struct reghashq {
        struct qelem hq;
        struct regcache *regcachep;     /* entry this link belongs to */
};

/*
 * One cached expression.  `lq' has to stay the first member:
 * int_regwfree() casts the struct qelem pointers found on the reglink
 * list to struct regcache.
 */
struct regcache {
        struct qelem    lq;             /* link on the reglink list */
        wchar_t *pattern;               /* copy of the pattern text */
        regex_t re;                     /* the compiled expression */
        int     refcnt;                 /* number of users of this entry */
        struct reghashq hash;           /* link on a reghash[] chain */
};

/* reghash[]: heads of the hash chains; reglink: head of the usage list */
static struct qelem reghash[NREGHASH], reglink;
1428 
1429 /*
1430  * Generate a hash value of the given wchar string.
1431  * The hashing method is similar to what Java does for strings.
1432  */
1433 static uint_t
1434 regtxthash(const wchar_t *str)
1435 {
1436         int k = 0;
1437 
1438         while (*str != L'\0')
1439                 k = (31 * k) + *str++;
1440 
1441         k += ~(k << 9);
1442         k ^=  (k >> 14);
1443         k +=  (k << 4);
1444         k ^=  (k >> 10);
1445 
1446         return (k % NREGHASH);
1447 }
1448 
1449 int
1450 int_regwcomp(REGEXP *r, const wchar_t *pattern)
1451 {
1452         regex_t re;
1453         char *mbpattern;
1454         int ret;
1455         uint_t key;
1456         struct qelem *qp;
1457         struct regcache *rcp;
1458 
1459         key = regtxthash(pattern);
1460         for (qp = reghash[key].q_forw; qp != NULL; qp = qp->q_forw) {
1461                 rcp = ((struct reghashq *)qp)->regcachep;
1462                 if (*rcp->pattern == *pattern &&
1463                     wcscmp(rcp->pattern, pattern) == 0)
1464                         break;
1465         }
1466         if (qp != NULL) {
1467                 /* update link. put this one at the beginning */
1468                 if (rcp != (struct regcache *)reglink.q_forw) {
1469                         remque(&rcp->lq);
1470                         insque(&rcp->lq, &reglink);
1471                 }
1472                 if (rcp->refcnt == 0)
1473                         nregunref--;    /* no longer unref'ed */
1474                 rcp->refcnt++;
1475                 *(struct regcache **)r = rcp;
1476                 return (REG_OK);
1477         }
1478 
1479         if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL)
1480                 return (REG_ESPACE);
1481 
1482         ret = regcomp(&re, mbpattern, REG_EXTENDED);
1483 
1484         free(mbpattern);
1485 
1486         if (ret != REG_OK)
1487                 return (ret);
1488 
1489         if ((rcp = malloc(sizeof (struct regcache))) == NULL)
1490                 return (REG_ESPACE);
1491         rcp->re = re;
1492         if ((rcp->pattern = wsdup(pattern)) == NULL) {
1493                 regfree(&re);
1494                 free(rcp);
1495                 return (REG_ESPACE);
1496         }
1497         rcp->refcnt = 1;
1498         insque(&rcp->lq, &reglink);
1499         insque(&rcp->hash.hq, &reghash[key]);
1500         rcp->hash.regcachep = rcp;
1501 
1502         *(struct regcache **)r = rcp;
1503         return (ret);
1504 }
1505 
1506 void
1507 int_regwfree(REGEXP r)
1508 {
1509         int     cnt;
1510         struct qelem *qp, *nqp;
1511         struct regcache *rcp;
1512 
1513         rcp = (struct regcache *)r;
1514 
1515         if (--rcp->refcnt != 0)
1516                 return;
1517 
1518         /* this cache has no reference */
1519         if (++nregunref < NREGHOLD)
1520                 return;
1521 
1522         /*
1523          * We've got too much unref'ed regex. Free half of least
1524          * used regex.
1525          */
1526         cnt = 0;
1527         for (qp = reglink.q_forw; qp != NULL; qp = nqp) {
1528                 nqp = qp->q_forw;
1529                 rcp = (struct regcache *)qp;
1530                 if (rcp->refcnt != 0)
1531                         continue;
1532 
1533                 /* free half of them */
1534                 if (++cnt < (NREGHOLD / 2))
1535                         continue;
1536 
1537                 /* detach and free */
1538                 remque(&rcp->lq);
1539                 remque(&rcp->hash.hq);
1540 
1541                 /* free up */
1542                 free(rcp->pattern);
1543                 regfree(&rcp->re);
1544                 free(rcp);
1545 
1546                 nregunref--;
1547         }
1548 }
1549 
1550 size_t
1551 int_regwerror(int errcode, REGEXP r, char *errbuf, size_t bufsiz)
1552 {
1553         struct regcache *rcp;
1554 
1555         rcp = (struct regcache *)r;
1556         return (regerror(errcode, &rcp->re, errbuf, bufsiz));
1557 }
1558 
1559 int
1560 int_regwexec(REGEXP r,          /* compiled RE */
1561     const wchar_t *astring,     /* subject string */
1562     size_t nsub,                /* number of subexpressions */
1563     int_regwmatch_t *sub,       /* subexpression pointers */
1564     int flags)
1565 {
1566         char *mbs;
1567         regmatch_t *mbsub = NULL;
1568         int i;
1569         struct regcache *rcp;
1570 
1571         if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
1572                 return (REG_ESPACE);
1573 
1574         if (nsub > 0 && sub) {
1575                 if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1576                         return (REG_ESPACE);
1577         }
1578 
1579         rcp = (struct regcache *)r;
1580 
1581         i = regexec(&rcp->re, mbs, nsub, mbsub, flags);
1582 
1583         /* Now, adjust the pointers/counts in sub */
1584         if (i == REG_OK && nsub > 0 && mbsub) {
1585                 int j, k;
1586 
1587                 for (j = 0; j < nsub; j++) {
1588                         regmatch_t *ms = &mbsub[j];
1589                         int_regwmatch_t *ws = &sub[j];
1590 
1591                         if ((k = ms->rm_so) >= 0) {
1592                                 ws->rm_so = wcoff(astring, k);
1593                                 ws->rm_sp = astring + ws->rm_so;
1594                         }
1595                         if ((k = ms->rm_eo) >= 0) {
1596                                 ws->rm_eo = wcoff(astring, k);
1597                                 ws->rm_ep = astring + ws->rm_eo;
1598                         }
1599                 }
1600         }
1601 
1602         free(mbs);
1603         if (mbsub)
1604                 free(mbsub);
1605         return (i);
1606 }
1607 
1608 int
1609 int_regwdosuba(REGEXP rp,       /* compiled RE: Pattern */
1610     const wchar_t *rpl,         /* replacement string: /rpl/ */
1611     const wchar_t *src,         /* source string */
1612     wchar_t **dstp,             /* destination string */
1613     int len,                    /* destination length */
1614     int *globp)         /* IN: occurence, 0 for all; OUT: substitutions */
1615 {
1616         wchar_t *dst, *odst;
1617         const wchar_t *ip, *xp;
1618         wchar_t *op;
1619         int i;
1620         wchar_t c;
1621         int glob, iglob = *globp, oglob = 0;
1622 #define NSUB    10
1623         int_regwmatch_t rm[NSUB], *rmp;
1624         int flags;
1625         wchar_t *end;
1626         int regerr;
1627 
1628 /* handle overflow of dst. we need "i" more bytes */
1629 #ifdef OVERFLOW
1630 #undef OVERFLOW
1631 #define OVERFLOW(i) { \
1632                 int pos = op - dst; \
1633                 dst = (wchar_t *)realloc(odst = dst, \
1634                         (len += len + i) * sizeof (wchar_t)); \
1635                 if (dst == NULL) \
1636                         goto nospace; \
1637                 op = dst + pos; \
1638                 end = dst + len; \
1639         }
1640 #endif
1641 
1642         *dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
1643         if (dst == NULL)
1644                 return (REG_ESPACE);
1645 
1646         if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
1647                 return (REG_EFATAL);
1648 
1649         glob = 0;       /* match count */
1650         ip = src;       /* source position */
1651         op = dst;       /* destination position */
1652         end = dst + len;
1653 
1654         flags = 0;
1655         while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1656                 /* Copy text preceding match */
1657                 if (op + (i = rm[0].rm_sp - ip) >= end)
1658                         OVERFLOW(i)
1659                 while (i--)
1660                         *op++ = *ip++;
1661 
1662                 if (iglob == 0 || ++glob == iglob) {
1663                         oglob++;
1664                         xp = rpl;               /* do substitute */
1665                 } else
1666                         xp = L"&";          /* preserve text */
1667 
1668                 /* Perform replacement of matched substing */
1669                 while ((c = *xp++) != '\0') {
1670                         rmp = NULL;
1671                         if (c == '&')
1672                                 rmp = &rm[0];
1673                         else if (c == '\\') {
1674                                 if ('0' <= *xp && *xp <= '9')
1675                                         rmp = &rm[*xp++ - '0'];
1676                                 else if (*xp != '\0')
1677                                         c = *xp++;
1678                         }
1679 
1680                         if (rmp ==  NULL) {     /* Ordinary character. */
1681                                 *op++ = c;
1682                                 if (op >= end)
1683                                         OVERFLOW(1)
1684                         } else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1685                                 ip = rmp->rm_sp;
1686                                 if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1687                                         OVERFLOW(i)
1688                                 while (i--)
1689                                         *op++ = *ip++;
1690                         }
1691                 }
1692 
1693                 ip = rm[0].rm_ep;
1694                 if (*ip == '\0')        /* If at end break */
1695                         break;
1696                 else if (rm[0].rm_sp == rm[0].rm_ep) {
1697                         /* If empty match copy next char */
1698                         *op++ = *ip++;
1699                         if (op >= end)
1700                                 OVERFLOW(1)
1701                 }
1702                 flags = REG_NOTBOL;
1703         }
1704 
1705         if (regerr != REG_OK && regerr != REG_NOMATCH)
1706                 return (regerr);
1707 
1708         /* Copy rest of text */
1709         if (op + (i =  wcslen(ip)) >= end)
1710                 OVERFLOW(i)
1711         while (i--)
1712                 *op++ = *ip++;
1713         *op++ = '\0';
1714 
1715         if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
1716             sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1717 nospace:
1718                 free(odst);
1719                 return (REG_ESPACE);
1720         }
1721 
1722         *globp = oglob;
1723 
1724         return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1725 }