iltre Wdiff usr/src/cmd/awk_xpg4/awk1.c

Print this page

9083 replace regex implementation with tre

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/awk_xpg4/awk1.c
          +++ new/usr/src/cmd/awk_xpg4/awk1.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
  28   28   */
  29   29  
  30   30  /*
  31   31   * awk -- mainline, yylex, etc.
  32   32   *
  33   33   * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
  34   34   */
  35   35  
  36   36  #include "awk.h"
  37   37  #include "y.tab.h"
  38   38  #include <stdarg.h>
  39   39  #include <unistd.h>
  40   40  #include <locale.h>
  41   41  #include <search.h>
  42   42  
           /*
            * File-scope state used by main() and the lexical analyser, followed by
            * the prototypes of the helpers this file declares.
            */
  43   43  static char     *progfiles[NPFILE];     /* Program files for yylex (-f) */
  44   44  static char     **progfilep = &progfiles[0]; /* Next free slot in progfiles */
  45   45  static wchar_t  *progptr;               /* In-memory program text */
  46   46  static int      proglen;                /* Length of progptr */
  47   47  static wchar_t  context[NCONTEXT];      /* Circular buffer of context */
  48   48  static wchar_t  *conptr = &context[0];  /* context ptr */
  49   49  static FILE     *progfp;                /* Stdio stream for program file */
  50   50  static char     *filename;              /* Name of the program source */
  51   51  #ifdef  DEBUG
  52   52  static int      dflag;                  /* Set by -d; yylex prints tokens */
  53   53  #endif
  54   54  
           /*
            * LEN_EXEC_MAGIC is the length of AWK_EXEC_MAGIC without its NUL.
            * NOTE(review): neither macro is used in the part of the file seen
            * here; presumably they identify a pre-compiled program -- confirm.
            */
  55   55  #define AWK_EXEC_MAGIC  "<MKS AWKC>"
  56   56  #define LEN_EXEC_MAGIC  10
  57   57  
           /* Message handed to awkerr() by yylex for an unmatched closing bracket. */
  58   58  static char     unbal[] = "unbalanced E char";
  59   59  
           /* Start-up helpers and the pieces of the lexical analyser. */
  60   60  static void     awkarginit(int c, char **av);
  61   61  static int      lexid(wint_t c);
  62   62  static int      lexnumber(wint_t c);
  63   63  static int      lexstring(wint_t endc);
  64   64  static int      lexregexp(wint_t endc);
  65   65  
  66   66  static void     awkvarinit(void);
  67   67  static wint_t   lexgetc(void);
  68   68  static void     lexungetc(wint_t c);
  69   69  static size_t   lexescape(wint_t endc, int regx, int cmd_line_operand);
  70   70  static void     awkierr(int perr, char *fmt, va_list ap);
  71   71  static int      usage(void);
  72   72  void            strescape(wchar_t *str);
  73   73  static const char       *toprint(wint_t);
           /* Set to argv[0] at the top of main(). */
  74   74  char *_cmdname;
  75   75  static wchar_t *mbconvert(char *str);
  76   76  
           /* Used by main() to validate the operand of -v. */
  77   77  extern int      isclvar(wchar_t *arg);
  78   78  
  79   79  /*
  80   80   * mainline for awk
  81   81   */
  82   82  int
  83   83  main(int argc, char *argv[])
  84   84  {
  85   85          wchar_t *ap;
  86   86          char *cmd;
  87   87  
  88   88          cmd = argv[0];
  89   89          _cmdname = cmd;
  90   90  
  91   91          linebuf = emalloc(NLINE * sizeof (wchar_t));
  92   92  
  93   93          /*
  94   94           * At this point only messaging should be internationalized.
  95   95           * numbers are still scanned as in the Posix locale.
  96   96           */
  97   97          (void) setlocale(LC_ALL, "");
  98   98          (void) setlocale(LC_NUMERIC, "C");
  99   99  #if !defined(TEXT_DOMAIN)
 100  100  #define TEXT_DOMAIN     "SYS_TEST"
 101  101  #endif
 102  102          (void) textdomain(TEXT_DOMAIN);
 103  103  
 104  104          awkvarinit();
 105  105          /* running = 1; */
 106  106          while (argc > 1 && *argv[1] == '-') {
 107  107                  void *save_ptr = NULL;
 108  108                  ap = mbstowcsdup(&argv[1][1]);
 109  109                  if (ap == NULL)
 110  110                          break;
 111  111                  if (*ap == '\0') {
 112  112                          free(ap);
 113  113                          break;
 114  114                  }
 115  115                  save_ptr = (void *) ap;
 116  116                  ++argv;
 117  117                  --argc;
 118  118                  if (*ap == '-' && ap[1] == '\0')
 119  119                          break;
 120  120                  for (; *ap != '\0'; ++ap) {
 121  121                          switch (*ap) {
 122  122  #ifdef DEBUG
 123  123                          case 'd':
 124  124                                  dflag = 1;
 125  125                                  continue;
 126  126  
 127  127  #endif
 128  128                          case 'f':
 129  129                                  if (argc < 2) {
 130  130                                          (void) fprintf(stderr,
 131  131                                  gettext("Missing script file\n"));
 132  132                                          return (1);
 133  133                                  }
 134  134                                  *progfilep++ = argv[1];
 135  135                                  --argc;
 136  136                                  ++argv;
 137  137                                  continue;
 138  138  
 139  139                          case 'F':
 140  140                                  if (ap[1] == '\0') {
 141  141                                          if (argc < 2) {
 142  142                                                  (void) fprintf(stderr,
 143  143                                  gettext("Missing field separator\n"));
 144  144                                                  return (1);
 145  145                                          }
 146  146                                          ap = mbstowcsdup(argv[1]);
 147  147                                          --argc;
 148  148                                          ++argv;
 149  149                                  } else
 150  150                                          ++ap;
 151  151                                  strescape(ap);
 152  152                                  strassign(varFS, linebuf, FALLOC,
 153  153                                      wcslen(linebuf));
 154  154                                  break;
 155  155  
 156  156                          case 'v': {
 157  157                                  wchar_t *vp;
 158  158                                  wchar_t *arg;
 159  159  
 160  160                                  if (argc < 2) {
 161  161                                          (void) fprintf(stderr,
 162  162                  gettext("Missing variable assignment\n"));
 163  163                                          return (1);
 164  164                                  }
 165  165                                  arg = mbconvert(argv[1]);
 166  166                                  /*
 167  167                                   * Ensure the variable expression
 168  168                                   * is valid (correct form).
 169  169                                   */
 170  170                                  if (((vp = wcschr(arg, '=')) != NULL) &&
 171  171                                      isclvar(arg)) {
 172  172                                          *vp = '\0';
 173  173                                          strescape(vp+1);
 174  174                                          strassign(vlook(arg), linebuf,
 175  175                                              FALLOC|FSENSE,
 176  176                                              wcslen(linebuf));
 177  177                                          *vp = '=';
 178  178                                  } else {
 179  179                                          (void) fprintf(stderr, gettext(
 180  180                                              "Invalid form for variable "
 181  181                                              "assignment: %S\n"), arg);
 182  182                                          return (1);
 183  183                                  }
 184  184                                  --argc;
 185  185                                  ++argv;
 186  186                                  continue;
 187  187                          }
 188  188  
 189  189                          default:
 190  190                                  (void) fprintf(stderr,
 191  191                                  gettext("Unknown option \"-%S\"\n"), ap);
 192  192                                  return (usage());
 193  193                          }
 194  194                          break;
 195  195                  }
 196  196                  if (save_ptr)
 197  197                          free(save_ptr);
 198  198          }
 199  199          if (progfilep == &progfiles[0]) {
 200  200                  if (argc < 2)
 201  201                          return (usage());
 202  202                  filename = "[command line]";    /* BUG: NEEDS TRANSLATION */
 203  203                  progptr = mbstowcsdup(argv[1]);
 204  204                  proglen = wcslen(progptr);
 205  205                  --argc;
 206  206                  ++argv;
 207  207          }
 208  208  
 209  209          argv[0] = cmd;
 210  210  
 211  211          awkarginit(argc, argv);
 212  212  
 213  213          /* running = 0; */
 214  214          (void) yyparse();
 215  215  
 216  216          lineno = 0;
 217  217          /*
 218  218           * Ok, done parsing, so now activate the rest of the nls stuff, set
 219  219           * the radix character.
 220  220           */
 221  221          (void) setlocale(LC_ALL, "");
 222  222          radixpoint = *localeconv()->decimal_point;
 223  223          awk();
 224  224          /* NOTREACHED */
 225  225          return (0);
 226  226  }
 227  227  
 228  228  /*
 229  229   * Do initial setup of buffers, etc.
 230  230   * This must be called before most processing
 231  231   * and especially before lexical analysis.
 232  232   * Variables initialised here will be overruled by command
 233  233   * line parameter initialisation.
 234  234   */
 235  235  static void
 236  236  awkvarinit()
 237  237  {
 238  238          NODE *np;
 239  239  
 240  240          (void) setvbuf(stderr, NULL, _IONBF, 0);
 241  241  
 242  242          if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
 243  243                  (void) fprintf(stderr,
 244  244          gettext("not enough available file descriptors"));
 245  245                  exit(1);
 246  246          }
 247  247          ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
 248  248  #ifdef A_ZERO_POINTERS
 249  249          (void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
 250  250  #else
 251  251          {
 252  252                  /* initialize file descriptor table */
 253  253                  OFILE *fp;
 254  254                  for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
 255  255                          fp->f_fp = FNULL;
 256  256                                          fp->f_mode = 0;
 257  257                                          fp->f_name = (char *)0;
 258  258                  }
 259  259          }
 260  260  #endif
 261  261          constant = intnode((INT)0);
 262  262  
 263  263          const0 = intnode((INT)0);
 264  264          const1 = intnode((INT)1);
 265  265          constundef = emptynode(CONSTANT, 0);
 266  266          constundef->n_flags = FSTRING|FVINT;
 267  267          constundef->n_string = _null;
 268  268          constundef->n_strlen = 0;
 269  269          inc_oper = emptynode(ADD, 0);
 270  270          inc_oper->n_right = const1;
 271  271          asn_oper = emptynode(ADD, 0);
 272  272          field0 = node(FIELD, const0, NNULL);
 273  273  
 274  274          {
 275  275                  RESFUNC near*rp;
 276  276  
 277  277                  for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
 278  278                          np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
 279  279                  }
 280  280          }
 281  281          {
 282  282                  RESERVED near*rp;
 283  283  
 284  284                  for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
 285  285                          switch (rp->r_type) {
 286  286                          case SVAR:
 287  287                          case VAR:
 288  288                                  running = 1;
 289  289                                  np = vlook(rp->r_name);
 290  290                                  if (rp->r_type == SVAR)
 291  291                                          np->n_flags |= FSPECIAL;
 292  292                                  if (rp->r_svalue != NULL)
 293  293                                          strassign(np, rp->r_svalue, FSTATIC,
 294  294                                              (size_t)rp->r_ivalue);
 295  295                                  else {
 296  296                                          constant->n_int = rp->r_ivalue;
 297  297                                          (void) assign(np, constant);
 298  298                                  }
 299  299                                  running = 0;
 300  300                                  break;
 301  301  
 302  302                          case KEYWORD:
 303  303                                  kinstall(rp->r_name, (int)rp->r_ivalue);
 304  304                                  break;
 305  305                          }
 306  306                  }
 307  307          }
 308  308  
 309  309          varNR = vlook(s_NR);
 310  310          varFNR = vlook(s_FNR);
 311  311          varNF = vlook(s_NF);
 312  312          varOFMT = vlook(s_OFMT);
 313  313          varCONVFMT = vlook(s_CONVFMT);
 314  314          varOFS = vlook(s_OFS);
 315  315          varORS = vlook(s_ORS);
 316  316          varRS = vlook(s_RS);
 317  317          varFS = vlook(s_FS);
 318  318          varARGC = vlook(s_ARGC);
 319  319          varSUBSEP = vlook(s_SUBSEP);
 320  320          varENVIRON = vlook(s_ENVIRON);
 321  321          varFILENAME = vlook(s_FILENAME);
 322  322          varSYMTAB = vlook(s_SYMTAB);
 323  323          incNR = node(ASG, varNR, node(ADD, varNR, const1));
 324  324          incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
 325  325          clrFNR = node(ASG, varFNR, const0);
 326  326  }
 327  327  
 328  328  /*
 329  329   * Initialise awk ARGC, ARGV variables.
 330  330   */
 331  331  static void
 332  332  awkarginit(int ac, char **av)
 333  333  {
 334  334          int i;
 335  335          wchar_t *cp;
 336  336  
 337  337          ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
 338  338          running = 1;
 339  339          constant->n_int = ac;
 340  340          (void) assign(varARGC, constant);
 341  341          for (i = 0; i < ac; ++i) {
 342  342                  cp = mbstowcsdup(av[i]);
 343  343                  constant->n_int = i;
 344  344                  strassign(exprreduce(ARGVsubi), cp,
 345  345                      FSTATIC|FSENSE, wcslen(cp));
 346  346          }
 347  347          running = 0;
 348  348  }
 349  349  
 350  350  /*
 351  351   * Clean up when done parsing a function.
 352  352   * All formal parameters, because of a deal (funparm) in
 353  353   * yylex, get put into the symbol table in front of any
 354  354   * global variable of the same name.  When the entire
 355  355   * function is parsed, remove these formal dummy nodes
 356  356   * from the symbol table but retain the nodes because
 357  357   * the generated tree points at them.
 358  358   */
 359  359  void
 360  360  uexit(NODE *np)
 361  361  {
 362  362          NODE *formal;
 363  363  
 364  364          while ((formal = getlist(&np)) != NNULL)
 365  365                  delsymtab(formal, 0);
 366  366  }
 367  367  
 368  368  /*
 369  369   * The lexical analyzer.
            *
            * Returns the next token as an int.  The raw token comes from one of
            * three places: a token deferred by the previous call (savetoken), a
            * regular expression whose delimiter is pending in redelim, or the
            * program text read through lexgetc().  The raw token is then
            * post-processed: an explicit CONCAT token is inserted between
            * adjacent terms, statement terminators are supplied around '}' and
            * ELSE, and single-character tokens are mapped to symbolic names.
            * At end of input the WEOF value read from lexgetc() goes through
            * the same post-processing and is returned.
 370  370   */
 371  371  int
 372  372  yylex()
 373  373  {
 374  374          wint_t c, c1;
 375  375          int i;
                   /* token held back by the previous call, 0 when none */
 376  376          static int savetoken = 0;
                   /* countdown started when a CONSTANT follows '$' */
 377  377          static int wasfield;
                   /* set after DEFFUNC, cleared by the next VAR */
 378  378          static int isfuncdef;
                   /* current nesting depth of {}, () and [] */
 379  379          static int nbrace, nparen, nbracket;
                   /* single characters and the symbolic token each maps to */
 380  380          static struct ctosymstruct {
 381  381                  wint_t c, sym;
 382  382          } ctosym[] = {
 383  383                  { '|', BAR },           { '^', CARAT },
 384  384                  { '~', TILDE },         { '<', LANGLE },
 385  385                  { '>', RANGLE },        { '+', PLUSC },
 386  386                  { '-', HYPHEN },        { '*', STAR },
 387  387                  { '/', SLASH },         { '%', PERCENT },
 388  388                  { '!', EXCLAMATION },   { '$', DOLLAR },
 389  389                  { '[', LSQUARE },       { ']', RSQUARE },
 390  390                  { '(', LPAREN },        { ')', RPAREN },
 391  391                  { ';', SEMI },          { '{', LBRACE },
 392  392                  { '}', RBRACE },        {   0, 0 }
 393  393          };
 394  394  
 395  395          if (savetoken) {
                           /* deliver the token deferred by the previous call */
 396  396                  c = savetoken;
 397  397                  savetoken = 0;
 398  398          } else if (redelim != '\0') {
                           /*
                            * A regular expression is pending: read it with
                            * lexregexp(), queue its delimiter as the next token
                            * and return at once, skipping the post-processing.
                            */
 399  399                  c = redelim;
 400  400                  redelim = 0;
 401  401                  catterm = 0;
 402  402                  savetoken = c;
 403  403                  c = lexlast = lexregexp(c);
 404  404                  goto out;
 405  405          } else while ((c = lexgetc()) != WEOF) {
                           /*
                            * "continue" discards the character and reads on;
                            * reaching the "break" at the bottom of the loop
                            * means c holds a token.
                            */
 406  406                  if (iswalpha(c) || c == '_') {
 407  407                          c = lexid(c);
 408  408                  } else if (iswdigit(c) || c == '.') {
 409  409                          c = lexnumber(c);
 410  410                  } else if (isWblank(c)) {
 411  411                          continue;
 412  412                  } else switch (c) {
 413  413  #if DOS || OS2
 414  414                  case 032:               /* ^Z */
 415  415                          continue;
 416  416  #endif
 417  417  
 418  418                  case '"':
 419  419                          c = lexstring(c);
 420  420                          break;
 421  421  
 422  422                  case '#':
                                   /* comment: skip to, but do not consume, the newline */
 423  423                          while ((c = lexgetc()) != '\n' && c != WEOF)
 424  424                                  ;
 425  425                          lexungetc(c);
 426  426                          continue;
 427  427  
 428  428                  case '+':
 429  429                          if ((c1 = lexgetc()) == '+')
 430  430                                  c = INC;
 431  431                          else if (c1 == '=')
 432  432                                  c = AADD;
 433  433                          else
 434  434                                  lexungetc(c1);
 435  435                          break;
 436  436  
 437  437                  case '-':
 438  438                          if ((c1 = lexgetc()) == '-')
 439  439                                  c = DEC;
 440  440                          else if (c1 == '=')
 441  441                                  c = ASUB;
 442  442                          else
 443  443                                  lexungetc(c1);
 444  444                          break;
 445  445  
 446  446                  case '*':
                                   /* "*=", "**=", "**" or plain '*' */
 447  447                          if ((c1 = lexgetc()) == '=')
 448  448                                  c = AMUL;
 449  449                          else if (c1 == '*') {
 450  450                                  if ((c1 = lexgetc()) == '=')
 451  451                                          c = AEXP;
 452  452                                  else {
 453  453                                          c = EXP;
 454  454                                          lexungetc(c1);
 455  455                                  }
 456  456                          } else
 457  457                                  lexungetc(c1);
 458  458                          break;
 459  459  
 460  460                  case '^':
 461  461                          if ((c1 = lexgetc()) == '=') {
 462  462                                  c = AEXP;
 463  463                          } else {
 464  464                                  c = EXP;
 465  465                                  lexungetc(c1);
 466  466                          }
 467  467                          break;
 468  468  
 469  469                  case '/':
                                   /*
                                    * "/=" is the divide-assign operator unless the
                                    * previous token is one of those listed below;
                                    * presumably a regular expression may begin after
                                    * those -- confirm against the grammar.
                                    */
 470  470                          if ((c1 = lexgetc()) == '=' &&
 471  471                              lexlast != RE && lexlast != NRE &&
 472  472                              lexlast != ';' && lexlast != '\n' &&
 473  473                              lexlast != ',' && lexlast != '(')
 474  474                                  c = ADIV;
 475  475                          else
 476  476                                  lexungetc(c1);
 477  477                          break;
 478  478  
 479  479                  case '%':
 480  480                          if ((c1 = lexgetc()) == '=')
 481  481                                  c = AREM;
 482  482                          else
 483  483                                  lexungetc(c1);
 484  484                          break;
 485  485  
 486  486                  case '&':
 487  487                          if ((c1 = lexgetc()) == '&')
 488  488                                  c = AND;
 489  489                          else
 490  490                                  lexungetc(c1);
 491  491                          break;
 492  492  
 493  493                  case '|':
                                   /* a single '|' is PIPE only while inprint is set */
 494  494                          if ((c1 = lexgetc()) == '|')
 495  495                                  c = OR;
 496  496                          else {
 497  497                                  lexungetc(c1);
 498  498                                  if (inprint)
 499  499                                          c = PIPE;
 500  500                          }
 501  501                          break;
 502  502  
 503  503                  case '>':
                                   /*
                                    * a single '>' is WRITE only while inprint is set
                                    * and no parenthesis is open
                                    */
 504  504                          if ((c1 = lexgetc()) == '=')
 505  505                                  c = GE;
 506  506                          else if (c1 == '>')
 507  507                                  c = APPEND;
 508  508                          else {
 509  509                                  lexungetc(c1);
 510  510                                  if (nparen == 0 && inprint)
 511  511                                          c = WRITE;
 512  512                          }
 513  513                          break;
 514  514  
 515  515                  case '<':
 516  516                          if ((c1 = lexgetc()) == '=')
 517  517                                  c = LE;
 518  518                          else
 519  519                                  lexungetc(c1);
 520  520                          break;
 521  521  
 522  522                  case '!':
 523  523                          if ((c1 = lexgetc()) == '=')
 524  524                                  c = NE;
 525  525                          else if (c1 == '~')
 526  526                                  c = NRE;
 527  527                          else
 528  528                                  lexungetc(c1);
 529  529                          break;
 530  530  
 531  531                  case '=':
 532  532                          if ((c1 = lexgetc()) == '=')
 533  533                                  c = EQ;
 534  534                          else {
 535  535                                  lexungetc(c1);
 536  536                                  c = ASG;
 537  537                          }
 538  538                          break;
 539  539  
 540  540                  case '\n':
                                   /*
                                    * A newline is either skipped or turned into ';',
                                    * depending on the previous token.
                                    */
 541  541                          switch (lexlast) {
 542  542                          case ')':
 543  543                                  if (catterm || inprint) {
 544  544                                          c = ';';
 545  545                                          break;
 546  546                                  }
 547  547                          /*FALLTHRU*/
 548  548                          case AND:
 549  549                          case OR:
 550  550                          case COMMA:
 551  551                          case '{':
 552  552                          case ELSE:
 553  553                          case ';':
 554  554                          case DO:
 555  555                                  continue;
 556  556  
 557  557                          case '}':
 558  558                                  if (nbrace != 0)
 559  559                                          continue;
                                   /*FALLTHRU*/
 560  560  
 561  561                          default:
 562  562                                  c = ';';
 563  563                                  break;
 564  564                          }
 565  565                          break;
 566  566  
                           /*
                            * NOTE(review): ELSE is a token value, yet this switch is
                            * on a character read by lexgetc(); unclear when this case
                            * can be reached -- confirm.
                            */
 567  567                  case ELSE:
 568  568                          if (lexlast != ';') {
 569  569                                  savetoken = ELSE;
 570  570                                  c = ';';
 571  571                          }
 572  572                          break;
 573  573  
 574  574                  case '(':
 575  575                          ++nparen;
 576  576                          break;
 577  577  
 578  578                  case ')':
 579  579                          if (--nparen < 0)
 580  580                                  awkerr(unbal, "()");
 581  581                          break;
 582  582  
 583  583                  case '{':
 584  584                          nbrace++;
 585  585                          break;
 586  586  
 587  587                  case '}':
 588  588                          if (--nbrace < 0) {
 589  589                                  char brk[3];
 590  590  
 591  591                                  brk[0] = '{';
 592  592                                  brk[1] = '}';
 593  593                                  brk[2] = '\0';
 594  594                                  awkerr(unbal, brk);
 595  595                          }
                                   /* return ';' first and the '}' on the next call */
 596  596                          if (lexlast != ';') {
 597  597                                  savetoken = c;
 598  598                                  c = ';';
 599  599                          }
 600  600                          break;
 601  601  
 602  602                  case '[':
 603  603                          ++nbracket;
 604  604                          break;
 605  605  
 606  606                  case ']':
 607  607                          if (--nbracket < 0) {
 608  608                                  char brk[3];
 609  609  
 610  610                                  brk[0] = '[';
 611  611                                  brk[1] = ']';
 612  612                                  brk[2] = '\0';
 613  613                                  awkerr(unbal, brk);
 614  614                          }
 615  615                          break;
 616  616  
 617  617                  case '\\':
                                   /* backslash-newline is dropped; other uses return '\\' */
 618  618                          if ((c1 = lexgetc()) == '\n')
 619  619                                  continue;
 620  620                          lexungetc(c1);
 621  621                          break;
 622  622  
 623  623                  case ',':
 624  624                          c = COMMA;
 625  625                          break;
 626  626  
 627  627                  case '?':
 628  628                          c = QUEST;
 629  629                          break;
 630  630  
 631  631                  case ':':
 632  632                          c = COLON;
 633  633                          break;
 634  634  
 635  635                  default:
 636  636                          if (!iswprint(c))
 637  637                                  awkerr(
 638  638                                      gettext("invalid character \"%s\""),
 639  639                                      toprint(c));
 640  640                          break;
 641  641                  }
 642  642                  break;
 643  643          }
 644  644  
                   /*
                    * Post-process the token.  catterm is raised after ']', after a
                    * VAR that is not followed by '(', and after a PARM or CONSTANT.
                    * While it is raised, the tokens handled below are held back in
                    * savetoken and CONCAT is returned first.
                    */
 645  645          switch (c) {
 646  646          case ']':
 647  647                  ++catterm;
 648  648                  break;
 649  649  
 650  650          case VAR:
 651  651                  if (catterm) {
 652  652                          savetoken = c;
 653  653                          c = CONCAT;
 654  654                          catterm = 0;
 655  655                  } else if (!isfuncdef) {
                                   /* peek at the next character without consuming it */
 656  656                          if ((c1 = lexgetc()) != '(')
 657  657                                  ++catterm;
 658  658                          lexungetc(c1);
 659  659                  }
 660  660                  isfuncdef = 0;
 661  661                  break;
 662  662  
 663  663          case PARM:
 664  664          case CONSTANT:
 665  665                  if (catterm) {
 666  666                          savetoken = c;
 667  667                          c = CONCAT;
 668  668                          catterm = 0;
 669  669                  } else {
 670  670                          if (lexlast == '$')
 671  671                                  wasfield = 2;
 672  672                          ++catterm;
 673  673                  }
 674  674                  break;
 675  675  
 676  676          case INC:
 677  677          case DEC:
                           /*
                            * Only an INC/DEC right after a CONSTANT that was not a
                            * field number goes on to the CONCAT handling below.
                            */
 678  678                  if (!catterm || lexlast != CONSTANT || wasfield)
 679  679                          break;
 680  680  
 681  681          /*FALLTHRU*/
 682  682          case UFUNC:
 683  683          case FUNC:
 684  684          case GETLINE:
 685  685          case '!':
 686  686          case '$':
 687  687          case '(':
 688  688                  if (catterm) {
 689  689                          savetoken = c;
 690  690                          c = CONCAT;
 691  691                          catterm = 0;
 692  692                  }
 693  693                  break;
 694  694  
 695  695          case '}':
                           /* the outermost '}' is followed by a ';' on the next call */
 696  696                  if (nbrace == 0)
 697  697                          savetoken = ';';
 698  698          /*FALLTHRU*/
 699  699          case ';':
 700  700                  inprint = 0;
 701  701          /*FALLTHRU*/
 702  702          default:
 703  703                  if (c == DEFFUNC)
 704  704                          isfuncdef = 1;
 705  705                  catterm = 0;
 706  706          }
 707  707          lexlast = c;
 708  708          if (wasfield)
 709  709                  wasfield--;
 710  710          /*
 711  711           * Map character constants to symbolic names.
 712  712           */
 713  713          for (i = 0; ctosym[i].c != 0; i++)
 714  714                  if (c == ctosym[i].c) {
 715  715                          c = ctosym[i].sym;
 716  716                          break;
 717  717                  }
 718  718  out:
 719  719  #ifdef DEBUG
 720  720          if (dflag)
 721  721                  (void) printf("%d\n", (int)c);
 722  722  #endif
 723  723          return ((int)c);
 724  724  }
 725  725  
 726  726  /*
 727  727   * Read a number for the lexical analyzer.
 728  728   * Input is the first character of the number.
 729  729   * Return value is the lexical type.
 730  730   */
 731  731  static int
 732  732  lexnumber(wint_t c)
 733  733  {
 734  734          wchar_t *cp;
 735  735          int dotfound = 0;
 736  736          int efound = 0;
 737  737          INT number;
 738  738  
 739  739          cp = linebuf;
 740  740          do {
 741  741                  if (iswdigit(c))
 742  742                          ;
 743  743                  else if (c == '.') {
 744  744                          if (dotfound++)
 745  745                                  break;
 746  746                  } else if (c == 'e' || c == 'E') {
 747  747                          if ((c = lexgetc()) != '-' && c != '+') {
 748  748                                  lexungetc(c);
 749  749                                  c = 'e';
 750  750                          } else
 751  751                                  *cp++ = 'e';
 752  752                          if (efound++)
 753  753                                  break;
 754  754                  } else
 755  755                          break;
 756  756                  *cp++ = c;
 757  757          } while ((c = lexgetc()) != WEOF);
 758  758          *cp = '\0';
 759  759          if (dotfound && cp == linebuf+1)
 760  760                  return (DOT);
 761  761          lexungetc(c);
 762  762          errno = 0;
 763  763          if (!dotfound && !efound &&
 764  764              ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
 765  765                  yylval.node = intnode(number);
 766  766          else
 767  767                  yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
 768  768          return (CONSTANT);
 769  769  }
 770  770  
 771  771  /*
 772  772   * Read an identifier.
 773  773   * Input is first character of identifier.
 774  774   * Return VAR.
 775  775   */
 776  776  static int
 777  777  lexid(wint_t c)
 778  778  {
 779  779          wchar_t *cp;
 780  780          size_t i;
 781  781          NODE *np;
 782  782  
 783  783          cp = linebuf;
 784  784          do {
 785  785                  *cp++ = c;
 786  786                  c = lexgetc();
 787  787          } while (iswalpha(c) || iswdigit(c) || c == '_');
 788  788          *cp = '\0';
 789  789          lexungetc(c);
 790  790          yylval.node = np = vlook(linebuf);
 791  791  
 792  792          switch (np->n_type) {
 793  793          case KEYWORD:
 794  794                  switch (np->n_keywtype) {
 795  795                  case PRINT:
 796  796                  case PRINTF:
 797  797                          ++inprint;
 798  798                  default:
 799  799                          return ((int)np->n_keywtype);
 800  800                  }
 801  801                  /* NOTREACHED */
 802  802  
 803  803          case ARRAY:
 804  804          case VAR:
 805  805                  /*
 806  806                   * If reading the argument list, create a dummy node
 807  807                   * for the duration of that function. These variables
 808  808                   * can be removed from the symbol table at function end
 809  809                   * but they must still exist because the execution tree
 810  810                   * knows about them.
 811  811                   */
 812  812                  if (funparm) {
 813  813  do_funparm:
 814  814                          np = emptynode(PARM, i = (cp-linebuf));
 815  815                          np->n_flags = FSTRING;
 816  816                          np->n_string = _null;
 817  817                          np->n_strlen = 0;
 818  818                          (void) memcpy(np->n_name, linebuf,
 819  819                              (i+1) * sizeof (wchar_t));
 820  820                          addsymtab(np);
 821  821                          yylval.node = np;
 822  822                  } else if (np == varNF || (np == varFS &&
 823  823                      (!doing_begin || begin_getline))) {
 824  824                          /*
 825  825                           * If the user program references NF or sets
 826  826                           * FS either outside of a begin block or
 827  827                           * in a begin block after a getline then the
 828  828                           * input line will be split immediately upon read
 829  829                           * rather than when a field is first referenced.
 830  830                           */
 831  831                          needsplit = 1;
 832  832                  } else if (np == varENVIRON)
 833  833                          needenviron = 1;
 834  834          /*FALLTHRU*/
 835  835          case PARM:
 836  836                  return (VAR);
 837  837  
 838  838          case UFUNC:
 839  839                  /*
 840  840                   * It is ok to redefine functions as parameters
 841  841                   */
 842  842                  if (funparm) goto do_funparm;
 843  843          /*FALLTHRU*/
 844  844          case FUNC:
 845  845          case GETLINE:
 846  846                  /*
 847  847                   * When a getline is encountered, clear the 'doing_begin' flag.
 848  848                   * This will force the 'needsplit' flag to be set, even inside
 849  849                   * a begin block, if FS is altered. (See VAR case above)
 850  850                   */
 851  851                  if (doing_begin)
 852  852                          begin_getline = 1;
 853  853                  return (np->n_type);
 854  854          }
 855  855          /* NOTREACHED */
 856  856          return (0);
 857  857  }
 858  858  
 859  859  /*
 860  860   * Read a string for the lexical analyzer.
 861  861   * `endc' terminates the string.
 862  862   */
 863  863  static int
 864  864  lexstring(wint_t endc)
 865  865  {
 866  866          size_t length = lexescape(endc, 0, 0);
 867  867  
 868  868          yylval.node = stringnode(linebuf, FALLOC, length);
 869  869          return (CONSTANT);
 870  870  }
 871  871  
 872  872  /*
 873  873   * Read a regular expression.
 874  874   */
 875  875  static int
 876  876  lexregexp(wint_t endc)
 877  877  {
 878  878          (void) lexescape(endc, 1, 0);
 879  879          yylval.node = renode(linebuf);
 880  880          return (URE);
 881  881  }
 882  882  
 883  883  /*
 884  884   * Process a string, converting the escape characters as required by
 885  885   * 1003.2. The processed string ends up in the global linebuf[]. This
 886  886   * routine also changes the value of 'progfd' - the program file
 887  887   * descriptor, so it should be used with some care. It is presently used to
 888  888   * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
 889  889   */
 890  890  void
 891  891  strescape(wchar_t *str)
 892  892  {
 893  893          progptr = str;
 894  894          proglen = wcslen(str) + 1;      /* Include \0 */
 895  895          (void) lexescape('\0', 0, 1);
 896  896          progptr = NULL;
 897  897  }
 898  898  
 899  899  /*
 900  900   * Read a string or regular expression, terminated by ``endc'',
 901  901   * for lexical analyzer, processing escape sequences.
 902  902   * Return string length.
 903  903   */
 904  904  static size_t
 905  905  lexescape(wint_t endc, int regx, int cmd_line_operand)
 906  906  {
 907  907          static char nlre[256];
 908  908          static char nlstr[256];
 909  909          static char eofre[256];
 910  910          static char eofstr[256];
 911  911          int first_time = 1;
 912  912          wint_t c;
 913  913          wchar_t *cp;
 914  914          int n, max;
 915  915  
 916  916          if (first_time == 1) {
 917  917                  (void) strcpy(nlre, gettext("Newline in regular expression\n"));
 918  918                  (void) strcpy(nlstr, gettext("Newline in string\n"));
 919  919                  (void) strcpy(eofre, gettext("EOF in regular expression\n"));
 920  920                  (void) strcpy(eofstr, gettext("EOF in string\n"));
 921  921                  first_time = 0;
 922  922          }
 923  923  
 924  924          cp = linebuf;
 925  925          while ((c = lexgetc()) != endc) {
 926  926                  if (c == '\n')
 927  927                          awkerr(regx ? nlre : nlstr);
 928  928                  if (c == '\\') {
 929  929                          switch (c = lexgetc(), c) {
 930  930                          case '\\':
 931  931                                  if (regx)
 932  932                                          *cp++ = '\\';
 933  933                                  break;
 934  934  
 935  935                          case '/':
 936  936                                  c = '/';
 937  937                                  break;
 938  938  
 939  939                          case 'n':
 940  940                                  c = '\n';
 941  941                                  break;
 942  942  
 943  943                          case 'b':
 944  944                                  c = '\b';
 945  945                                  break;
 946  946  
 947  947                          case 't':
 948  948                                  c = '\t';
 949  949                                  break;
 950  950  
 951  951                          case 'r':
 952  952                                  c = '\r';
 953  953                                  break;
 954  954  
 955  955                          case 'f':
 956  956                                  c = '\f';
 957  957                                  break;
 958  958  
 959  959                          case 'v':
 960  960                                  c = '\v';
 961  961                                  break;
 962  962  
 963  963                          case 'a':
 964  964                                  c = (char)0x07;
 965  965                                  break;
 966  966  
 967  967                          case 'x':
 968  968                                  n = 0;
 969  969                                  while (iswxdigit(c = lexgetc())) {
 970  970                                          if (iswdigit(c))
 971  971                                                  c -= '0';
 972  972                                          else if (iswupper(c))
 973  973                                                  c -= 'A'-10;
 974  974                                          else
 975  975                                                  c -= 'a'-10;
 976  976                                          n = (n<<4) + c;
 977  977                                  }
 978  978                                  lexungetc(c);
 979  979                                  c = n;
 980  980                                  break;
 981  981  
 982  982                          case '0':
 983  983                          case '1':
 984  984                          case '2':
 985  985                          case '3':
 986  986                          case '4':
 987  987                          case '5':
 988  988                          case '6':
 989  989                          case '7':
 990  990  #if 0
 991  991  /*
 992  992   * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
 993  993   * requires processing of the octal escapes both in strings and
 994  994   * regular expressions. The following code is disabled instead of
 995  995   * removed as back-referencing may be reintroduced in a future draft
 996  996   * of the standard.
 997  997   */
 998  998                                  /*
 999  999                                   * For regular expressions, we disallow
1000 1000                                   * \ooo to mean octal character, in favour
1001 1001                                   * of back referencing.
1002 1002                                   */
1003 1003                                  if (regx) {
1004 1004                                          *cp++ = '\\';
1005 1005                                          break;
1006 1006                                  }
1007 1007  #endif
1008 1008                                  max = 3;
1009 1009                                  n = 0;
1010 1010                                  do {
1011 1011                                          n = (n<<3) + c-'0';
1012 1012                                          if ((c = lexgetc()) > '7' || c < '0')
1013 1013                                                  break;
1014 1014                                  } while (--max);
1015 1015                                  lexungetc(c);
1016 1016                                  /*
1017 1017                                   * an octal escape sequence must have at least
1018 1018                                   * 2 digits after the backslash, otherwise
1019 1019                                   * it gets passed straight thru for possible
1020 1020                                   * use in backreferencing.
1021 1021                                   */
1022 1022                                  if (max == 3) {
1023 1023                                          *cp++ = '\\';
1024 1024                                          n += '0';
1025 1025                                  }
1026 1026                                  c = n;
1027 1027                                  break;
1028 1028  
1029 1029                          case '\n':
1030 1030                                  continue;
1031 1031  
1032 1032                          default:
1033 1033                                  if (c != endc || cmd_line_operand) {
1034 1034                                          *cp++ = '\\';
1035 1035                                          if (c == endc)
1036 1036                                                  lexungetc(c);
1037 1037                                  }
1038 1038                          }
1039 1039                  }
1040 1040                  if (c == WEOF)
1041 1041                          awkerr(regx ? eofre : eofstr);
1042 1042                  *cp++ = c;
1043 1043          }
1044 1044          *cp = '\0';
1045 1045          return (cp - linebuf);
1046 1046  }
1047 1047  
1048 1048  /*
1049 1049   * Build a regular expression NODE.
1050 1050   * Argument is the string holding the expression.
1051 1051   */
1052 1052  NODE *
1053 1053  renode(wchar_t *s)
1054 1054  {
1055 1055          NODE *np;
1056 1056          int n;
1057 1057  
1058 1058          np = emptynode(RE, 0);
1059 1059          np->n_left = np->n_right = NNULL;
1060 1060          if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
1061 1061                  int m;
1062 1062                  char *p;
1063 1063  
1064 1064                  m = REGWERROR(n, np->n_regexp, NULL, 0);
1065 1065                  p = (char *)emalloc(m);
1066 1066                  REGWERROR(n, np->n_regexp, p, m);
1067 1067                  awkerr("/%S/: %s", s, p);
1068 1068          }
1069 1069          return (np);
1070 1070  }
1071 1071  /*
1072 1072   * Get a character for the lexical analyser routine.
1073 1073   */
1074 1074  static wint_t
1075 1075  lexgetc()
1076 1076  {
1077 1077          wint_t c;
1078 1078          static char **files = &progfiles[0];
1079 1079  
1080 1080          if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
1081 1081                  ;
1082 1082          else {
1083 1083                  if (progptr != NULL) {
1084 1084                          if (proglen-- <= 0)
1085 1085                                  c = WEOF;
1086 1086                          else
1087 1087                                  c = *progptr++;
1088 1088                  } else {
1089 1089                          if (progfp != FNULL) {
1090 1090                                  if (progfp != stdin)
1091 1091                                          (void) fclose(progfp);
1092 1092                                  else
1093 1093                                          clearerr(progfp);
1094 1094                                  progfp = FNULL;
1095 1095                          }
1096 1096                          if (files < progfilep) {
1097 1097                                  filename = *files++;
1098 1098                                  lineno = 1;
1099 1099                                  if (filename[0] == '-' && filename[1] == '\0')
1100 1100                                          progfp = stdin;
1101 1101                                  else if ((progfp = fopen(filename, r))
1102 1102                                      == FNULL) {
1103 1103                                          (void) fprintf(stderr,
1104 1104                                  gettext("script file \"%s\""), filename);
1105 1105                                          exit(1);
1106 1106                                  }
1107 1107                                  c = fgetwc(progfp);
1108 1108                          }
1109 1109                  }
1110 1110          }
1111 1111          if (c == '\n')
1112 1112                  ++lineno;
1113 1113          if (conptr >= &context[NCONTEXT])
1114 1114                  conptr = &context[0];
1115 1115          if (c != WEOF)
1116 1116                  *conptr++ = c;
1117 1117          return (c);
1118 1118  }
1119 1119  
1120 1120  /*
1121 1121   * Return a character for lexical analyser.
1122 1122   * Only one returned character is (not enforced) legitimite.
1123 1123   */
1124 1124  static void
1125 1125  lexungetc(wint_t c)
1126 1126  {
1127 1127          if (c == '\n')
1128 1128                  --lineno;
1129 1129          if (c != WEOF) {
1130 1130                  if (conptr == &context[0])
1131 1131                          conptr = &context[NCONTEXT];
1132 1132                  *--conptr = '\0';
1133 1133          }
1134 1134          if (progfp != FNULL) {
1135 1135                  (void) ungetwc(c, progfp);
1136 1136                  return;
1137 1137          }
1138 1138          if (c == WEOF)
1139 1139                  return;
1140 1140          *--progptr = c;
1141 1141          proglen++;
1142 1142  }
1143 1143  
1144 1144  /*
1145 1145   * Syntax errors during parsing.
1146 1146   */
1147 1147  void
1148 1148  yyerror(char *s, ...)
1149 1149  {
1150 1150          if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
1151 1151                  if (lexlast == KEYWORD)
1152 1152                          awkerr(gettext("inadmissible use of reserved keyword"));
1153 1153                  else
1154 1154                          awkerr(gettext("attempt to redefine builtin function"));
1155 1155          awkerr(s);
1156 1156  }
1157 1157  
/*
 * Error routine for all awk errors.
 * Takes a printf-style format and arguments and hands them to awkierr()
 * with the errno indication switched off.
 */
/* ARGSUSED */
void
awkerr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(0, fmt, ap);
	va_end(ap);
}
1171 1171  
/*
 * Error routine like "awkerr" except that it prints out
 * a message that includes an errno-specific indication
 * (it calls awkierr() with the errno indication switched on).
 */
/* ARGSUSED */
void
awkperr(char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	awkierr(1, fmt, ap);
	va_end(ap);
}
1186 1186  
1187 1187  /*
1188 1188   * Common internal routine for awkerr, awkperr
1189 1189   */
1190 1190  static void
1191 1191  awkierr(int perr, char *fmt, va_list ap)
1192 1192  {
1193 1193          static char sep1[] = "\n>>>\t";
1194 1194          static char sep2[] = "\t<<<";
1195 1195          int saveerr = errno;
1196 1196  
1197 1197          (void) fprintf(stderr, "%s: ", _cmdname);
1198 1198          if (running) {
1199 1199                  (void) fprintf(stderr, gettext("line %u ("),
1200 1200                      curnode == NNULL ? 0 : curnode->n_lineno);
1201 1201                  if (phase == 0)
1202 1202                          (void) fprintf(stderr, "NR=%lld): ",
1203 1203                              (INT)exprint(varNR));
1204 1204                  else
1205 1205                          (void) fprintf(stderr, "%s): ",
1206 1206                              phase == BEGIN ? s_BEGIN : s_END);
1207 1207          } else if (lineno != 0) {
1208 1208                  (void) fprintf(stderr, gettext("file \"%s\": "), filename);
1209 1209                  (void) fprintf(stderr, gettext("line %u: "), lineno);
1210 1210          }
1211 1211          (void) vfprintf(stderr, gettext(fmt), ap);
1212 1212          if (perr == 1)
1213 1213                  (void) fprintf(stderr, ": %s", strerror(saveerr));
1214 1214          if (perr != 2 && !running) {
1215 1215                  wchar_t *cp;
1216 1216                  int n;
1217 1217                  int c;
1218 1218  
1219 1219                  (void) fprintf(stderr, gettext("  Context is:%s"), sep1);
1220 1220                  cp = conptr;
1221 1221                  n = NCONTEXT;
1222 1222                  do {
1223 1223                          if (cp >= &context[NCONTEXT])
1224 1224                                  cp = &context[0];
1225 1225                          if ((c = *cp++) != '\0')
1226 1226                                  (void) fputs(c == '\n' ? sep1 : toprint(c),
1227 1227                                      stderr);
1228 1228                  } while (--n != 0);
1229 1229                  (void) fputs(sep2, stderr);
1230 1230          }
1231 1231          (void) fprintf(stderr, "\n");
1232 1232          exit(1);
1233 1233  }
1234 1234  
1235 1235  wchar_t *
1236 1236  emalloc(unsigned n)
1237 1237  {
1238 1238          wchar_t *cp;
1239 1239  
1240 1240          if ((cp = malloc(n)) == NULL)
1241 1241                  awkerr(nomem);
1242 1242          return (cp);
1243 1243  }
1244 1244  
1245 1245  wchar_t *
1246 1246  erealloc(wchar_t *p, unsigned n)
1247 1247  {
1248 1248          wchar_t *cp;
1249 1249  
1250 1250          if ((cp = realloc(p, n)) == NULL)
1251 1251                  awkerr(nomem);
1252 1252          return (cp);
1253 1253  }
1254 1254  
1255 1255  
/*
 * usage message for awk
 * Writes the synopsis to stderr and returns 2.
 */
static int
usage(void)
{
	/*
	 * The (possibly translated) text is written with fputs() so that
	 * it is never interpreted as a printf format.
	 */
	(void) fputs(gettext(
"Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"       awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n"),
	    stderr);
	return (2);
}
1267 1267  
1268 1268  
/*
 * Convert a multibyte string to a wide string with mbstowcsdup().
 * The result is kept in a single static slot: the buffer handed out by
 * the previous call is freed first.
 */
static wchar_t *
mbconvert(char *str)
{
	static wchar_t *last = NULL;

	free(last);
	last = mbstowcsdup(str);
	return (last);
}
1278 1278  
/*
 * Convert a wide string to a multibyte string with wcstombsdup().
 * The result is kept in a single static slot: the buffer handed out by
 * the previous call is freed first.
 */
char *
mbunconvert(wchar_t *str)
{
	static char *last = NULL;

	free(last);
	last = wcstombsdup(str);
	return (last);
}
1288 1288  
1289 1289  /*
1290 1290   * Solaris port - following functions are typical MKS functions written
1291 1291   * to work for Solaris.
1292 1292   */
1293 1293  
/*
 * Return a newly allocated wide character copy of the multibyte
 * string `s', or NULL when memory cannot be obtained or the string
 * cannot be converted.  The caller frees the result.
 */
wchar_t *
mbstowcsdup(char *s)
{
	size_t n;
	wchar_t *w;

	/* one wide character per byte is the worst case */
	n = strlen(s) + 1;
	if ((w = malloc(n * sizeof (wchar_t))) == NULL)
		return (NULL);

	if (mbstowcs(w, s, n) == (size_t)-1) {
		int saverr = errno;

		/* do not leak the buffer on a conversion error */
		free(w);
		errno = saverr;
		return (NULL);
	}
	return (w);
}
1309 1309  
/*
 * Return a newly allocated multibyte copy of the wide string `w', or
 * NULL when memory cannot be obtained or the string cannot be converted
 * (errno from the conversion is preserved).  The caller frees the result.
 */
char *
wcstombsdup(wchar_t *w)
{
	size_t n;
	char *mb;
	char *shrunk;

	/* Fetch memory for worst case string length */
	n = (wcslen(w) + 1) * MB_CUR_MAX;
	if ((mb = malloc(n)) == NULL) {
		return (NULL);
	}

	/* Convert the string */
	if (wcstombs(mb, w, n) == (size_t)-1) {
		int saverr = errno;

		free(mb);
		errno = saverr;
		return (NULL);
	}

	/*
	 * Shrink the string down.  Should that fail the original buffer
	 * is still valid, so hand it out rather than losing it.
	 */
	if ((shrunk = realloc(mb, strlen(mb) + 1)) == NULL) {
		return (mb);
	}
	return (shrunk);
}
1338 1338  
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus the
 * entries for the remaining character values are from 1..32.
 */
static const char *const upe_ctrls[] =
{
	"^?",
	"^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
	"^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
	"^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
	"^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.  If
 * the character is printable, simply return it as the string.  If it is in
 * the range specified by table 5-101 in the UPE, return the corresponding
 * string.  Otherwise, return an octal escape sequence for each byte of its
 * multibyte form.  A character that cannot be converted to multibyte form
 * at all is shown as a backslash followed by its value in hexadecimal.
 *
 * The result lives in static storage (or in upe_ctrls[]) and is only
 * valid until the next call.
 */
static const char *
toprint(wchar_t c)
{
	int n, len;
	unsigned char *ptr;
	static char mbch[MB_LEN_MAX+1];
	static char buf[5 * MB_LEN_MAX + 1];

	if ((n = wctomb(mbch, c)) == -1) {
		/* Should never happen */
		(void) snprintf(buf, sizeof (buf), "\\%x", (unsigned int)c);
		return (buf);
	}
	mbch[n] = '\0';
	if (iswprint(c)) {
		return (mbch);
	} else if (c == 127) {
		return (upe_ctrls[0]);
	} else if ((unsigned long)c < 32) {
		/*
		 * Print as in Table 5-101 in the UPE.  The unsigned
		 * comparison keeps a negative value out of the table.
		 */
		return (upe_ctrls[c+1]);
	} else {
		/* Print as an octal escape sequence */
		buf[0] = '\0';
		for (len = 0, ptr = (unsigned char *) mbch; 0 < n; --n, ++ptr)
			len += sprintf(buf+len, "\\%03o", *ptr);
	}
	return (buf);
}
1388 1388  
/*
 * Translate a byte offset into the multibyte form of `astring' into the
 * corresponding index into the wide string itself.
 *
 * Wide characters are counted until their multibyte lengths add up to
 * at least `off' bytes or the end of the string is reached.  A character
 * that cannot be converted is counted as one byte.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
	const wchar_t *s = astring;
	int c = 0;
	char mb[MB_LEN_MAX];

	/*
	 * Stop at the terminator explicitly: wctomb() reports a length of
	 * 1 for L'\0', so its return value alone cannot end the walk.
	 */
	while (c < off && *s != L'\0') {
		int n;
		if ((n = wctomb(mb, *s)) == 0)
			break;
		if (n == -1)
			n = 1;
		c += n;
		s++;
	}

	return (s - astring);
}
1408 1408  
1409 1409  #define NREGHASH        64
1410 1410  #define NREGHOLD        1024    /* max number unused entries */
1411 1411  
1412 1412  static int      nregunref;
1413 1413  
1414 1414  struct reghashq {
1415 1415          struct qelem hq;
1416 1416          struct regcache *regcachep;
1417 1417  };
1418 1418  
1419 1419  struct regcache {
1420 1420          struct qelem    lq;
1421 1421          wchar_t *pattern;
1422 1422          regex_t re;
1423 1423          int     refcnt;
1424 1424          struct reghashq hash;
1425 1425  };
1426 1426  
1427 1427  static struct qelem reghash[NREGHASH], reglink;
1428 1428  
1429 1429  /*
1430 1430   * Generate a hash value of the given wchar string.
1431 1431   * The hashing method is similar to what Java does for strings.
1432 1432   */
1433 1433  static uint_t
1434 1434  regtxthash(const wchar_t *str)
1435 1435  {
1436 1436          int k = 0;
1437 1437  
1438 1438          while (*str != L'\0')
1439 1439                  k = (31 * k) + *str++;
1440 1440  
1441 1441          k += ~(k << 9);
1442 1442          k ^=  (k >> 14);
1443 1443          k +=  (k << 4);
1444 1444          k ^=  (k >> 10);
1445 1445  
1446 1446          return (k % NREGHASH);
1447 1447  }
1448 1448  
1449 1449  int
1450 1450  int_regwcomp(REGEXP *r, const wchar_t *pattern)
1451 1451  {
1452 1452          regex_t re;
1453 1453          char *mbpattern;
1454 1454          int ret;
1455 1455          uint_t key;
1456 1456          struct qelem *qp;
1457 1457          struct regcache *rcp;
1458 1458  
1459 1459          key = regtxthash(pattern);
1460 1460          for (qp = reghash[key].q_forw; qp != NULL; qp = qp->q_forw) {
1461 1461                  rcp = ((struct reghashq *)qp)->regcachep;
1462 1462                  if (*rcp->pattern == *pattern &&
1463 1463                      wcscmp(rcp->pattern, pattern) == 0)
1464 1464                          break;
1465 1465          }
1466 1466          if (qp != NULL) {
1467 1467                  /* update link. put this one at the beginning */
1468 1468                  if (rcp != (struct regcache *)reglink.q_forw) {
1469 1469                          remque(&rcp->lq);
1470 1470                          insque(&rcp->lq, &reglink);
1471 1471                  }
1472 1472                  if (rcp->refcnt == 0)
1473 1473                          nregunref--;    /* no longer unref'ed */
1474 1474                  rcp->refcnt++;
1475 1475                  *(struct regcache **)r = rcp;
1476 1476                  return (REG_OK);
1477 1477          }
1478 1478  
1479 1479          if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL)
1480 1480                  return (REG_ESPACE);
1481 1481  
1482 1482          ret = regcomp(&re, mbpattern, REG_EXTENDED);
1483 1483  
1484 1484          free(mbpattern);
1485 1485  
1486 1486          if (ret != REG_OK)
1487 1487                  return (ret);
1488 1488  
1489 1489          if ((rcp = malloc(sizeof (struct regcache))) == NULL)
1490 1490                  return (REG_ESPACE);
1491 1491          rcp->re = re;
1492 1492          if ((rcp->pattern = wsdup(pattern)) == NULL) {
1493 1493                  regfree(&re);
1494 1494                  free(rcp);
1495 1495                  return (REG_ESPACE);
1496 1496          }
1497 1497          rcp->refcnt = 1;
1498 1498          insque(&rcp->lq, &reglink);
1499 1499          insque(&rcp->hash.hq, &reghash[key]);
1500 1500          rcp->hash.regcachep = rcp;
1501 1501  
1502 1502          *(struct regcache **)r = rcp;
1503 1503          return (ret);
1504 1504  }
1505 1505  
1506 1506  void
1507 1507  int_regwfree(REGEXP r)
1508 1508  {
1509 1509          int     cnt;
1510 1510          struct qelem *qp, *nqp;
1511 1511          struct regcache *rcp;
1512 1512  
1513 1513          rcp = (struct regcache *)r;
1514 1514  
1515 1515          if (--rcp->refcnt != 0)
1516 1516                  return;
1517 1517  
1518 1518          /* this cache has no reference */
1519 1519          if (++nregunref < NREGHOLD)
1520 1520                  return;
1521 1521  
1522 1522          /*
1523 1523           * We've got too much unref'ed regex. Free half of least
1524 1524           * used regex.
1525 1525           */
1526 1526          cnt = 0;
1527 1527          for (qp = reglink.q_forw; qp != NULL; qp = nqp) {
1528 1528                  nqp = qp->q_forw;
1529 1529                  rcp = (struct regcache *)qp;
1530 1530                  if (rcp->refcnt != 0)
1531 1531                          continue;
1532 1532  
1533 1533                  /* free half of them */
1534 1534                  if (++cnt < (NREGHOLD / 2))
1535 1535                          continue;
1536 1536  
1537 1537                  /* detach and free */
1538 1538                  remque(&rcp->lq);
1539 1539                  remque(&rcp->hash.hq);
1540 1540  
1541 1541                  /* free up */
1542 1542                  free(rcp->pattern);
1543 1543                  regfree(&rcp->re);
1544 1544                  free(rcp);
1545 1545  
1546 1546                  nregunref--;
1547 1547          }
1548 1548  }
1549 1549  
1550 1550  size_t
1551 1551  int_regwerror(int errcode, REGEXP r, char *errbuf, size_t bufsiz)
1552 1552  {
1553 1553          struct regcache *rcp;
1554 1554  
1555 1555          rcp = (struct regcache *)r;
1556 1556          return (regerror(errcode, &rcp->re, errbuf, bufsiz));
1557 1557  }
1558 1558  
1559 1559  int
1560 1560  int_regwexec(REGEXP r,          /* compiled RE */
1561 1561      const wchar_t *astring,     /* subject string */
1562 1562      size_t nsub,                /* number of subexpressions */
1563 1563      int_regwmatch_t *sub,       /* subexpression pointers */
1564 1564      int flags)
1565 1565  {
1566 1566          char *mbs;
1567 1567          regmatch_t *mbsub = NULL;
1568 1568          int i;
1569 1569          struct regcache *rcp;
1570 1570  
1571 1571          if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
1572 1572                  return (REG_ESPACE);
1573 1573  
1574 1574          if (nsub > 0 && sub) {
1575 1575                  if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1576 1576                          return (REG_ESPACE);
1577 1577          }
1578 1578  
1579 1579          rcp = (struct regcache *)r;
1580 1580  
1581 1581          i = regexec(&rcp->re, mbs, nsub, mbsub, flags);
1582 1582  
1583 1583          /* Now, adjust the pointers/counts in sub */
1584 1584          if (i == REG_OK && nsub > 0 && mbsub) {
1585 1585                  int j, k;
1586 1586  
1587 1587                  for (j = 0; j < nsub; j++) {
1588 1588                          regmatch_t *ms = &mbsub[j];
1589 1589                          int_regwmatch_t *ws = &sub[j];
1590 1590  
1591 1591                          if ((k = ms->rm_so) >= 0) {
1592 1592                                  ws->rm_so = wcoff(astring, k);
1593 1593                                  ws->rm_sp = astring + ws->rm_so;
1594 1594                          }
1595 1595                          if ((k = ms->rm_eo) >= 0) {
1596 1596                                  ws->rm_eo = wcoff(astring, k);
1597 1597                                  ws->rm_ep = astring + ws->rm_eo;
1598 1598                          }
1599 1599                  }
1600 1600          }
1601 1601  
1602 1602          free(mbs);
1603 1603          if (mbsub)
1604 1604                  free(mbsub);
1605 1605          return (i);
1606 1606  }
1607 1607  
1608 1608  int
1609 1609  int_regwdosuba(REGEXP rp,       /* compiled RE: Pattern */
1610 1610      const wchar_t *rpl,         /* replacement string: /rpl/ */
1611 1611      const wchar_t *src,         /* source string */
1612 1612      wchar_t **dstp,             /* destination string */
1613 1613      int len,                    /* destination length */
1614 1614      int *globp)         /* IN: occurence, 0 for all; OUT: substitutions */
1615 1615  {
1616 1616          wchar_t *dst, *odst;
1617 1617          const wchar_t *ip, *xp;
1618 1618          wchar_t *op;
1619 1619          int i;
1620 1620          wchar_t c;
1621 1621          int glob, iglob = *globp, oglob = 0;
1622 1622  #define NSUB    10
1623 1623          int_regwmatch_t rm[NSUB], *rmp;
1624 1624          int flags;
1625 1625          wchar_t *end;
1626 1626          int regerr;
1627 1627  
1628 1628  /* handle overflow of dst. we need "i" more bytes */
1629 1629  #ifdef OVERFLOW
1630 1630  #undef OVERFLOW
1631 1631  #define OVERFLOW(i) { \
1632 1632                  int pos = op - dst; \
1633 1633                  dst = (wchar_t *)realloc(odst = dst, \
1634 1634                          (len += len + i) * sizeof (wchar_t)); \
1635 1635                  if (dst == NULL) \
1636 1636                          goto nospace; \

↓ open down ↓

1636 lines elided

↑ open up ↑

1637 1637                  op = dst + pos; \
1638 1638                  end = dst + len; \
1639 1639          }
1640 1640  #endif
1641 1641  
1642 1642          *dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
1643 1643          if (dst == NULL)
1644 1644                  return (REG_ESPACE);
1645 1645  
1646 1646          if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
1647      -                return (REG_EFATAL);
     1647 +                return (REG_BADPAT);
1648 1648  
1649 1649          glob = 0;       /* match count */
1650 1650          ip = src;       /* source position */
1651 1651          op = dst;       /* destination position */
1652 1652          end = dst + len;
1653 1653  
1654 1654          flags = 0;
1655 1655          while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1656 1656                  /* Copy text preceding match */
1657 1657                  if (op + (i = rm[0].rm_sp - ip) >= end)

1658 1658                          OVERFLOW(i)
1659 1659                  while (i--)
1660 1660                          *op++ = *ip++;
1661 1661  
1662 1662                  if (iglob == 0 || ++glob == iglob) {
1663 1663                          oglob++;
1664 1664                          xp = rpl;               /* do substitute */
1665 1665                  } else
1666 1666                          xp = L"&";              /* preserve text */
1667 1667  
1668 1668                  /* Perform replacement of matched substing */
1669 1669                  while ((c = *xp++) != '\0') {
1670 1670                          rmp = NULL;
1671 1671                          if (c == '&')
1672 1672                                  rmp = &rm[0];
1673 1673                          else if (c == '\\') {
1674 1674                                  if ('0' <= *xp && *xp <= '9')
1675 1675                                          rmp = &rm[*xp++ - '0'];
1676 1676                                  else if (*xp != '\0')
1677 1677                                          c = *xp++;
1678 1678                          }
1679 1679  
1680 1680                          if (rmp ==  NULL) {     /* Ordinary character. */
1681 1681                                  *op++ = c;
1682 1682                                  if (op >= end)
1683 1683                                          OVERFLOW(1)
1684 1684                          } else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1685 1685                                  ip = rmp->rm_sp;
1686 1686                                  if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1687 1687                                          OVERFLOW(i)
1688 1688                                  while (i--)
1689 1689                                          *op++ = *ip++;
1690 1690                          }
1691 1691                  }
1692 1692  
1693 1693                  ip = rm[0].rm_ep;
1694 1694                  if (*ip == '\0')        /* If at end break */
1695 1695                          break;
1696 1696                  else if (rm[0].rm_sp == rm[0].rm_ep) {
1697 1697                          /* If empty match copy next char */
1698 1698                          *op++ = *ip++;
1699 1699                          if (op >= end)
1700 1700                                  OVERFLOW(1)
1701 1701                  }
1702 1702                  flags = REG_NOTBOL;
1703 1703          }
1704 1704  
1705 1705          if (regerr != REG_OK && regerr != REG_NOMATCH)
1706 1706                  return (regerr);
1707 1707  
1708 1708          /* Copy rest of text */
1709 1709          if (op + (i =  wcslen(ip)) >= end)
1710 1710                  OVERFLOW(i)
1711 1711          while (i--)
1712 1712                  *op++ = *ip++;
1713 1713          *op++ = '\0';
1714 1714  
1715 1715          if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
1716 1716              sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1717 1717  nospace:
1718 1718                  free(odst);
1719 1719                  return (REG_ESPACE);
1720 1720          }
1721 1721  
1722 1722          *globp = oglob;
1723 1723  
1724 1724          return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1725 1725  }

↓ open down ↓

68 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX