1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 1995-2003 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
 *      using the regcomp(3c), regexec(3c) interfaces. This is an XCU4
 *      porting aid. It switches out to libgen compile/step if the
 *      collation table is not present.
  33  *
  34  *      Goal is to work with vi and sed/ed.
  35  *      Returns expbuf in dhl format (encoding of first two bytes).
  36  *      Note also that this is profoundly single threaded.  You
  37  *      cannot call compile twice with two separate search strings
  38  *      because the second call will wipe out the earlier stored string.
  39  *      This must be fixed, plus a general cleanup should be performed
  40  *      if this is to be integrated into libc.
  41  *
  42  */
  43 
  44 #pragma ident   "%Z%%M% %I%     %E% SMI"
  45 
  46 #include <stdio.h>
  47 #include <widec.h>
  48 #include <sys/types.h>
  49 #include <regex.h>
  50 #include <locale.h>
  51 #include <stdlib.h>
  52 #include <locale.h>
  53 #include <string.h>
  54 #include <unistd.h>
  55 #include <regexpr.h>
  56 
  57 /*
  58  * psuedo compile/step/advance global variables
  59  */
  60 extern int nbra;
  61 extern char *locs;              /* for stopping execess recursion */
  62 extern char *loc1;              /* 1st character which matched RE */
  63 extern char *loc2;              /* char after lst char in matched RE */
  64 extern char *braslist[];        /* start of nbra subexp  */
  65 extern char *braelist[];        /* end of nbra subexp    */
  66 extern int regerrno;
  67 extern int reglength;
  68 
  69 int regcomp_flags;              /* interface to specify cflags for regcomp */
  70 
  71 void regex_comp_free(void *a);
  72 static int dhl_step(const char *str, const char *ep);
  73 static int dhl_advance(const char *str, const char *ep);
  74 static int map_errnos(int);             /* Convert regcomp error */
  75 static int dhl_doit(const char *, const regex_t *, const int flags);
  76 static char * dhl_compile(const char *instr, char *ep, char *endbuf);
  77 
  78 /*
  79  * # of sub re's: NOTE: For now limit on bra list defined here
  80  * but fix is to add maxbra define to to regex.h
  81  * One problem is that a bigger number is a performance hit since
  82  * regexec() has a slow initialization loop that goes around SEPSIZE times
  83  */
  84 #define SEPSIZE 20
  85 static regmatch_t rm[SEPSIZE];          /* ptr to list of RE matches */
  86 
  87 /*
  88  * Structure to contain dl encoded first two bytes for vi, plus hold two
  89  * regex structures, one for advance and one for step.
  90  */
  91 static struct regex_comp {
  92         char    r_head[2];              /* Header for DL encoding for vi */
  93         regex_t r_stp;                  /* For use by step */
  94         regex_t r_adv;                  /* For use by advance */
  95 } reg_comp;
  96 
  97 /*
  98  * global value for the size of a regex_comp structure:
  99  */
 100 size_t regexc_size = sizeof (reg_comp);
 101 
 102 
 103 char *
 104 compile(const char *instr, char *expbuf, char *endbuf)
 105 {
 106         return (dhl_compile(instr, expbuf, endbuf));
 107 }
 108 
 109 int
 110 step(const char *instr, const char *expbuf)
 111 {
 112         return (dhl_step(instr, expbuf));
 113 }
 114 
 115 int
 116 advance(const char *instr, const char *expbuf)
 117 {
 118         return (dhl_advance(instr, expbuf));
 119 }
 120 
 121 
 122 /*
 123  * the compile and step routines here simulate the old libgen routines of
 124  * compile/step Re: regexpr(3G). in order to do this, we must assume
 125  * that expbuf[] consists of the following format:
 126  *      1) the first two bytes consist of a special encoding - see below.
 127  *      2) the next part is a regex_t used by regexec()/regcomp() for step
 128  *      3) the final part is a regex_t used by regexec()/regcomp() for advance
 129  *
 130  * the special encoding of the first two bytes is referenced throughout
 131  * vi. apparently expbuf[0] is set to:
 132  *      = 0 upon initialization
 133  *      = 1 if the first char of the RE is a ^
 134  *      = 0 if the first char of the RE isn't a ^
 135  * and expbuf[1-35+]    = bitmap of the type of RE chars in the expression.
 136  * this is apparently 0 if there's no RE.
 137  * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
 138  * if there's at least 1 RE in the string.
 139  * I say "apparently" as the code to compile()/step() is poorly written.
 140  */
 141 static char *
 142 dhl_compile(instr, expbuf, endbuf)
 143 const char *instr;              /* the regular expression               */
 144 char *expbuf;                   /* where the compiled RE gets placed    */
 145 char *endbuf;                   /* ending addr of expbuf                */
 146 {
 147         int rv;
 148         int alloc = 0;
 149         char adv_instr[4096];   /* PLENTY big temp buffer */
 150         char *instrp;           /* PLENTY big temp buffer */
 151 
 152         if (*instr == (char) NULL) {
 153                 regerrno = 41;
 154                 return (NULL);
 155         }
 156 
 157         /*
 158          * Check values of expbuf and endbuf
 159          */
 160         if (expbuf == NULL) {
 161                 if ((expbuf = malloc(regexc_size)) == NULL) {
 162                         regerrno = 50;
 163                         return (NULL);
 164                 }
 165                 memset(&reg_comp, 0, regexc_size);
 166                 alloc = 1;
 167                 endbuf = expbuf + regexc_size;
 168         } else {                /* Check if enough memory was allocated */
 169                 if (expbuf + regexc_size > endbuf) {
 170                         regerrno = 50;
 171                         return (NULL);
 172                 }
 173                 memcpy(&reg_comp, expbuf, regexc_size);
 174         }
 175 
 176         /*
 177          * Clear global flags
 178          */
 179         nbra = 0;
 180         regerrno = 0;
 181 
 182         /*
 183          * Free any data being held for previous search strings
 184          */
 185         regex_comp_free(&reg_comp);
 186 
 187         /*
 188          * We call regcomp twice, once to get a regex_t for use by step()
 189          * and then again with for use by advance()
 190          */
 191         if ((rv = regcomp(&reg_comp.r_stp, instr, regcomp_flags)) != 0) {
 192                 regerrno = map_errnos(rv);      /* Convert regcomp error */
 193                 goto out;
 194         }
 195         /*
 196          * To support advance, which assumes an implicit ^ to match at start
 197          * of line we prepend a ^ to the pattern by copying to a temp buffer
 198          */
 199 
 200         if (instr[0] == '^')
 201                 instrp = (char *) instr; /* String already has leading ^ */
 202         else {
 203                 adv_instr[0] = '^';
 204                 strncpy(&adv_instr[1], instr, 2048);
 205                 instrp = adv_instr;
 206         }
 207 
 208         if ((rv = regcomp(&reg_comp.r_adv, instrp, regcomp_flags)) != 0) {
 209                 regerrno = map_errnos(rv);      /* Convert regcomp error */
 210                 goto out;
 211         }
 212 
 213         /*
 214          * update global variables
 215          */
 216         nbra = (int) reg_comp.r_adv.re_nsub > 0 ?
 217             (int) reg_comp.r_adv.re_nsub : 0;
 218         regerrno = 0;
 219 
 220         /*
 221          * Set the header flags for use by vi
 222          */
 223         if (instr[0] == '^')            /* if beginning of string,      */
 224                 reg_comp.r_head[0] = 1; /* set special flag             */
 225         else
 226                 reg_comp.r_head[0] = 0; /* clear special flag           */
 227         /*
 228          * note that for a single BRE, nbra will be 0 here.
 229          * we're guaranteed that, at this point, a RE has been found.
 230          */
 231         reg_comp.r_head[1] = 1; /* set special flag             */
 232         /*
 233          * Copy our reg_comp structure to expbuf
 234          */
 235         (void) memcpy(expbuf, (char *) &reg_comp, regexc_size);
 236 
 237 out:
 238         /*
 239          * Return code from libgen regcomp with mods.  Note weird return
 240          * value - if space is malloc'd return pointer to start of space,
 241          * if user provided their own space, return pointer to 1+last byte
 242          * of that space.
 243          */
 244         if (regerrno != 0) {
 245                 if (alloc)
 246                         free(expbuf);
 247                 return (NULL);
 248         }
 249         reglength = regexc_size;
 250 
 251         if (alloc)
 252                 return (expbuf);
 253         else
 254                 return (expbuf + regexc_size);
 255 }
 256 
 257 
 258 /*
 259  * dhl_step: step through a string until a RE match is found, or end of str
 260  */
 261 static int
 262 dhl_step(str, ep)
 263 const char *str;                /* characters to be checked for a match */
 264 const char *ep;                 /* compiled RE from dhl_compile()       */
 265 {
 266         /*
 267          * Check if we're passed a null ep
 268          */
 269         if (ep == NULL) {
 270                 regerrno = 41;  /* No remembered search string error */
 271                 return (0);
 272         }
 273         /*
 274          * Call common routine with r_stp (step) structure
 275          */
 276         return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp),
 277             ((locs != NULL) ? REG_NOTBOL : 0)));
 278 }
 279 
 280 /*
 281  * dhl_advance: implement advance
 282  */
 283 static int
 284 dhl_advance(str, ep)
 285 const char *str;                /* characters to be checked for a match */
 286 const char *ep;                 /* compiled RE from dhl_compile()       */
 287 {
 288         int rv;
 289         /*
 290          * Check if we're passed a null ep
 291          */
 292         if (ep == NULL) {
 293                 regerrno = 41;  /* No remembered search string error */
 294                 return (0);
 295         }
 296         /*
 297          * Call common routine with r_adv (advance) structure
 298          */
 299         rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0);
 300         loc1 = NULL;            /* Clear it per the compile man page */
 301         return (rv);
 302 }
 303 
 304 /*
 305  * dhl_doit - common code for step and advance
 306  */
 307 static int
 308 dhl_doit(str, rep, flags)
 309 const char *str;                /* characters to be checked for a match */
 310 const regex_t *rep;
 311 const int flags;                /* flags to be passed to regexec directly */
 312 {
 313         int rv;
 314         int i;
 315         regmatch_t *prm;        /* ptr to current regmatch_t            */
 316 
 317         /*
 318          * Check if we're passed a null regex_t
 319          */
 320         if (rep == NULL) {
 321                 regerrno = 41;  /* No remembered search string error */
 322                 return (0);
 323         }
 324 
 325         regerrno = 0;
 326         prm = &rm[0];
 327 
 328         if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
 329                 if (rv == REG_NOMATCH)
 330                         return (0);
 331                 regerrno = map_errnos(rv);
 332                 return (0);
 333         }
 334 
 335         loc1 = (char *)str + prm->rm_so;
 336         loc2 = (char *)str + prm->rm_eo;
 337 
 338         /*
 339          * Now we need to fill up the bra lists with all of the sub re's
 340          * Note we subtract nsub -1, and preincrement prm.
 341          */
 342         for (i = 0; i <= rep->re_nsub; i++) {
 343                 prm++;          /* XXX inc past first subexp */
 344                 braslist[i] = (char *)str + prm->rm_so;
 345                 braelist[i] = (char *)str + prm->rm_eo;
 346                 if (i >= SEPSIZE) {
 347                         regerrno = 50;  /* regex overflow */
 348                         return (0);
 349                 }
 350         }
 351 
 352         /*
 353          * Inverse logic, a zero from regexec - success, is a 1
 354          * from advance/step.
 355          */
 356 
 357         return (rv == 0);
 358 }
 359 
 360 
 361 /*
 362  *      regerrno to compile/step error mapping:
 363  *      This is really a big compromise.  Some errors don't map at all
 364  *      like regcomp error 15 is generated by both compile() error types
 365  *      44 & 46.  So which one should we map to?
 366  *      Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions
 367  *      To do your errors right use xregerr() to get the regcomp error
 368  *      string and print that.
 369  *
 370  * |    regcomp/regexec              |  Compile/step/advance                |
 371  * +---------------------------------+--------------------------------------+
 372  * 0 REG_OK       Pattern matched       1  - Pattern matched
 373  * 1 REG_NOMATCH  No match              0  - Pattern didn't match
 374  * 2 REG_ECOLLATE Bad collation elmnt.  67 - Returned by compile on mbtowc err
 375  * 3 REG_EESCAPE  trailing \ in patrn   45 - } expected after \.
 376  * 4 REG_ENEWLINE \n before end pattrn  36 - Illegal or missing delimiter.
 377  * 5 REG_ENSUB    Over 9 \( \) pairs    43 - Too many \(
 378  * 6 REG_ESUBREG  Bad number in \[0-9]  25 - ``\digit'' out of range.
 379  * 7 REG_EBRACK   [ ] inbalance         49 - [ ] imbalance.
 380  * 8 REG_EPAREN   ( ) inbalance         42 - \(~\) imbalance.
 381  * 9 REG_EBRACE   \{ \} inbalance       45 - } expected after \.
 382  * 10 REG_ERANGE  bad range endpoint    11 - Range endpoint too large.
 383  * 11 REG_ESPACE  no memory for pattern 50 - Regular expression overflow.
 384  * 12 REG_BADRPT  invalid repetition    36 - Illegal or missing delimiter.
 385  * 13 REG_ECTYPE  invalid char-class    67 - illegal byte sequence
 386  * 14 REG_BADPAT  syntax error          50 - Regular expression overflow.
 387  * 15 REG_BADBR   \{ \} contents bad    46 - First number exceeds 2nd in \{~\}
 388  * 16 REG_EFATAL  internal error        50 - Regular expression overflow.
 389  * 17 REG_ECHAR   bad mulitbyte char    67 - illegal byte sequence
 390  * 18 REG_STACK   stack overflow        50 - Regular expression overflow.
 391  * 19 REG_ENOSYS  function not supported 50- Regular expression overflow.
 392  *
 393  *      For reference here's the compile/step errno's. We don't generate
 394  *      41 here - it's done earlier, nor 44 since we can't tell if from 46.
 395  *
 396  *      11 - Range endpoint too large.
 397  *      16 - Bad number.
 398  *      25 - ``\digit'' out of range.
 399  *      36 - Illegal or missing delimiter.
 400  *      41 - No remembered search string.
 401  *      42 - \(~\) imbalance.
 402  *      43 - Too many \(.
 403  *      44 - More than 2 numbers given in "\{~\}"
 404  *      45 - } expected after \.
 405  *      46 - First number exceeds 2nd in "\{~\}"
 406  *      49 - [ ] imbalance.
 407  *      50 - Regular expression overflow.
 408  */
 409 
 410 static int
 411 map_errnos(int Errno)
 412 {
 413         switch (Errno) {
 414         case REG_ECOLLATE:
 415                 regerrno = 67;
 416                 break;
 417         case REG_EESCAPE:
 418                 regerrno = 45;
 419                 break;
 420         case REG_ENEWLINE:
 421                 regerrno = 36;
 422                 break;
 423         case REG_ENSUB:
 424                 regerrno = 43;
 425                 break;
 426         case REG_ESUBREG:
 427                 regerrno = 25;
 428                 break;
 429         case REG_EBRACK:
 430                 regerrno = 49;
 431                 break;
 432         case REG_EPAREN:
 433                 regerrno = 42;
 434                 break;
 435         case REG_EBRACE:
 436                 regerrno = 45;
 437                 break;
 438         case REG_ERANGE:
 439                 regerrno = 11;
 440                 break;
 441         case REG_ESPACE:
 442                 regerrno = 50;
 443                 break;
 444         case REG_BADRPT:
 445                 regerrno = 36;
 446                 break;
 447         case REG_ECTYPE:
 448                 regerrno = 67;
 449                 break;
 450         case REG_BADPAT:
 451                 regerrno = 50;
 452                 break;
 453         case REG_BADBR:
 454                 regerrno = 46;
 455                 break;
 456         case REG_EFATAL:
 457                 regerrno = 50;
 458                 break;
 459         case REG_ECHAR:
 460                 regerrno = 67;
 461                 break;
 462         case REG_STACK:
 463                 regerrno = 50;
 464                 break;
 465         case REG_ENOSYS:
 466                 regerrno = 50;
 467                 break;
 468         default:
 469                 regerrno = 50;
 470                 break;
 471         }
 472         return (regerrno);
 473 }
 474 
 475 /*
 476  *  This is a routine to clean up the subtle substructure of the struct
 477  *  regex_comp type for use by clients of this module.  Since the struct
 478  *  type is private, we use a generic interface, and trust the
 479  *  application to be damn sure that this operation is valid for the
 480  *  named memory.
 481  */
 482 
 483 void
 484 regex_comp_free(void * a)
 485 {
 486         /*
 487          * Free any data being held for previous search strings
 488          */
 489 
 490         if (((struct regex_comp *) a) == NULL) {
 491                 return;
 492         }
 493 
 494         regfree(&((struct regex_comp *)a)->r_stp);
 495         regfree(&((struct regex_comp *)a)->r_adv);
 496 }