1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 /* 29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g) 30 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4 31 * porting aid. switches out to libgen compile/step if collation 32 * table not present. 33 * 34 * Goal is to work with vi and sed/ed. 35 * Returns expbuf in dhl format (encoding of first two bytes). 36 * Note also that this is profoundly single threaded. You 37 * cannot call compile twice with two separate search strings 38 * because the second call will wipe out the earlier stored string. 39 * This must be fixed, plus a general cleanup should be performed 40 * if this is to be integrated into libc. 41 * 42 */ 43 44 #pragma ident "%Z%%M% %I% %E% SMI" 45 46 #include <stdio.h> 47 #include <widec.h> 48 #include <sys/types.h> 49 #include <regex.h> 50 #include <locale.h> 51 #include <stdlib.h> 52 #include <locale.h> 53 #include <string.h> 54 #include <unistd.h> 55 #include <regexpr.h> 56 57 /* 58 * psuedo compile/step/advance global variables 59 */ 60 extern int nbra; 61 extern char *locs; /* for stopping execess recursion */ 62 extern char *loc1; /* 1st character which matched RE */ 63 extern char *loc2; /* char after lst char in matched RE */ 64 extern char *braslist[]; /* start of nbra subexp */ 65 extern char *braelist[]; /* end of nbra subexp */ 66 extern int regerrno; 67 extern int reglength; 68 69 int regcomp_flags; /* interface to specify cflags for regcomp */ 70 71 void regex_comp_free(void *a); 72 static int dhl_step(const char *str, const char *ep); 73 static int dhl_advance(const char *str, const char *ep); 74 static int map_errnos(int); /* Convert regcomp error */ 75 static int dhl_doit(const char *, const regex_t *, const int flags); 76 static char * dhl_compile(const char *instr, char *ep, char *endbuf); 77 78 /* 79 * # of sub re's: NOTE: For now limit on bra list defined here 80 * but fix is to add maxbra define to to regex.h 81 * One problem is that a bigger number is a performance hit since 82 * regexec() has a slow initialization loop that goes around SEPSIZE times 83 */ 84 #define SEPSIZE 20 85 static regmatch_t rm[SEPSIZE]; /* ptr to list of RE matches */ 86 87 /* 88 * Structure to contain dl encoded first two bytes for vi, plus hold two 89 * regex structures, one for advance and one for step. 90 */ 91 static struct regex_comp { 92 char r_head[2]; /* Header for DL encoding for vi */ 93 regex_t r_stp; /* For use by step */ 94 regex_t r_adv; /* For use by advance */ 95 } reg_comp; 96 97 /* 98 * global value for the size of a regex_comp structure: 99 */ 100 size_t regexc_size = sizeof (reg_comp); 101 102 103 char * 104 compile(const char *instr, char *expbuf, char *endbuf) 105 { 106 return (dhl_compile(instr, expbuf, endbuf)); 107 } 108 109 int 110 step(const char *instr, const char *expbuf) 111 { 112 return (dhl_step(instr, expbuf)); 113 } 114 115 int 116 advance(const char *instr, const char *expbuf) 117 { 118 return (dhl_advance(instr, expbuf)); 119 } 120 121 122 /* 123 * the compile and step routines here simulate the old libgen routines of 124 * compile/step Re: regexpr(3G). in order to do this, we must assume 125 * that expbuf[] consists of the following format: 126 * 1) the first two bytes consist of a special encoding - see below. 127 * 2) the next part is a regex_t used by regexec()/regcomp() for step 128 * 3) the final part is a regex_t used by regexec()/regcomp() for advance 129 * 130 * the special encoding of the first two bytes is referenced throughout 131 * vi. apparently expbuf[0] is set to: 132 * = 0 upon initialization 133 * = 1 if the first char of the RE is a ^ 134 * = 0 if the first char of the RE isn't a ^ 135 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression. 136 * this is apparently 0 if there's no RE. 137 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero 138 * if there's at least 1 RE in the string. 139 * I say "apparently" as the code to compile()/step() is poorly written. 140 */ 141 static char * 142 dhl_compile(instr, expbuf, endbuf) 143 const char *instr; /* the regular expression */ 144 char *expbuf; /* where the compiled RE gets placed */ 145 char *endbuf; /* ending addr of expbuf */ 146 { 147 int rv; 148 int alloc = 0; 149 char adv_instr[4096]; /* PLENTY big temp buffer */ 150 char *instrp; /* PLENTY big temp buffer */ 151 152 if (*instr == (char) NULL) { 153 regerrno = 41; 154 return (NULL); 155 } 156 157 /* 158 * Check values of expbuf and endbuf 159 */ 160 if (expbuf == NULL) { 161 if ((expbuf = malloc(regexc_size)) == NULL) { 162 regerrno = 50; 163 return (NULL); 164 } 165 memset(®_comp, 0, regexc_size); 166 alloc = 1; 167 endbuf = expbuf + regexc_size; 168 } else { /* Check if enough memory was allocated */ 169 if (expbuf + regexc_size > endbuf) { 170 regerrno = 50; 171 return (NULL); 172 } 173 memcpy(®_comp, expbuf, regexc_size); 174 } 175 176 /* 177 * Clear global flags 178 */ 179 nbra = 0; 180 regerrno = 0; 181 182 /* 183 * Free any data being held for previous search strings 184 */ 185 regex_comp_free(®_comp); 186 187 /* 188 * We call regcomp twice, once to get a regex_t for use by step() 189 * and then again with for use by advance() 190 */ 191 if ((rv = regcomp(®_comp.r_stp, instr, regcomp_flags)) != 0) { 192 regerrno = map_errnos(rv); /* Convert regcomp error */ 193 goto out; 194 } 195 /* 196 * To support advance, which assumes an implicit ^ to match at start 197 * of line we prepend a ^ to the pattern by copying to a temp buffer 198 */ 199 200 if (instr[0] == '^') 201 instrp = (char *) instr; /* String already has leading ^ */ 202 else { 203 adv_instr[0] = '^'; 204 strncpy(&adv_instr[1], instr, 2048); 205 instrp = adv_instr; 206 } 207 208 if ((rv = regcomp(®_comp.r_adv, instrp, regcomp_flags)) != 0) { 209 regerrno = map_errnos(rv); /* Convert regcomp error */ 210 goto out; 211 } 212 213 /* 214 * update global variables 215 */ 216 nbra = (int) reg_comp.r_adv.re_nsub > 0 ? 217 (int) reg_comp.r_adv.re_nsub : 0; 218 regerrno = 0; 219 220 /* 221 * Set the header flags for use by vi 222 */ 223 if (instr[0] == '^') /* if beginning of string, */ 224 reg_comp.r_head[0] = 1; /* set special flag */ 225 else 226 reg_comp.r_head[0] = 0; /* clear special flag */ 227 /* 228 * note that for a single BRE, nbra will be 0 here. 229 * we're guaranteed that, at this point, a RE has been found. 230 */ 231 reg_comp.r_head[1] = 1; /* set special flag */ 232 /* 233 * Copy our reg_comp structure to expbuf 234 */ 235 (void) memcpy(expbuf, (char *) ®_comp, regexc_size); 236 237 out: 238 /* 239 * Return code from libgen regcomp with mods. Note weird return 240 * value - if space is malloc'd return pointer to start of space, 241 * if user provided their own space, return pointer to 1+last byte 242 * of that space. 243 */ 244 if (regerrno != 0) { 245 if (alloc) 246 free(expbuf); 247 return (NULL); 248 } 249 reglength = regexc_size; 250 251 if (alloc) 252 return (expbuf); 253 else 254 return (expbuf + regexc_size); 255 } 256 257 258 /* 259 * dhl_step: step through a string until a RE match is found, or end of str 260 */ 261 static int 262 dhl_step(str, ep) 263 const char *str; /* characters to be checked for a match */ 264 const char *ep; /* compiled RE from dhl_compile() */ 265 { 266 /* 267 * Check if we're passed a null ep 268 */ 269 if (ep == NULL) { 270 regerrno = 41; /* No remembered search string error */ 271 return (0); 272 } 273 /* 274 * Call common routine with r_stp (step) structure 275 */ 276 return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp), 277 ((locs != NULL) ? REG_NOTBOL : 0))); 278 } 279 280 /* 281 * dhl_advance: implement advance 282 */ 283 static int 284 dhl_advance(str, ep) 285 const char *str; /* characters to be checked for a match */ 286 const char *ep; /* compiled RE from dhl_compile() */ 287 { 288 int rv; 289 /* 290 * Check if we're passed a null ep 291 */ 292 if (ep == NULL) { 293 regerrno = 41; /* No remembered search string error */ 294 return (0); 295 } 296 /* 297 * Call common routine with r_adv (advance) structure 298 */ 299 rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0); 300 loc1 = NULL; /* Clear it per the compile man page */ 301 return (rv); 302 } 303 304 /* 305 * dhl_doit - common code for step and advance 306 */ 307 static int 308 dhl_doit(str, rep, flags) 309 const char *str; /* characters to be checked for a match */ 310 const regex_t *rep; 311 const int flags; /* flags to be passed to regexec directly */ 312 { 313 int rv; 314 int i; 315 regmatch_t *prm; /* ptr to current regmatch_t */ 316 317 /* 318 * Check if we're passed a null regex_t 319 */ 320 if (rep == NULL) { 321 regerrno = 41; /* No remembered search string error */ 322 return (0); 323 } 324 325 regerrno = 0; 326 prm = &rm[0]; 327 328 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) { 329 if (rv == REG_NOMATCH) 330 return (0); 331 regerrno = map_errnos(rv); 332 return (0); 333 } 334 335 loc1 = (char *)str + prm->rm_so; 336 loc2 = (char *)str + prm->rm_eo; 337 338 /* 339 * Now we need to fill up the bra lists with all of the sub re's 340 * Note we subtract nsub -1, and preincrement prm. 341 */ 342 for (i = 0; i <= rep->re_nsub; i++) { 343 prm++; /* XXX inc past first subexp */ 344 braslist[i] = (char *)str + prm->rm_so; 345 braelist[i] = (char *)str + prm->rm_eo; 346 if (i >= SEPSIZE) { 347 regerrno = 50; /* regex overflow */ 348 return (0); 349 } 350 } 351 352 /* 353 * Inverse logic, a zero from regexec - success, is a 1 354 * from advance/step. 355 */ 356 357 return (rv == 0); 358 } 359 360 361 /* 362 * regerrno to compile/step error mapping: 363 * This is really a big compromise. Some errors don't map at all 364 * like regcomp error 15 is generated by both compile() error types 365 * 44 & 46. So which one should we map to? 366 * Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions 367 * To do your errors right use xregerr() to get the regcomp error 368 * string and print that. 369 * 370 * | regcomp/regexec | Compile/step/advance | 371 * +---------------------------------+--------------------------------------+ 372 * 0 REG_OK Pattern matched 1 - Pattern matched 373 * 1 REG_NOMATCH No match 0 - Pattern didn't match 374 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err 375 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \. 376 * 4 REG_ENEWLINE \n before end pattrn 36 - Illegal or missing delimiter. 377 * 5 REG_ENSUB Over 9 \( \) pairs 43 - Too many \( 378 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range. 379 * 7 REG_EBRACK [ ] inbalance 49 - [ ] imbalance. 380 * 8 REG_EPAREN ( ) inbalance 42 - \(~\) imbalance. 381 * 9 REG_EBRACE \{ \} inbalance 45 - } expected after \. 382 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large. 383 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow. 384 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter. 385 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence 386 * 14 REG_BADPAT syntax error 50 - Regular expression overflow. 387 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\} 388 * 16 REG_EFATAL internal error 50 - Regular expression overflow. 389 * 17 REG_ECHAR bad mulitbyte char 67 - illegal byte sequence 390 * 18 REG_STACK stack overflow 50 - Regular expression overflow. 391 * 19 REG_ENOSYS function not supported 50- Regular expression overflow. 392 * 393 * For reference here's the compile/step errno's. We don't generate 394 * 41 here - it's done earlier, nor 44 since we can't tell if from 46. 395 * 396 * 11 - Range endpoint too large. 397 * 16 - Bad number. 398 * 25 - ``\digit'' out of range. 399 * 36 - Illegal or missing delimiter. 400 * 41 - No remembered search string. 401 * 42 - \(~\) imbalance. 402 * 43 - Too many \(. 403 * 44 - More than 2 numbers given in "\{~\}" 404 * 45 - } expected after \. 405 * 46 - First number exceeds 2nd in "\{~\}" 406 * 49 - [ ] imbalance. 407 * 50 - Regular expression overflow. 408 */ 409 410 static int 411 map_errnos(int Errno) 412 { 413 switch (Errno) { 414 case REG_ECOLLATE: 415 regerrno = 67; 416 break; 417 case REG_EESCAPE: 418 regerrno = 45; 419 break; 420 case REG_ENEWLINE: 421 regerrno = 36; 422 break; 423 case REG_ENSUB: 424 regerrno = 43; 425 break; 426 case REG_ESUBREG: 427 regerrno = 25; 428 break; 429 case REG_EBRACK: 430 regerrno = 49; 431 break; 432 case REG_EPAREN: 433 regerrno = 42; 434 break; 435 case REG_EBRACE: 436 regerrno = 45; 437 break; 438 case REG_ERANGE: 439 regerrno = 11; 440 break; 441 case REG_ESPACE: 442 regerrno = 50; 443 break; 444 case REG_BADRPT: 445 regerrno = 36; 446 break; 447 case REG_ECTYPE: 448 regerrno = 67; 449 break; 450 case REG_BADPAT: 451 regerrno = 50; 452 break; 453 case REG_BADBR: 454 regerrno = 46; 455 break; 456 case REG_EFATAL: 457 regerrno = 50; 458 break; 459 case REG_ECHAR: 460 regerrno = 67; 461 break; 462 case REG_STACK: 463 regerrno = 50; 464 break; 465 case REG_ENOSYS: 466 regerrno = 50; 467 break; 468 default: 469 regerrno = 50; 470 break; 471 } 472 return (regerrno); 473 } 474 475 /* 476 * This is a routine to clean up the subtle substructure of the struct 477 * regex_comp type for use by clients of this module. Since the struct 478 * type is private, we use a generic interface, and trust the 479 * application to be damn sure that this operation is valid for the 480 * named memory. 481 */ 482 483 void 484 regex_comp_free(void * a) 485 { 486 /* 487 * Free any data being held for previous search strings 488 */ 489 490 if (((struct regex_comp *) a) == NULL) { 491 return; 492 } 493 494 regfree(&((struct regex_comp *)a)->r_stp); 495 regfree(&((struct regex_comp *)a)->r_adv); 496 }