1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 /* 29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g) 30 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4 31 * porting aid. switches out to libgen compile/step if collation 32 * table not present. 33 * 34 * Goal is to work with vi and sed/ed. 35 * Returns expbuf in dhl format (encoding of first two bytes). 36 * Note also that this is profoundly single threaded. You 37 * cannot call compile twice with two separate search strings 38 * because the second call will wipe out the earlier stored string. 39 * This must be fixed, plus a general cleanup should be performed 40 * if this is to be integrated into libc. 41 * 42 */ 43 44 #include <stdio.h> 45 #include <widec.h> 46 #include <sys/types.h> 47 #include <regex.h> 48 #include <locale.h> 49 #include <stdlib.h> 50 #include <locale.h> 51 #include <string.h> 52 #include <unistd.h> 53 #include <regexpr.h> 54 55 /* 56 * psuedo compile/step/advance global variables 57 */ 58 extern int nbra; 59 extern char *locs; /* for stopping execess recursion */ 60 extern char *loc1; /* 1st character which matched RE */ 61 extern char *loc2; /* char after lst char in matched RE */ 62 extern char *braslist[]; /* start of nbra subexp */ 63 extern char *braelist[]; /* end of nbra subexp */ 64 extern int regerrno; 65 extern int reglength; 66 67 int regcomp_flags; /* interface to specify cflags for regcomp */ 68 69 void regex_comp_free(void *a); 70 static int dhl_step(const char *str, const char *ep); 71 static int dhl_advance(const char *str, const char *ep); 72 static int map_errnos(int); /* Convert regcomp error */ 73 static int dhl_doit(const char *, const regex_t *, const int flags); 74 static char * dhl_compile(const char *instr, char *ep, char *endbuf); 75 76 /* 77 * # of sub re's: NOTE: For now limit on bra list defined here 78 * but fix is to add maxbra define to to regex.h 79 * One problem is that a bigger number is a performance hit since 80 * regexec() has a slow initialization loop that goes around SEPSIZE times 81 */ 82 #define SEPSIZE 20 83 static regmatch_t rm[SEPSIZE]; /* ptr to list of RE matches */ 84 85 /* 86 * Structure to contain dl encoded first two bytes for vi, plus hold two 87 * regex structures, one for advance and one for step. 88 */ 89 static struct regex_comp { 90 char r_head[2]; /* Header for DL encoding for vi */ 91 regex_t r_stp; /* For use by step */ 92 regex_t r_adv; /* For use by advance */ 93 } reg_comp; 94 95 /* 96 * global value for the size of a regex_comp structure: 97 */ 98 size_t regexc_size = sizeof (reg_comp); 99 100 101 char * 102 compile(const char *instr, char *expbuf, char *endbuf) 103 { 104 return (dhl_compile(instr, expbuf, endbuf)); 105 } 106 107 int 108 step(const char *instr, const char *expbuf) 109 { 110 return (dhl_step(instr, expbuf)); 111 } 112 113 int 114 advance(const char *instr, const char *expbuf) 115 { 116 return (dhl_advance(instr, expbuf)); 117 } 118 119 120 /* 121 * the compile and step routines here simulate the old libgen routines of 122 * compile/step Re: regexpr(3G). in order to do this, we must assume 123 * that expbuf[] consists of the following format: 124 * 1) the first two bytes consist of a special encoding - see below. 125 * 2) the next part is a regex_t used by regexec()/regcomp() for step 126 * 3) the final part is a regex_t used by regexec()/regcomp() for advance 127 * 128 * the special encoding of the first two bytes is referenced throughout 129 * vi. apparently expbuf[0] is set to: 130 * = 0 upon initialization 131 * = 1 if the first char of the RE is a ^ 132 * = 0 if the first char of the RE isn't a ^ 133 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression. 134 * this is apparently 0 if there's no RE. 135 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero 136 * if there's at least 1 RE in the string. 137 * I say "apparently" as the code to compile()/step() is poorly written. 138 */ 139 static char * 140 dhl_compile(instr, expbuf, endbuf) 141 const char *instr; /* the regular expression */ 142 char *expbuf; /* where the compiled RE gets placed */ 143 char *endbuf; /* ending addr of expbuf */ 144 { 145 int rv; 146 int alloc = 0; 147 char adv_instr[4096]; /* PLENTY big temp buffer */ 148 char *instrp; /* PLENTY big temp buffer */ 149 150 if (*instr == (char) NULL) { 151 regerrno = 41; 152 return (NULL); 153 } 154 155 /* 156 * Check values of expbuf and endbuf 157 */ 158 if (expbuf == NULL) { 159 if ((expbuf = malloc(regexc_size)) == NULL) { 160 regerrno = 50; 161 return (NULL); 162 } 163 memset(®_comp, 0, regexc_size); 164 alloc = 1; 165 endbuf = expbuf + regexc_size; 166 } else { /* Check if enough memory was allocated */ 167 if (expbuf + regexc_size > endbuf) { 168 regerrno = 50; 169 return (NULL); 170 } 171 memcpy(®_comp, expbuf, regexc_size); 172 } 173 174 /* 175 * Clear global flags 176 */ 177 nbra = 0; 178 regerrno = 0; 179 180 /* 181 * Free any data being held for previous search strings 182 */ 183 regex_comp_free(®_comp); 184 185 /* 186 * We call regcomp twice, once to get a regex_t for use by step() 187 * and then again with for use by advance() 188 */ 189 if ((rv = regcomp(®_comp.r_stp, instr, regcomp_flags)) != 0) { 190 regerrno = map_errnos(rv); /* Convert regcomp error */ 191 goto out; 192 } 193 /* 194 * To support advance, which assumes an implicit ^ to match at start 195 * of line we prepend a ^ to the pattern by copying to a temp buffer 196 */ 197 198 if (instr[0] == '^') 199 instrp = (char *) instr; /* String already has leading ^ */ 200 else { 201 adv_instr[0] = '^'; 202 strncpy(&adv_instr[1], instr, 2048); 203 instrp = adv_instr; 204 } 205 206 if ((rv = regcomp(®_comp.r_adv, instrp, regcomp_flags)) != 0) { 207 regerrno = map_errnos(rv); /* Convert regcomp error */ 208 goto out; 209 } 210 211 /* 212 * update global variables 213 */ 214 nbra = (int) reg_comp.r_adv.re_nsub > 0 ? 215 (int) reg_comp.r_adv.re_nsub : 0; 216 regerrno = 0; 217 218 /* 219 * Set the header flags for use by vi 220 */ 221 if (instr[0] == '^') /* if beginning of string, */ 222 reg_comp.r_head[0] = 1; /* set special flag */ 223 else 224 reg_comp.r_head[0] = 0; /* clear special flag */ 225 /* 226 * note that for a single BRE, nbra will be 0 here. 227 * we're guaranteed that, at this point, a RE has been found. 228 */ 229 reg_comp.r_head[1] = 1; /* set special flag */ 230 /* 231 * Copy our reg_comp structure to expbuf 232 */ 233 (void) memcpy(expbuf, (char *) ®_comp, regexc_size); 234 235 out: 236 /* 237 * Return code from libgen regcomp with mods. Note weird return 238 * value - if space is malloc'd return pointer to start of space, 239 * if user provided their own space, return pointer to 1+last byte 240 * of that space. 241 */ 242 if (regerrno != 0) { 243 if (alloc) 244 free(expbuf); 245 return (NULL); 246 } 247 reglength = regexc_size; 248 249 if (alloc) 250 return (expbuf); 251 else 252 return (expbuf + regexc_size); 253 } 254 255 256 /* 257 * dhl_step: step through a string until a RE match is found, or end of str 258 */ 259 static int 260 dhl_step(str, ep) 261 const char *str; /* characters to be checked for a match */ 262 const char *ep; /* compiled RE from dhl_compile() */ 263 { 264 /* 265 * Check if we're passed a null ep 266 */ 267 if (ep == NULL) { 268 regerrno = 41; /* No remembered search string error */ 269 return (0); 270 } 271 /* 272 * Call common routine with r_stp (step) structure 273 */ 274 return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp), 275 ((locs != NULL) ? REG_NOTBOL : 0))); 276 } 277 278 /* 279 * dhl_advance: implement advance 280 */ 281 static int 282 dhl_advance(str, ep) 283 const char *str; /* characters to be checked for a match */ 284 const char *ep; /* compiled RE from dhl_compile() */ 285 { 286 int rv; 287 /* 288 * Check if we're passed a null ep 289 */ 290 if (ep == NULL) { 291 regerrno = 41; /* No remembered search string error */ 292 return (0); 293 } 294 /* 295 * Call common routine with r_adv (advance) structure 296 */ 297 rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0); 298 loc1 = NULL; /* Clear it per the compile man page */ 299 return (rv); 300 } 301 302 /* 303 * dhl_doit - common code for step and advance 304 */ 305 static int 306 dhl_doit(str, rep, flags) 307 const char *str; /* characters to be checked for a match */ 308 const regex_t *rep; 309 const int flags; /* flags to be passed to regexec directly */ 310 { 311 int rv; 312 int i; 313 regmatch_t *prm; /* ptr to current regmatch_t */ 314 315 /* 316 * Check if we're passed a null regex_t 317 */ 318 if (rep == NULL) { 319 regerrno = 41; /* No remembered search string error */ 320 return (0); 321 } 322 323 regerrno = 0; 324 prm = &rm[0]; 325 326 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) { 327 if (rv == REG_NOMATCH) 328 return (0); 329 regerrno = map_errnos(rv); 330 return (0); 331 } 332 333 loc1 = (char *)str + prm->rm_so; 334 loc2 = (char *)str + prm->rm_eo; 335 336 /* 337 * Now we need to fill up the bra lists with all of the sub re's 338 * Note we subtract nsub -1, and preincrement prm. 339 */ 340 for (i = 0; i <= rep->re_nsub; i++) { 341 prm++; /* XXX inc past first subexp */ 342 braslist[i] = (char *)str + prm->rm_so; 343 braelist[i] = (char *)str + prm->rm_eo; 344 if (i >= SEPSIZE) { 345 regerrno = 50; /* regex overflow */ 346 return (0); 347 } 348 } 349 350 /* 351 * Inverse logic, a zero from regexec - success, is a 1 352 * from advance/step. 353 */ 354 355 return (rv == 0); 356 } 357 358 359 /* 360 * regerrno to compile/step error mapping: 361 * This is really a big compromise. Some errors don't map at all 362 * like regcomp error 15 is generated by both compile() error types 363 * 44 & 46. So which one should we map to? 364 * Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions 365 * To do your errors right use xregerr() to get the regcomp error 366 * string and print that. 367 * 368 * | regcomp/regexec | Compile/step/advance | 369 * +---------------------------------+--------------------------------------+ 370 * 0 REG_OK Pattern matched 1 - Pattern matched 371 * 1 REG_NOMATCH No match 0 - Pattern didn't match 372 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err 373 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \. 374 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range. 375 * 7 REG_EBRACK [ ] inbalance 49 - [ ] imbalance. 376 * 8 REG_EPAREN ( ) inbalance 42 - \(~\) imbalance. 377 * 9 REG_EBRACE \{ \} inbalance 45 - } expected after \. 378 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large. 379 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow. 380 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter. 381 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence 382 * 14 REG_BADPAT syntax error 50 - Regular expression overflow. 383 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\} 384 * 17 REG_ECHAR bad mulitbyte char 67 - illegal byte sequence 385 * 386 * For reference here's the compile/step errno's. We don't generate 387 * 41 here - it's done earlier, nor 44 since we can't tell if from 46. 388 * 389 * 11 - Range endpoint too large. 390 * 16 - Bad number. 391 * 25 - ``\digit'' out of range. 392 * 36 - Illegal or missing delimiter. 393 * 41 - No remembered search string. 394 * 42 - \(~\) imbalance. 395 * 43 - Too many \(. 396 * 44 - More than 2 numbers given in "\{~\}" 397 * 45 - } expected after \. 398 * 46 - First number exceeds 2nd in "\{~\}" 399 * 49 - [ ] imbalance. 400 * 50 - Regular expression overflow. 401 */ 402 403 static int 404 map_errnos(int Errno) 405 { 406 switch (Errno) { 407 case REG_ECOLLATE: 408 regerrno = 67; 409 break; 410 case REG_EESCAPE: 411 regerrno = 45; 412 break; 413 case REG_ESUBREG: 414 regerrno = 25; 415 break; 416 case REG_EBRACK: 417 regerrno = 49; 418 break; 419 case REG_EPAREN: 420 regerrno = 42; 421 break; 422 case REG_EBRACE: 423 regerrno = 45; 424 break; 425 case REG_ERANGE: 426 regerrno = 11; 427 break; 428 case REG_ESPACE: 429 regerrno = 50; 430 break; 431 case REG_BADRPT: 432 regerrno = 36; 433 break; 434 case REG_ECTYPE: 435 regerrno = 67; 436 break; 437 case REG_BADPAT: 438 regerrno = 50; 439 break; 440 case REG_BADBR: 441 regerrno = 46; 442 break; 443 case REG_ECHAR: 444 regerrno = 67; 445 break; 446 default: 447 regerrno = 50; 448 break; 449 } 450 return (regerrno); 451 } 452 453 /* 454 * This is a routine to clean up the subtle substructure of the struct 455 * regex_comp type for use by clients of this module. Since the struct 456 * type is private, we use a generic interface, and trust the 457 * application to be damn sure that this operation is valid for the 458 * named memory. 459 */ 460 461 void 462 regex_comp_free(void * a) 463 { 464 /* 465 * Free any data being held for previous search strings 466 */ 467 468 if (((struct regex_comp *) a) == NULL) { 469 return; 470 } 471 472 regfree(&((struct regex_comp *)a)->r_stp); 473 regfree(&((struct regex_comp *)a)->r_adv); 474 }