Print this page
9083 replace regex implementation with tre
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/expr/compile.c
+++ new/usr/src/cmd/expr/compile.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License, Version 1.0 only
6 6 * (the "License"). You may not use this file except in compliance
7 7 * with the License.
8 8 *
9 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 10 * or http://www.opensolaris.org/os/licensing.
11 11 * See the License for the specific language governing permissions
12 12 * and limitations under the License.
13 13 *
14 14 * When distributing Covered Code, include this CDDL HEADER in each
15 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 16 * If applicable, add the following below this CDDL HEADER, with the
17 17 * fields enclosed by brackets "[]" replaced with your own identifying
18 18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 19 *
20 20 * CDDL HEADER END
21 21 */
22 22 /*
23 23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
30 30 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4
31 31 * porting aid. switches out to libgen compile/step if collation
32 32 * table not present.
33 33 *
↓ open down ↓ |
33 lines elided |
↑ open up ↑ |
34 34 * Goal is to work with vi and sed/ed.
35 35 * Returns expbuf in dhl format (encoding of first two bytes).
36 36 * Note also that this is profoundly single threaded. You
37 37 * cannot call compile twice with two separate search strings
38 38 * because the second call will wipe out the earlier stored string.
39 39 * This must be fixed, plus a general cleanup should be performed
40 40 * if this is to be integrated into libc.
41 41 *
42 42 */
43 43
44 -#pragma ident "%Z%%M% %I% %E% SMI"
45 -
46 44 #include <stdio.h>
47 45 #include <widec.h>
48 46 #include <sys/types.h>
49 47 #include <regex.h>
50 48 #include <locale.h>
51 49 #include <stdlib.h>
52 50 #include <locale.h>
53 51 #include <string.h>
54 52 #include <unistd.h>
55 53 #include <regexpr.h>
56 54
57 55 /*
58 56 * pseudo compile/step/advance global variables
59 57 */
60 58 extern int nbra;
61 59 extern char *locs; /* for stopping excess recursion */
62 60 extern char *loc1; /* 1st character which matched RE */
63 61 extern char *loc2; /* char after last char in matched RE */
64 62 extern char *braslist[]; /* start of nbra subexp */
65 63 extern char *braelist[]; /* end of nbra subexp */
66 64 extern int regerrno;
67 65 extern int reglength;
68 66
69 67 int regcomp_flags; /* interface to specify cflags for regcomp */
70 68
71 69 void regex_comp_free(void *a);
72 70 static int dhl_step(const char *str, const char *ep);
73 71 static int dhl_advance(const char *str, const char *ep);
74 72 static int map_errnos(int); /* Convert regcomp error */
75 73 static int dhl_doit(const char *, const regex_t *, const int flags);
76 74 static char * dhl_compile(const char *instr, char *ep, char *endbuf);
77 75
78 76 /*
79 77 * # of sub re's: NOTE: For now limit on bra list defined here
80 78 * but fix is to add maxbra define to regex.h
81 79 * One problem is that a bigger number is a performance hit since
82 80 * regexec() has a slow initialization loop that goes around SEPSIZE times
83 81 */
84 82 #define SEPSIZE 20
85 83 static regmatch_t rm[SEPSIZE]; /* ptr to list of RE matches */
86 84
87 85 /*
88 86 * Structure to contain dl encoded first two bytes for vi, plus hold two
89 87 * regex structures, one for advance and one for step.
90 88 */
91 89 static struct regex_comp {
92 90 char r_head[2]; /* Header for DL encoding for vi */
93 91 regex_t r_stp; /* For use by step */
94 92 regex_t r_adv; /* For use by advance */
95 93 } reg_comp;
96 94
97 95 /*
98 96 * global value for the size of a regex_comp structure:
99 97 */
100 98 size_t regexc_size = sizeof (reg_comp);
101 99
102 100
103 101 char *
104 102 compile(const char *instr, char *expbuf, char *endbuf)
105 103 {
106 104 return (dhl_compile(instr, expbuf, endbuf));
107 105 }
108 106
109 107 int
110 108 step(const char *instr, const char *expbuf)
111 109 {
112 110 return (dhl_step(instr, expbuf));
113 111 }
114 112
115 113 int
116 114 advance(const char *instr, const char *expbuf)
117 115 {
118 116 return (dhl_advance(instr, expbuf));
119 117 }
120 118
121 119
122 120 /*
123 121 * the compile and step routines here simulate the old libgen routines of
124 122 * compile/step Re: regexpr(3G). in order to do this, we must assume
125 123 * that expbuf[] consists of the following format:
126 124 * 1) the first two bytes consist of a special encoding - see below.
127 125 * 2) the next part is a regex_t used by regexec()/regcomp() for step
128 126 * 3) the final part is a regex_t used by regexec()/regcomp() for advance
129 127 *
130 128 * the special encoding of the first two bytes is referenced throughout
131 129 * vi. apparently expbuf[0] is set to:
132 130 * = 0 upon initialization
133 131 * = 1 if the first char of the RE is a ^
134 132 * = 0 if the first char of the RE isn't a ^
135 133 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression.
136 134 * this is apparently 0 if there's no RE.
137 135 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
138 136 * if there's at least 1 RE in the string.
139 137 * I say "apparently" as the code to compile()/step() is poorly written.
140 138 */
141 139 static char *
142 140 dhl_compile(instr, expbuf, endbuf)
143 141 const char *instr; /* the regular expression */
144 142 char *expbuf; /* where the compiled RE gets placed */
145 143 char *endbuf; /* ending addr of expbuf */
146 144 {
147 145 int rv;
148 146 int alloc = 0;
149 147 char adv_instr[4096]; /* PLENTY big temp buffer */
150 148 char *instrp; /* pattern passed to regcomp for advance */
151 149
152 150 if (*instr == (char) NULL) {
153 151 regerrno = 41;
154 152 return (NULL);
155 153 }
156 154
157 155 /*
158 156 * Check values of expbuf and endbuf
159 157 */
160 158 if (expbuf == NULL) {
161 159 if ((expbuf = malloc(regexc_size)) == NULL) {
162 160 regerrno = 50;
163 161 return (NULL);
164 162 }
165 163 memset(&reg_comp, 0, regexc_size);
166 164 alloc = 1;
167 165 endbuf = expbuf + regexc_size;
168 166 } else { /* Check if enough memory was allocated */
169 167 if (expbuf + regexc_size > endbuf) {
170 168 regerrno = 50;
171 169 return (NULL);
172 170 }
173 171 memcpy(&reg_comp, expbuf, regexc_size);
174 172 }
175 173
176 174 /*
177 175 * Clear global flags
178 176 */
179 177 nbra = 0;
180 178 regerrno = 0;
181 179
182 180 /*
183 181 * Free any data being held for previous search strings
184 182 */
185 183 regex_comp_free(&reg_comp);
186 184
187 185 /*
188 186 * We call regcomp twice, once to get a regex_t for use by step()
189 187 * and then again for use by advance()
190 188 */
191 189 if ((rv = regcomp(&reg_comp.r_stp, instr, regcomp_flags)) != 0) {
192 190 regerrno = map_errnos(rv); /* Convert regcomp error */
193 191 goto out;
194 192 }
195 193 /*
196 194 * To support advance, which assumes an implicit ^ to match at start
197 195 * of line we prepend a ^ to the pattern by copying to a temp buffer
198 196 */
199 197
200 198 if (instr[0] == '^')
201 199 instrp = (char *) instr; /* String already has leading ^ */
202 200 else {
203 201 adv_instr[0] = '^';
204 202 strncpy(&adv_instr[1], instr, 2048);
205 203 instrp = adv_instr;
206 204 }
207 205
208 206 if ((rv = regcomp(&reg_comp.r_adv, instrp, regcomp_flags)) != 0) {
209 207 regerrno = map_errnos(rv); /* Convert regcomp error */
210 208 goto out;
211 209 }
212 210
213 211 /*
214 212 * update global variables
215 213 */
216 214 nbra = (int) reg_comp.r_adv.re_nsub > 0 ?
217 215 (int) reg_comp.r_adv.re_nsub : 0;
218 216 regerrno = 0;
219 217
220 218 /*
221 219 * Set the header flags for use by vi
222 220 */
223 221 if (instr[0] == '^') /* if beginning of string, */
224 222 reg_comp.r_head[0] = 1; /* set special flag */
225 223 else
226 224 reg_comp.r_head[0] = 0; /* clear special flag */
227 225 /*
228 226 * note that for a single BRE, nbra will be 0 here.
229 227 * we're guaranteed that, at this point, a RE has been found.
230 228 */
231 229 reg_comp.r_head[1] = 1; /* set special flag */
232 230 /*
233 231 * Copy our reg_comp structure to expbuf
234 232 */
235 233 (void) memcpy(expbuf, (char *) &reg_comp, regexc_size);
236 234
237 235 out:
238 236 /*
239 237 * Return code from libgen regcomp with mods. Note weird return
240 238 * value - if space is malloc'd return pointer to start of space,
241 239 * if user provided their own space, return pointer to 1+last byte
242 240 * of that space.
243 241 */
244 242 if (regerrno != 0) {
245 243 if (alloc)
246 244 free(expbuf);
247 245 return (NULL);
248 246 }
249 247 reglength = regexc_size;
250 248
251 249 if (alloc)
252 250 return (expbuf);
253 251 else
254 252 return (expbuf + regexc_size);
255 253 }
256 254
257 255
258 256 /*
259 257 * dhl_step: step through a string until a RE match is found, or end of str
260 258 */
261 259 static int
262 260 dhl_step(str, ep)
263 261 const char *str; /* characters to be checked for a match */
264 262 const char *ep; /* compiled RE from dhl_compile() */
265 263 {
266 264 /*
267 265 * Check if we're passed a null ep
268 266 */
269 267 if (ep == NULL) {
270 268 regerrno = 41; /* No remembered search string error */
271 269 return (0);
272 270 }
273 271 /*
274 272 * Call common routine with r_stp (step) structure
275 273 */
276 274 return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp),
277 275 ((locs != NULL) ? REG_NOTBOL : 0)));
278 276 }
279 277
280 278 /*
281 279 * dhl_advance: implement advance
282 280 */
283 281 static int
284 282 dhl_advance(str, ep)
285 283 const char *str; /* characters to be checked for a match */
286 284 const char *ep; /* compiled RE from dhl_compile() */
287 285 {
288 286 int rv;
289 287 /*
290 288 * Check if we're passed a null ep
291 289 */
292 290 if (ep == NULL) {
293 291 regerrno = 41; /* No remembered search string error */
294 292 return (0);
295 293 }
296 294 /*
297 295 * Call common routine with r_adv (advance) structure
298 296 */
299 297 rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0);
300 298 loc1 = NULL; /* Clear it per the compile man page */
301 299 return (rv);
302 300 }
303 301
304 302 /*
305 303 * dhl_doit - common code for step and advance
306 304 */
307 305 static int
308 306 dhl_doit(str, rep, flags)
309 307 const char *str; /* characters to be checked for a match */
310 308 const regex_t *rep;
311 309 const int flags; /* flags to be passed to regexec directly */
312 310 {
313 311 int rv;
314 312 int i;
315 313 regmatch_t *prm; /* ptr to current regmatch_t */
316 314
317 315 /*
318 316 * Check if we're passed a null regex_t
319 317 */
320 318 if (rep == NULL) {
321 319 regerrno = 41; /* No remembered search string error */
322 320 return (0);
323 321 }
324 322
325 323 regerrno = 0;
326 324 prm = &rm[0];
327 325
328 326 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
329 327 if (rv == REG_NOMATCH)
330 328 return (0);
331 329 regerrno = map_errnos(rv);
332 330 return (0);
333 331 }
334 332
335 333 loc1 = (char *)str + prm->rm_so;
336 334 loc2 = (char *)str + prm->rm_eo;
337 335
338 336 /*
339 337 * Now we need to fill up the bra lists with all of the sub re's
340 338 * Note we subtract nsub -1, and preincrement prm.
341 339 */
342 340 for (i = 0; i <= rep->re_nsub; i++) {
343 341 prm++; /* XXX inc past first subexp */
344 342 braslist[i] = (char *)str + prm->rm_so;
345 343 braelist[i] = (char *)str + prm->rm_eo;
346 344 if (i >= SEPSIZE) {
347 345 regerrno = 50; /* regex overflow */
348 346 return (0);
349 347 }
350 348 }
351 349
352 350 /*
353 351 * Inverse logic, a zero from regexec - success, is a 1
354 352 * from advance/step.
355 353 */
356 354
357 355 return (rv == 0);
358 356 }
359 357
360 358
361 359 /*
362 360 * regerrno to compile/step error mapping:
363 361 * This is really a big compromise. Some errors don't map at all
364 362 * like regcomp error 15 is generated by both compile() error types
365 363 * 44 & 46. So which one should we map to?
↓ open down ↓ |
310 lines elided |
↑ open up ↑ |
366 364 * Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions
367 365 * To do your errors right use xregerr() to get the regcomp error
368 366 * string and print that.
369 367 *
370 368 * | regcomp/regexec | Compile/step/advance |
371 369 * +---------------------------------+--------------------------------------+
372 370 * 0 REG_OK Pattern matched 1 - Pattern matched
373 371 * 1 REG_NOMATCH No match 0 - Pattern didn't match
374 372 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err
375 373 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \.
376 - * 4 REG_ENEWLINE \n before end pattrn 36 - Illegal or missing delimiter.
377 - * 5 REG_ENSUB Over 9 \( \) pairs 43 - Too many \(
378 374 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range.
379 375 * 7 REG_EBRACK [ ] imbalance 49 - [ ] imbalance.
380 376 * 8 REG_EPAREN ( ) imbalance 42 - \(~\) imbalance.
381 377 * 9 REG_EBRACE \{ \} imbalance 45 - } expected after \.
382 378 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large.
383 379 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow.
384 380 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter.
385 381 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence
386 382 * 14 REG_BADPAT syntax error 50 - Regular expression overflow.
387 383 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\}
388 - * 16 REG_EFATAL internal error 50 - Regular expression overflow.
389 384 * 17 REG_ECHAR bad multibyte char 67 - illegal byte sequence
390 - * 18 REG_STACK stack overflow 50 - Regular expression overflow.
391 - * 19 REG_ENOSYS function not supported 50- Regular expression overflow.
392 385 *
393 386 * For reference here's the compile/step errno's. We don't generate
394 387 * 41 here - it's done earlier, nor 44 since we can't tell it from 46.
395 388 *
396 389 * 11 - Range endpoint too large.
397 390 * 16 - Bad number.
398 391 * 25 - ``\digit'' out of range.
399 392 * 36 - Illegal or missing delimiter.
400 393 * 41 - No remembered search string.
401 394 * 42 - \(~\) imbalance.
402 395 * 43 - Too many \(.
403 396 * 44 - More than 2 numbers given in "\{~\}"
404 397 * 45 - } expected after \.
405 398 * 46 - First number exceeds 2nd in "\{~\}"
406 399 * 49 - [ ] imbalance.
407 400 * 50 - Regular expression overflow.
408 401 */
409 402
↓ open down ↓ |
8 lines elided |
↑ open up ↑ |
410 403 static int
411 404 map_errnos(int Errno)
412 405 {
413 406 switch (Errno) {
414 407 case REG_ECOLLATE:
415 408 regerrno = 67;
416 409 break;
417 410 case REG_EESCAPE:
418 411 regerrno = 45;
419 412 break;
420 - case REG_ENEWLINE:
421 - regerrno = 36;
422 - break;
423 - case REG_ENSUB:
424 - regerrno = 43;
425 - break;
426 413 case REG_ESUBREG:
427 414 regerrno = 25;
428 415 break;
429 416 case REG_EBRACK:
430 417 regerrno = 49;
431 418 break;
432 419 case REG_EPAREN:
433 420 regerrno = 42;
434 421 break;
435 422 case REG_EBRACE:
436 423 regerrno = 45;
437 424 break;
438 425 case REG_ERANGE:
439 426 regerrno = 11;
440 427 break;
441 428 case REG_ESPACE:
442 429 regerrno = 50;
443 430 break;
444 431 case REG_BADRPT:
445 432 regerrno = 36;
↓ open down ↓ |
10 lines elided |
↑ open up ↑ |
446 433 break;
447 434 case REG_ECTYPE:
448 435 regerrno = 67;
449 436 break;
450 437 case REG_BADPAT:
451 438 regerrno = 50;
452 439 break;
453 440 case REG_BADBR:
454 441 regerrno = 46;
455 442 break;
456 - case REG_EFATAL:
457 - regerrno = 50;
458 - break;
459 443 case REG_ECHAR:
460 444 regerrno = 67;
461 445 break;
462 - case REG_STACK:
463 - regerrno = 50;
464 - break;
465 - case REG_ENOSYS:
466 - regerrno = 50;
467 - break;
468 446 default:
469 447 regerrno = 50;
470 448 break;
471 449 }
472 450 return (regerrno);
473 451 }
474 452
475 453 /*
476 454 * This is a routine to clean up the subtle substructure of the struct
477 455 * regex_comp type for use by clients of this module. Since the struct
478 456 * type is private, we use a generic interface, and trust the
479 457 * application to be damn sure that this operation is valid for the
480 458 * named memory.
481 459 */
482 460
483 461 void
484 462 regex_comp_free(void * a)
485 463 {
486 464 /*
487 465 * Free any data being held for previous search strings
488 466 */
489 467
490 468 if (((struct regex_comp *) a) == NULL) {
491 469 return;
492 470 }
493 471
494 472 regfree(&((struct regex_comp *)a)->r_stp);
495 473 regfree(&((struct regex_comp *)a)->r_adv);
496 474 }
↓ open down ↓ |
19 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX