1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /*
29 * xcompile, xstep, xadvance - simulate compile(3g), step(3g), advance(3g)
30 * using regcomp(3c), regexec(3c) interfaces. This is an XCU4
31 * porting aid. switches out to libgen compile/step if collation
32 * table not present.
33 *
34 * Goal is to work with vi and sed/ed.
35 * Returns expbuf in dhl format (encoding of first two bytes).
36 * Note also that this is profoundly single threaded. You
37 * cannot call compile twice with two separate search strings
38 * because the second call will wipe out the earlier stored string.
39 * This must be fixed, plus a general cleanup should be performed
40 * if this is to be integrated into libc.
41 *
42 */
43
44 #include <stdio.h>
45 #include <widec.h>
46 #include <sys/types.h>
47 #include <regex.h>
48 #include <locale.h>
49 #include <stdlib.h>
50 #include <locale.h>
51 #include <string.h>
52 #include <unistd.h>
53 #include <regexpr.h>
54
/*
 * Pseudo compile/step/advance global variables.  Everything declared
 * extern here is defined outside this file; regcomp_flags is defined here.
 */
extern int	nbra;		/* number of subexpressions in the RE */
extern char	*locs;		/* for stopping excess recursion */
extern char	*loc1;		/* 1st character which matched RE */
extern char	*loc2;		/* char after last char in matched RE */
extern char	*braslist[];	/* start of nbra subexp */
extern char	*braelist[];	/* end of nbra subexp */
extern int	regerrno;	/* compile/step/advance error number */
extern int	reglength;	/* size of the compiled RE */

int		regcomp_flags;	/* interface to specify cflags for regcomp */
68
/* Releases both compiled expressions held in a regex_comp buffer. */
void regex_comp_free(void *a);

/* File-local workers behind compile(), step() and advance(). */
static char *dhl_compile(const char *instr, char *expbuf, char *endbuf);
static int dhl_step(const char *str, const char *ep);
static int dhl_advance(const char *str, const char *ep);
static int dhl_doit(const char *str, const regex_t *rep, const int flags);
static int map_errnos(int Errno);	/* Convert regcomp error */
75
/*
 * Number of sub-REs.  NOTE: for now the limit on the bra list is defined
 * here, but the fix is to add a maxbra define to regex.h.
 * One problem is that a bigger number is a performance hit, since
 * regexec() has a slow initialization loop that goes around SEPSIZE times.
 */
#define	SEPSIZE	20
static regmatch_t rm[SEPSIZE];	/* match offsets filled in by regexec() */
84
/*
 * Layout of a compiled expression: the two DL-encoded header bytes that
 * vi looks at, followed by two regex structures, one for step and one for
 * advance.  reg_comp is the single working copy used by this file.
 */
static struct regex_comp {
	char	r_head[2];	/* Header for DL encoding for vi */
	regex_t	r_stp;		/* For use by step */
	regex_t	r_adv;		/* For use by advance */
} reg_comp;

/*
 * Global value for the size of a regex_comp structure.
 */
size_t regexc_size = sizeof (struct regex_comp);
99
100
/*
 * compile: public entry point.  All of the work is delegated to
 * dhl_compile(), and its result is handed back unchanged.
 */
char *
compile(const char *instr, char *expbuf, char *endbuf)
{
	char *result = dhl_compile(instr, expbuf, endbuf);

	return (result);
}
106
/*
 * step: public entry point.  All of the work is delegated to dhl_step(),
 * and its result is handed back unchanged.
 */
int
step(const char *instr, const char *expbuf)
{
	int matched = dhl_step(instr, expbuf);

	return (matched);
}
112
/*
 * advance: public entry point.  All of the work is delegated to
 * dhl_advance(), and its result is handed back unchanged.
 */
int
advance(const char *instr, const char *expbuf)
{
	int matched = dhl_advance(instr, expbuf);

	return (matched);
}
118
119
120 /*
121 * the compile and step routines here simulate the old libgen routines of
122 * compile/step Re: regexpr(3G). in order to do this, we must assume
123 * that expbuf[] consists of the following format:
124 * 1) the first two bytes consist of a special encoding - see below.
125 * 2) the next part is a regex_t used by regexec()/regcomp() for step
126 * 3) the final part is a regex_t used by regexec()/regcomp() for advance
127 *
128 * the special encoding of the first two bytes is referenced throughout
129 * vi. apparently expbuf[0] is set to:
130 * = 0 upon initialization
131 * = 1 if the first char of the RE is a ^
132 * = 0 if the first char of the RE isn't a ^
133 * and expbuf[1-35+] = bitmap of the type of RE chars in the expression.
134 * this is apparently 0 if there's no RE.
135 * Here, we use expbuf[0] in a similar fashion; and expbuf[1] is non-zero
136 * if there's at least 1 RE in the string.
137 * I say "apparently" as the code to compile()/step() is poorly written.
138 */
139 static char *
140 dhl_compile(instr, expbuf, endbuf)
141 const char *instr; /* the regular expression */
142 char *expbuf; /* where the compiled RE gets placed */
143 char *endbuf; /* ending addr of expbuf */
144 {
145 int rv;
146 int alloc = 0;
147 char adv_instr[4096]; /* PLENTY big temp buffer */
148 char *instrp; /* PLENTY big temp buffer */
149
150 if (*instr == (char) NULL) {
151 regerrno = 41;
152 return (NULL);
153 }
154
155 /*
156 * Check values of expbuf and endbuf
157 */
158 if (expbuf == NULL) {
159 if ((expbuf = malloc(regexc_size)) == NULL) {
160 regerrno = 50;
161 return (NULL);
162 }
163 memset(®_comp, 0, regexc_size);
164 alloc = 1;
165 endbuf = expbuf + regexc_size;
166 } else { /* Check if enough memory was allocated */
167 if (expbuf + regexc_size > endbuf) {
168 regerrno = 50;
169 return (NULL);
170 }
171 memcpy(®_comp, expbuf, regexc_size);
172 }
173
174 /*
175 * Clear global flags
176 */
177 nbra = 0;
178 regerrno = 0;
179
180 /*
181 * Free any data being held for previous search strings
182 */
183 regex_comp_free(®_comp);
184
185 /*
186 * We call regcomp twice, once to get a regex_t for use by step()
187 * and then again with for use by advance()
188 */
189 if ((rv = regcomp(®_comp.r_stp, instr, regcomp_flags)) != 0) {
190 regerrno = map_errnos(rv); /* Convert regcomp error */
191 goto out;
192 }
193 /*
194 * To support advance, which assumes an implicit ^ to match at start
195 * of line we prepend a ^ to the pattern by copying to a temp buffer
196 */
197
198 if (instr[0] == '^')
199 instrp = (char *) instr; /* String already has leading ^ */
200 else {
201 adv_instr[0] = '^';
202 strncpy(&adv_instr[1], instr, 2048);
203 instrp = adv_instr;
204 }
205
206 if ((rv = regcomp(®_comp.r_adv, instrp, regcomp_flags)) != 0) {
207 regerrno = map_errnos(rv); /* Convert regcomp error */
208 goto out;
209 }
210
211 /*
212 * update global variables
213 */
214 nbra = (int) reg_comp.r_adv.re_nsub > 0 ?
215 (int) reg_comp.r_adv.re_nsub : 0;
216 regerrno = 0;
217
218 /*
219 * Set the header flags for use by vi
220 */
221 if (instr[0] == '^') /* if beginning of string, */
222 reg_comp.r_head[0] = 1; /* set special flag */
223 else
224 reg_comp.r_head[0] = 0; /* clear special flag */
225 /*
226 * note that for a single BRE, nbra will be 0 here.
227 * we're guaranteed that, at this point, a RE has been found.
228 */
229 reg_comp.r_head[1] = 1; /* set special flag */
230 /*
231 * Copy our reg_comp structure to expbuf
232 */
233 (void) memcpy(expbuf, (char *) ®_comp, regexc_size);
234
235 out:
236 /*
237 * Return code from libgen regcomp with mods. Note weird return
238 * value - if space is malloc'd return pointer to start of space,
239 * if user provided their own space, return pointer to 1+last byte
240 * of that space.
241 */
242 if (regerrno != 0) {
243 if (alloc)
244 free(expbuf);
245 return (NULL);
246 }
247 reglength = regexc_size;
248
249 if (alloc)
250 return (expbuf);
251 else
252 return (expbuf + regexc_size);
253 }
254
255
256 /*
257 * dhl_step: step through a string until a RE match is found, or end of str
258 */
259 static int
260 dhl_step(str, ep)
261 const char *str; /* characters to be checked for a match */
262 const char *ep; /* compiled RE from dhl_compile() */
263 {
264 /*
265 * Check if we're passed a null ep
266 */
267 if (ep == NULL) {
268 regerrno = 41; /* No remembered search string error */
269 return (0);
270 }
271 /*
272 * Call common routine with r_stp (step) structure
273 */
274 return (dhl_doit(str, &(((struct regex_comp *) ep)->r_stp),
275 ((locs != NULL) ? REG_NOTBOL : 0)));
276 }
277
278 /*
279 * dhl_advance: implement advance
280 */
281 static int
282 dhl_advance(str, ep)
283 const char *str; /* characters to be checked for a match */
284 const char *ep; /* compiled RE from dhl_compile() */
285 {
286 int rv;
287 /*
288 * Check if we're passed a null ep
289 */
290 if (ep == NULL) {
291 regerrno = 41; /* No remembered search string error */
292 return (0);
293 }
294 /*
295 * Call common routine with r_adv (advance) structure
296 */
297 rv = dhl_doit(str, &(((struct regex_comp *) ep)->r_adv), 0);
298 loc1 = NULL; /* Clear it per the compile man page */
299 return (rv);
300 }
301
302 /*
303 * dhl_doit - common code for step and advance
304 */
305 static int
306 dhl_doit(str, rep, flags)
307 const char *str; /* characters to be checked for a match */
308 const regex_t *rep;
309 const int flags; /* flags to be passed to regexec directly */
310 {
311 int rv;
312 int i;
313 regmatch_t *prm; /* ptr to current regmatch_t */
314
315 /*
316 * Check if we're passed a null regex_t
317 */
318 if (rep == NULL) {
319 regerrno = 41; /* No remembered search string error */
320 return (0);
321 }
322
323 regerrno = 0;
324 prm = &rm[0];
325
326 if ((rv = regexec(rep, str, SEPSIZE, prm, flags)) != REG_OK) {
327 if (rv == REG_NOMATCH)
328 return (0);
329 regerrno = map_errnos(rv);
330 return (0);
331 }
332
333 loc1 = (char *)str + prm->rm_so;
334 loc2 = (char *)str + prm->rm_eo;
335
336 /*
337 * Now we need to fill up the bra lists with all of the sub re's
338 * Note we subtract nsub -1, and preincrement prm.
339 */
340 for (i = 0; i <= rep->re_nsub; i++) {
341 prm++; /* XXX inc past first subexp */
342 braslist[i] = (char *)str + prm->rm_so;
343 braelist[i] = (char *)str + prm->rm_eo;
344 if (i >= SEPSIZE) {
345 regerrno = 50; /* regex overflow */
346 return (0);
347 }
348 }
349
350 /*
351 * Inverse logic, a zero from regexec - success, is a 1
352 * from advance/step.
353 */
354
355 return (rv == 0);
356 }
357
358
359 /*
360 * regerrno to compile/step error mapping:
361 * This is really a big compromise. Some errors don't map at all
362 * like regcomp error 15 is generated by both compile() error types
363 * 44 & 46. So which one should we map to?
364 * Note REG_ESUB Can't happen- 9 is no longer max num of subexpressions
365 * To do your errors right use xregerr() to get the regcomp error
366 * string and print that.
367 *
368 * | regcomp/regexec | Compile/step/advance |
369 * +---------------------------------+--------------------------------------+
370 * 0 REG_OK Pattern matched 1 - Pattern matched
371 * 1 REG_NOMATCH No match 0 - Pattern didn't match
372 * 2 REG_ECOLLATE Bad collation elmnt. 67 - Returned by compile on mbtowc err
373 * 3 REG_EESCAPE trailing \ in patrn 45 - } expected after \.
374 * 6 REG_ESUBREG Bad number in \[0-9] 25 - ``\digit'' out of range.
375 * 7 REG_EBRACK [ ] inbalance 49 - [ ] imbalance.
376 * 8 REG_EPAREN ( ) inbalance 42 - \(~\) imbalance.
377 * 9 REG_EBRACE \{ \} inbalance 45 - } expected after \.
378 * 10 REG_ERANGE bad range endpoint 11 - Range endpoint too large.
379 * 11 REG_ESPACE no memory for pattern 50 - Regular expression overflow.
380 * 12 REG_BADRPT invalid repetition 36 - Illegal or missing delimiter.
381 * 13 REG_ECTYPE invalid char-class 67 - illegal byte sequence
382 * 14 REG_BADPAT syntax error 50 - Regular expression overflow.
383 * 15 REG_BADBR \{ \} contents bad 46 - First number exceeds 2nd in \{~\}
384 * 17 REG_ECHAR bad mulitbyte char 67 - illegal byte sequence
385 *
386 * For reference here's the compile/step errno's. We don't generate
387 * 41 here - it's done earlier, nor 44 since we can't tell if from 46.
388 *
389 * 11 - Range endpoint too large.
390 * 16 - Bad number.
391 * 25 - ``\digit'' out of range.
392 * 36 - Illegal or missing delimiter.
393 * 41 - No remembered search string.
394 * 42 - \(~\) imbalance.
395 * 43 - Too many \(.
396 * 44 - More than 2 numbers given in "\{~\}"
397 * 45 - } expected after \.
398 * 46 - First number exceeds 2nd in "\{~\}"
399 * 49 - [ ] imbalance.
400 * 50 - Regular expression overflow.
401 */
402
403 static int
404 map_errnos(int Errno)
405 {
406 switch (Errno) {
407 case REG_ECOLLATE:
408 regerrno = 67;
409 break;
410 case REG_EESCAPE:
411 regerrno = 45;
412 break;
413 case REG_ESUBREG:
414 regerrno = 25;
415 break;
416 case REG_EBRACK:
417 regerrno = 49;
418 break;
419 case REG_EPAREN:
420 regerrno = 42;
421 break;
422 case REG_EBRACE:
423 regerrno = 45;
424 break;
425 case REG_ERANGE:
426 regerrno = 11;
427 break;
428 case REG_ESPACE:
429 regerrno = 50;
430 break;
431 case REG_BADRPT:
432 regerrno = 36;
433 break;
434 case REG_ECTYPE:
435 regerrno = 67;
436 break;
437 case REG_BADPAT:
438 regerrno = 50;
439 break;
440 case REG_BADBR:
441 regerrno = 46;
442 break;
443 case REG_ECHAR:
444 regerrno = 67;
445 break;
446 default:
447 regerrno = 50;
448 break;
449 }
450 return (regerrno);
451 }
452
453 /*
454 * This is a routine to clean up the subtle substructure of the struct
455 * regex_comp type for use by clients of this module. Since the struct
456 * type is private, we use a generic interface, and trust the
457 * application to be damn sure that this operation is valid for the
458 * named memory.
459 */
460
461 void
462 regex_comp_free(void * a)
463 {
464 /*
465 * Free any data being held for previous search strings
466 */
467
468 if (((struct regex_comp *) a) == NULL) {
469 return;
470 }
471
472 regfree(&((struct regex_comp *)a)->r_stp);
473 regfree(&((struct regex_comp *)a)->r_adv);
474 }