1 .\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
2 .\" Copyright (c) 1992, 1993, 1994
3 .\" The Regents of the University of California. All rights reserved.
4 .\"
5 .\" This code is derived from software contributed to Berkeley by
6 .\" Henry Spencer.
7 .\"
8 .\" Redistribution and use in source and binary forms, with or without
9 .\" modification, are permitted provided that the following conditions
10 .\" are met:
11 .\" 1. Redistributions of source code must retain the above copyright
12 .\" notice, this list of conditions and the following disclaimer.
13 .\" 2. Redistributions in binary form must reproduce the above copyright
14 .\" notice, this list of conditions and the following disclaimer in the
15 .\" documentation and/or other materials provided with the distribution.
16 .\" 3. Neither the name of the University nor the names of its contributors
17 .\" may be used to endorse or promote products derived from this software
18 .\" without specific prior written permission.
19 .\"
20 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 .\" SUCH DAMAGE.
31 .\"
32 .\"
33 .\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for permission
34 .\" to reproduce portions of its copyrighted documentation.
35 .\"
36 .\" Original documentation from The Open Group can be obtained online at
37 .\" http://www.opengroup.org/bookstore/.
38 .\"
39 .\" The Institute of Electrical and Electronics Engineers and The Open Group,
40 .\" have given us permission to reprint portions of their documentation. In the
41 .\" following statement, the phrase "this text" refers to portions of the system
42 .\" documentation.
43 .\"
44 .\" Portions of this text are reprinted and reproduced in electronic form in the
45 .\" Sun OS Reference Manual, from IEEE Std 1003.1, 2004 Edition, Standard for
46 .\" Information Technology -- Portable Operating System Interface (POSIX),
47 .\" The Open Group Base Specifications Issue 6, Copyright (C) 2001-2004 by the
48 .\" Institute of Electrical and Electronics Engineers, Inc and The Open Group.
49 .\"
50 .\" In the event of any discrepancy between these versions and the original
51 .\" IEEE and The Open Group Standard, the original IEEE and The Open Group
52 .\" Standard is the referee document.
53 .\"
54 .\" The original Standard can be obtained online at
55 .\" http://www.opengroup.org/unix/online.html.
56 .\"
57 .\" This notice shall appear on any product containing this material.
58 .\"
59 .\" The contents of this file are subject to the terms of the
60 .\" Common Development and Distribution License (the "License").
61 .\" You may not use this file except in compliance with the License.
62 .\"
63 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
64 .\" or http://www.opensolaris.org/os/licensing.
65 .\" See the License for the specific language governing permissions
66 .\" and limitations under the License.
67 .\"
68 .\" When distributing Covered Code, include this CDDL HEADER in each
69 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
70 .\" If applicable, add the following below this CDDL HEADER, with the
71 .\" fields enclosed by brackets "[]" replaced with your own identifying
72 .\" information: Portions Copyright [yyyy] [name of copyright owner]
73 .\"
74 .\"
75 .\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved.
76 .\" Portions Copyright (c) 2003, Sun Microsystems, Inc. All Rights Reserved.
77 .\" Copyright 2018 Nexenta Systems, Inc.
78 .\"
79 .Dd February 3, 2018
80 .Dt REGCOMP 3C
81 .Os
82 .Sh NAME
83 .Nm regcomp ,
84 .Nm regexec ,
85 .Nm regerror ,
86 .Nm regfree
87 .Nd regular-expression library
88 .Sh LIBRARY
89 .Lb libc
90 .Sh SYNOPSIS
91 .In regex.h
92 .Ft int
93 .Fo regcomp
94 .Fa "regex_t *restrict preg" "const char *restrict pattern" "int cflags"
95 .Fc
96 .Ft int
97 .Fo regexec
98 .Fa "const regex_t *restrict preg" "const char *restrict string"
99 .Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags"
100 .Fc
101 .Ft size_t
102 .Fo regerror
103 .Fa "int errcode" "const regex_t *restrict preg"
104 .Fa "char *restrict errbuf" "size_t errbuf_size"
105 .Fc
106 .Ft void
107 .Fn regfree "regex_t *preg"
108 .Sh DESCRIPTION
109 These routines implement
110 .St -p1003.2
111 regular expressions; see
112 .Xr regex 5 .
113 The
114 .Fn regcomp
115 function compiles an RE written as a string into an internal form,
116 .Fn regexec
117 matches that internal form against a string and reports results,
118 .Fn regerror
119 transforms error codes from either into human-readable messages,
120 and
121 .Fn regfree
122 frees any dynamically-allocated storage used by the internal form
123 of an RE.
124 .Pp
125 The header
126 .In regex.h
127 declares two structure types,
128 .Ft regex_t
129 and
130 .Ft regmatch_t ,
131 the former for compiled internal forms and the latter for match reporting.
132 It also declares the four functions, a type
133 .Ft regoff_t ,
134 and a number of constants with names starting with
135 .Qq Dv REG_ .
136 .Ss Fn regcomp
137 The
138 .Fn regcomp
139 function compiles the regular expression contained in the
140 .Fa pattern
141 string, subject to the flags in
142 .Fa cflags ,
143 and places the results in the
144 .Ft regex_t
145 structure pointed to by
146 .Fa preg .
147 The
148 .Fa cflags
149 argument is the bitwise OR of zero or more of the following flags:
150 .Bl -tag -width REG_EXTENDED
151 .It Dv REG_EXTENDED
152 Compile extended regular expressions
153 .Pq EREs ,
154 rather than the basic regular expressions
155 .Pq BREs
156 that are the default.
157 .It Dv REG_BASIC
158 This is a synonym for 0, provided as a counterpart to
159 .Dv REG_EXTENDED
160 to improve readability.
161 .It Dv REG_NOSPEC
162 Compile with recognition of all special characters turned off.
163 All characters are thus considered ordinary, so the RE is a literal string.
164 This is an extension, compatible with but not specified by
165 .St -p1003.2 ,
166 and should be used with caution in software intended to be portable to other
167 systems.
168 .Dv REG_EXTENDED
169 and
170 .Dv REG_NOSPEC
171 may not be used in the same call to
172 .Fn regcomp .
173 .It Dv REG_LITERAL
174 An alias of
175 .Dv REG_NOSPEC .
176 .It Dv REG_ICASE
177 Compile for matching that ignores upper/lower case distinctions.
178 See
179 .Xr regex 5 .
180 .It Dv REG_NOSUB
181 Compile for matching that need only report success or failure,
182 not what was matched.
183 .It Dv REG_NEWLINE
184 Compile for newline-sensitive matching.
185 By default, newline is a completely ordinary character with no special
186 meaning in either REs or strings.
187 With this flag,
188 .Qq [^
189 bracket expressions and
190 .Qq \&.
191 never match newline,
192 a
193 .Qq \&^
194 anchor matches the null string after any newline in the string in addition to
195 its normal function, and the
196 .Qq \&$
197 anchor matches the null string before any newline in the string in addition to
198 its normal function.
199 .It Dv REG_PEND
200 The regular expression ends, not at the first NUL, but just before the character
201 pointed to by the
202 .Va re_endp
203 member of the structure pointed to by
204 .Fa preg .
205 The
206 .Va re_endp
207 member is of type
208 .Vt "const char *" .
209 This flag permits inclusion of NULs in the RE; they are considered ordinary
210 characters.
211 This is an extension, compatible with but not specified by
212 .St -p1003.2 ,
213 and should be used with caution in software intended to be portable to other
214 systems.
215 .El
216 .Pp
217 When successful,
218 .Fn regcomp
219 returns 0 and fills in the structure pointed to by
220 .Fa preg .
221 One member of that structure
222 .Po other than
223 .Va re_endp
224 .Pc
225 is publicized:
226 .Va re_nsub ,
227 of type
228 .Ft size_t ,
229 contains the number of parenthesized subexpressions within the RE
230 .Po except that the value of this member is undefined if the
231 .Dv REG_NOSUB
232 flag was used
233 .Pc .
234 .Ss Fn regexec
235 The
236 .Fn regexec
237 function matches the compiled RE pointed to by
238 .Fa preg
239 against the
240 .Fa string ,
241 subject to the flags in
242 .Fa eflags ,
243 and reports results using
244 .Fa nmatch ,
245 .Fa pmatch ,
246 and the returned value.
247 The RE must have been compiled by a previous invocation of
248 .Fn regcomp .
249 The compiled form is not altered during execution of
250 .Fn regexec ,
251 so a single compiled RE can be used simultaneously by multiple threads.
252 .Pp
253 By default, the NUL-terminated string pointed to by
254 .Fa string
255 is considered to be the text of an entire line, minus any terminating
256 newline.
257 The
258 .Fa eflags
259 argument is the bitwise OR of zero or more of the following flags:
260 .Bl -tag -width REG_STARTEND
261 .It Dv REG_NOTBOL
262 The first character of the string is treated as the continuation
263 of a line.
264 This means that the anchors
265 .Qq \&^ ,
266 .Qq [[:<:]] ,
267 and
268 .Qq \e<
269 do not match before it; but see
270 .Dv REG_STARTEND
271 below.
272 This does not affect the behavior of newlines under
273 .Dv REG_NEWLINE .
274 .It Dv REG_NOTEOL
275 The NUL terminating the string does not end a line, so the
276 .Qq \&$
277 anchor does not match before it.
278 This does not affect the behavior of newlines under
279 .Dv REG_NEWLINE .
280 .It Dv REG_STARTEND
281 The string is considered to start at
282 .Fa string No +
283 .Fa pmatch Ns [0]. Ns Fa rm_so
284 and to end before the byte located at
285 .Fa string No +
286 .Fa pmatch Ns [0]. Ns Fa rm_eo ,
287 regardless of the value of
288 .Fa nmatch .
289 See below for the definition of
290 .Fa pmatch
291 and
292 .Fa nmatch .
293 This is an extension, compatible with but not specified by
294 .St -p1003.2 ,
295 and should be used with caution in software intended to be portable to other
296 systems.
297 .Pp
298 Without
299 .Dv REG_NOTBOL ,
300 the position
301 .Fa rm_so
302 is considered the beginning of a line, such that
303 .Qq \&^
304 matches before it, and the beginning of a word if there is a word character at
305 this position, such that
306 .Qq [[:<:]]
307 and
308 .Qq \e<
309 match before it.
310 .Pp
311 With
312 .Dv REG_NOTBOL ,
313 the character at position
314 .Fa rm_so
315 is treated as the continuation of a line, and if
316 .Fa rm_so
317 is greater than 0, the preceding character is taken into consideration.
318 If the preceding character is a newline and the regular expression was compiled
319 with
320 .Dv REG_NEWLINE ,
321 .Qq ^
322 matches before the string; if the preceding character is not a word character
323 but the string starts with a word character,
324 .Qq [[:<:]]
325 and
326 .Qq \e<
327 match before the string.
328 .El
329 .Pp
330 See
331 .Xr regex 5
332 for a discussion of what is matched in situations where an RE or a portion
333 thereof could match any of several substrings of
334 .Fa string .
335 .Pp
336 If
337 .Dv REG_NOSUB
338 was specified in the compilation of the RE, or if
339 .Fa nmatch
340 is 0,
341 .Fn regexec
342 ignores the
343 .Fa pmatch
344 argument
345 .Po but see below for the case where
346 .Dv REG_STARTEND
347 is specified
348 .Pc .
349 Otherwise,
350 .Fa pmatch
351 points to an array of
352 .Fa nmatch
353 structures of type
354 .Ft regmatch_t .
355 Such a structure has at least the members
356 .Va rm_so
357 and
358 .Va rm_eo ,
359 both of type
360 .Ft regoff_t
361 .Po a signed arithmetic type at least as large as an
362 .Ft off_t
363 and a
364 .Ft ssize_t
365 .Pc ,
366 containing respectively the offset of the first character of a substring
367 and the offset of the first character after the end of the substring.
368 Offsets are measured from the beginning of the
369 .Fa string
370 argument given to
371 .Fn regexec .
372 An empty substring is denoted by equal offsets, both indicating the character
373 following the empty substring.
374 .Pp
375 The 0th member of the
376 .Fa pmatch
377 array is filled in to indicate what substring of
378 .Fa string
379 was matched by the entire RE.
380 Remaining members report what substring was matched by parenthesized
381 subexpressions within the RE; member
382 .Va i
383 reports subexpression
384 .Va i ,
385 with subexpressions counted
386 .Pq starting at 1
387 by the order of their opening parentheses in the RE, left to right.
388 Unused entries in the array
389 .Po corresponding either to subexpressions that did not participate in the match
390 at all, or to subexpressions that do not exist in the RE
391 .Po that is,
392 .Va i
393 >
394 .Fa preg Ns -> Ns Va re_nsub
395 .Pc
396 .Pc
397 have both
398 .Va rm_so
399 and
400 .Va rm_eo
401 set to -1.
402 If a subexpression participated in the match several times,
403 the reported substring is the last one it matched.
404 .Po Note, as a particular example, that when the RE
405 .Qq (b*)+
406 matches
407 .Qq bbb ,
408 the parenthesized subexpression matches each of the three
409 .So Li b Sc Ns s
410 and then an infinite number of empty strings following the last
411 .Qq b ,
412 so the reported substring is one of the empties.
413 .Pc
414 .Pp
415 If
416 .Dv REG_STARTEND
417 is specified,
418 .Fa pmatch
419 must point to at least one
420 .Ft regmatch_t
421 .Po even if
422 .Fa nmatch
423 is 0 or
424 .Dv REG_NOSUB
425 was specified
426 .Pc ,
427 to hold the input offsets for
428 .Dv REG_STARTEND .
429 Use for output is still entirely controlled by
430 .Fa nmatch ;
431 if
432 .Fa nmatch
433 is 0 or
434 .Dv REG_NOSUB
435 was specified,
436 the value of
437 .Fa pmatch Ns [0]
438 will not be changed by a successful
439 .Fn regexec .
440 .Ss Fn regerror
441 The
442 .Fn regerror
443 function maps a non-zero
444 .Fa errcode
445 from either
446 .Fn regcomp
447 or
448 .Fn regexec
449 to a human-readable, printable message.
450 If
451 .Fa preg
452 is non-NULL, the error code should have arisen from use of the
453 .Ft regex_t
454 pointed to by
455 .Fa preg ,
456 and if the error code came from
457 .Fn regcomp ,
458 it should have been the result from the most recent
459 .Fn regcomp
460 using that
461 .Ft regex_t .
462 .Po The
463 .Fn regerror
464 function
465 may be able to supply a more detailed message using information
466 from the
467 .Ft regex_t .
468 .Pc
469 The
470 .Fn regerror
471 function places the NUL-terminated message into the buffer pointed to by
472 .Fa errbuf ,
473 limiting the length
474 .Pq including the NUL
475 to at most
476 .Fa errbuf_size
477 bytes.
478 If the whole message will not fit, as much of it as will fit before the
479 terminating NUL is supplied.
480 In any case, the returned value is the size of buffer needed to hold the whole
481 message
482 .Pq including terminating NUL .
483 If
484 .Fa errbuf_size
485 is 0,
486 .Fa errbuf
487 is ignored but the return value is still correct.
488 .Ss Fn regfree
489 The
490 .Fn regfree
491 function frees any dynamically-allocated storage associated with the compiled RE
492 pointed to by
493 .Fa preg .
494 The remaining
495 .Ft regex_t
496 is no longer a valid compiled RE and the effect of supplying it to
497 .Fn regexec
498 or
499 .Fn regerror
500 is undefined.
501 .Sh RETURN VALUES
502 On successful completion, the
503 .Fn regcomp
504 function returns 0.
505 Otherwise, it returns an integer value indicating an error as described in
506 .In regex.h ,
507 and the content of preg is undefined.
508 .Pp
509 On successful completion, the
510 .Fn regexec
511 function returns 0.
512 Otherwise it returns
513 .Dv REG_NOMATCH
514 to indicate no match.
515 .Pp
516 Upon successful completion, the
517 .Fn regerror
518 function returns the number of bytes needed to hold the entire generated string.
519 .Pp
520 The
521 .Fn regfree
522 function returns no value.
523 .Pp
524 The following constants are defined as error return values:
525 .Pp
526 .Bl -tag -width "REG_ECOLLATE" -compact
527 .It Dv REG_NOMATCH
528 The
529 .Fn regexec
530 function failed to match.
531 .It Dv REG_BADPAT
532 Invalid regular expression.
533 .It Dv REG_ECOLLATE
534 Invalid collating element referenced.
535 .It Dv REG_ECTYPE
536 Invalid character class type referenced.
537 .It Dv REG_EESCAPE
538 Trailing
539 .Qq \&\e
540 in pattern.
541 .It Dv REG_ESUBREG
542 Number in
543 .Qq \&\e Ns Em digit
544 invalid or in error.
545 .It Dv REG_EBRACK
546 .Qq []
547 imbalance.
548 .It Dv REG_ENOSYS
549 The function is not supported.
550 .It Dv REG_EPAREN
551 .Qq \e(\e)
552 or
553 .Qq ()
554 imbalance.
555 .It Dv REG_EBRACE
556 .Qq \e{\e}
557 imbalance.
558 .It Dv REG_BADBR
559 Content of
560 .Qq \e{\e}
561 invalid: not a number, number too large, more than two
562 numbers, first larger than second.
563 .It Dv REG_ERANGE
564 Invalid endpoint in range expression.
565 .It Dv REG_ESPACE
566 Out of memory.
567 .It Dv REG_BADRPT
568 .Qq \&? ,
569 .Qq *
570 or
571 .Qq +
572 not preceded by valid regular expression.
573 .It Dv REG_EMPTY
574 Empty (sub)expression.
575 .It Dv REG_INVARG
576 Invalid argument, e.g. negative-length string.
577 .El
578 .Sh USAGE
579 An application could use:
580 .Bd -literal -offset Ds
581 regerror(code, preg, (char *)NULL, (size_t)0)
582 .Ed
583 .Pp
584 to find out how big a buffer is needed for the generated string,
585 .Fn malloc
586 a buffer to hold the string, and then call
587 .Fn regerror
588 again to get the string
589 .Po see
590 .Xr malloc 3C
591 .Pc .
592 Alternatively, it could allocate a fixed, static buffer that is big enough to hold
593 most strings, and then use
594 .Fn malloc
595 to allocate a larger buffer if it finds that this is too small.
596 .Sh EXAMPLES
597 Matching string against the extended regular expression in pattern.
598 .Bd -literal -offset Ds
599 #include <regex.h>
600
601 /*
602 * Match string against the extended regular expression in
603 * pattern, treating errors as no match.
604 *
605 * return 1 for match, 0 for no match
606 */
607 int
608 match(const char *string, char *pattern)
609 {
610 int status;
611 regex_t re;
612
613 if (regcomp(&re, pattern, REG_EXTENDED\||\|REG_NOSUB) != 0) {
614 return(0); /* report error */
615 }
616 status = regexec(&re, string, (size_t) 0, NULL, 0);
617 regfree(&re);
618 if (status != 0) {
619 return(0); /* report error */
620 }
621 return(1);
622 }
623 .Ed
624 .Pp
625 The following demonstrates how the
626 .Dv REG_NOTBOL
627 flag could be used with
628 .Fn regexec
629 to find all substrings in a line that match a pattern supplied by a user.
630 .Pq For simplicity of the example, very little error checking is done.
631 .Bd -literal -offset Ds
632 (void) regcomp(&re, pattern, 0);
633 /* this call to regexec() finds the first match on the line */
634 error = regexec(&re, &buffer[0], 1, &pm, 0);
635 while (error == 0) { /* while matches found */
636 /* substring found between pm.rm_so and pm.rm_eo */
637 /* This call to regexec() finds the next match */
638 error = regexec(&re, buffer + pm.rm_eo, 1, &pm, REG_NOTBOL);
639 }
640 .Ed
641 .Sh ERRORS
642 No errors are defined.
643 .Sh CODE SET INDEPENDENCE
644 .Sy Enabled
645 .Sh INTERFACE STABILITY
646 .Sy Standard
647 .Sh MT-LEVEL
648 .Sy MT-Safe with exceptions
649 .Pp
650 The
651 .Fn regcomp
652 function can be used safely in a multithreaded application as long as
653 .Xr setlocale 3C
654 is not being called to change the locale.
655 .Sh SEE ALSO
656 .Xr attributes 5 ,
657 .Xr regex 5 ,
658 .Xr standards 5
659 .Pp
660 .St -p1003.2 ,
661 sections 2.8
662 .Pq Regular Expression Notation
663 and
664 B.5
665 .Pq C Binding for Regular Expression Matching .