1 .\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
   2 .\" Copyright (c) 1992, 1993, 1994
   3 .\"     The Regents of the University of California.  All rights reserved.
   4 .\"
   5 .\" This code is derived from software contributed to Berkeley by
   6 .\" Henry Spencer.
   7 .\"
   8 .\" Redistribution and use in source and binary forms, with or without
   9 .\" modification, are permitted provided that the following conditions
  10 .\" are met:
  11 .\" 1. Redistributions of source code must retain the above copyright
  12 .\"    notice, this list of conditions and the following disclaimer.
  13 .\" 2. Redistributions in binary form must reproduce the above copyright
  14 .\"    notice, this list of conditions and the following disclaimer in the
  15 .\"    documentation and/or other materials provided with the distribution.
  16 .\" 3. Neither the name of the University nor the names of its contributors
  17 .\"    may be used to endorse or promote products derived from this software
  18 .\"    without specific prior written permission.
  19 .\"
  20 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30 .\" SUCH DAMAGE.
  31 .\"
  32 .\"
  33 .\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for permission
  34 .\" to reproduce portions of its copyrighted documentation.
  35 .\"
  36 .\" Original documentation from The Open Group can be obtained online at
  37 .\" http://www.opengroup.org/bookstore/.
  38 .\"
  39 .\" The Institute of Electrical and Electronics Engineers and The Open Group,
  40 .\" have given us permission to reprint portions of their documentation. In the
  41 .\" following statement, the phrase "this text" refers to portions of the system
  42 .\" documentation.
  43 .\"
  44 .\" Portions of this text are reprinted and reproduced in electronic form in the
  45 .\" Sun OS Reference Manual, from IEEE Std 1003.1, 2004 Edition, Standard for
  46 .\" Information Technology -- Portable Operating System Interface (POSIX),
  47 .\" The Open Group Base Specifications Issue 6, Copyright (C) 2001-2004 by the
  48 .\" Institute of Electrical and Electronics Engineers, Inc and The Open Group.
  49 .\"
  50 .\" In the event of any discrepancy between these versions and the original
  51 .\" IEEE and The Open Group Standard, the original IEEE and The Open Group
  52 .\" Standard is the referee document.
  53 .\"
  54 .\" The original Standard can be obtained online at
  55 .\" http://www.opengroup.org/unix/online.html.
  56 .\"
  57 .\" This notice shall appear on any product containing this material.
  58 .\"
  59 .\" The contents of this file are subject to the terms of the
  60 .\" Common Development and Distribution License (the "License").
  61 .\" You may not use this file except in compliance with the License.
  62 .\"
  63 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  64 .\" or http://www.opensolaris.org/os/licensing.
  65 .\" See the License for the specific language governing permissions
  66 .\" and limitations under the License.
  67 .\"
  68 .\" When distributing Covered Code, include this CDDL HEADER in each
  69 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  70 .\" If applicable, add the following below this CDDL HEADER, with the
  71 .\" fields enclosed by brackets "[]" replaced with your own identifying
  72 .\" information: Portions Copyright [yyyy] [name of copyright owner]
  73 .\"
  74 .\"
  75 .\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved.
  76 .\" Portions Copyright (c) 2003, Sun Microsystems, Inc.  All Rights Reserved.
  77 .\" Copyright 2018 Nexenta Systems, Inc.
  78 .\"
  79 .Dd February 3, 2018
  80 .Dt REGCOMP 3C
  81 .Os
  82 .Sh NAME
  83 .Nm regcomp ,
  84 .Nm regexec ,
  85 .Nm regerror ,
  86 .Nm regfree
  87 .Nd regular-expression library
  88 .Sh LIBRARY
  89 .Lb libc
  90 .Sh SYNOPSIS
  91 .In regex.h
  92 .Ft int
  93 .Fo regcomp
  94 .Fa "regex_t *restrict preg" "const char *restrict pattern" "int cflags"
  95 .Fc
  96 .Ft int
  97 .Fo regexec
  98 .Fa "const regex_t *restrict preg" "const char *restrict string"
  99 .Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags"
 100 .Fc
 101 .Ft size_t
 102 .Fo regerror
 103 .Fa "int errcode" "const regex_t *restrict preg"
 104 .Fa "char *restrict errbuf" "size_t errbuf_size"
 105 .Fc
 106 .Ft void
 107 .Fn regfree "regex_t *preg"
 108 .Sh DESCRIPTION
 109 These routines implement
 110 .St -p1003.2
 111 regular expressions; see
 112 .Xr regex 5 .
 113 The
 114 .Fn regcomp
 115 function compiles an RE written as a string into an internal form,
 116 .Fn regexec
 117 matches that internal form against a string and reports results,
 118 .Fn regerror
 119 transforms error codes from either into human-readable messages,
 120 and
 121 .Fn regfree
 122 frees any dynamically-allocated storage used by the internal form
 123 of an RE.
 124 .Pp
 125 The header
 126 .In regex.h
 127 declares two structure types,
 128 .Ft regex_t
 129 and
 130 .Ft regmatch_t ,
 131 the former for compiled internal forms and the latter for match reporting.
 132 It also declares the four functions, a type
 133 .Ft regoff_t ,
 134 and a number of constants with names starting with
 135 .Qq Dv REG_ .
 136 .Ss Fn regcomp
 137 The
 138 .Fn regcomp
 139 function compiles the regular expression contained in the
 140 .Fa pattern
 141 string, subject to the flags in
 142 .Fa cflags ,
 143 and places the results in the
 144 .Ft regex_t
 145 structure pointed to by
 146 .Fa preg .
 147 The
 148 .Fa cflags
 149 argument is the bitwise OR of zero or more of the following flags:
 150 .Bl -tag -width REG_EXTENDED
 151 .It Dv REG_EXTENDED
 152 Compile extended regular expressions
 153 .Pq EREs ,
 154 rather than the basic regular expressions
 155 .Pq BREs
 156 that are the default.
 157 .It Dv REG_BASIC
 158 This is a synonym for 0, provided as a counterpart to
 159 .Dv REG_EXTENDED
 160 to improve readability.
 161 .It Dv REG_NOSPEC
 162 Compile with recognition of all special characters turned off.
 163 All characters are thus considered ordinary, so the RE is a literal string.
 164 This is an extension, compatible with but not specified by
 165 .St -p1003.2 ,
 166 and should be used with caution in software intended to be portable to other
 167 systems.
 168 .Dv REG_EXTENDED
 169 and
 170 .Dv REG_NOSPEC
 171 may not be used in the same call to
 172 .Fn regcomp .
 173 .It Dv REG_LITERAL
 174 An alias of
 175 .Dv REG_NOSPEC .
 176 .It Dv REG_ICASE
 177 Compile for matching that ignores upper/lower case distinctions.
 178 See
 179 .Xr regex 5 .
 180 .It Dv REG_NOSUB
 181 Compile for matching that need only report success or failure,
 182 not what was matched.
 183 .It Dv REG_NEWLINE
 184 Compile for newline-sensitive matching.
 185 By default, newline is a completely ordinary character with no special
 186 meaning in either REs or strings.
 187 With this flag,
 188 .Qq [^
 189 bracket expressions and
 190 .Qq \&.
 191 never match newline,
 192 a
 193 .Qq \&^
 194 anchor matches the null string after any newline in the string in addition to
 195 its normal function, and the
 196 .Qq \&$
 197 anchor matches the null string before any newline in the string in addition to
 198 its normal function.
 199 .It Dv REG_PEND
 200 The regular expression ends, not at the first NUL, but just before the character
 201 pointed to by the
 202 .Va re_endp
 203 member of the structure pointed to by
 204 .Fa preg .
 205 The
 206 .Va re_endp
 207 member is of type
 208 .Vt "const char *" .
 209 This flag permits inclusion of NULs in the RE; they are considered ordinary
 210 characters.
 211 This is an extension, compatible with but not specified by
 212 .St -p1003.2 ,
 213 and should be used with caution in software intended to be portable to other
 214 systems.
 215 .El
 216 .Pp
 217 When successful,
 218 .Fn regcomp
 219 returns 0 and fills in the structure pointed to by
 220 .Fa preg .
 221 One member of that structure
 222 .Po other than
 223 .Va re_endp
 224 .Pc
 225 is publicized:
 226 .Va re_nsub ,
 227 of type
 228 .Ft size_t ,
 229 contains the number of parenthesized subexpressions within the RE
 230 .Po except that the value of this member is undefined if the
 231 .Dv REG_NOSUB
 232 flag was used
 233 .Pc .
 234 .Ss Fn regexec
 235 The
 236 .Fn regexec
 237 function matches the compiled RE pointed to by
 238 .Fa preg
 239 against the
 240 .Fa string ,
 241 subject to the flags in
 242 .Fa eflags ,
 243 and reports results using
 244 .Fa nmatch ,
 245 .Fa pmatch ,
 246 and the returned value.
 247 The RE must have been compiled by a previous invocation of
 248 .Fn regcomp .
 249 The compiled form is not altered during execution of
 250 .Fn regexec ,
 251 so a single compiled RE can be used simultaneously by multiple threads.
 252 .Pp
 253 By default, the NUL-terminated string pointed to by
 254 .Fa string
 255 is considered to be the text of an entire line, minus any terminating
 256 newline.
 257 The
 258 .Fa eflags
 259 argument is the bitwise OR of zero or more of the following flags:
 260 .Bl -tag -width REG_STARTEND
 261 .It Dv REG_NOTBOL
 262 The first character of the string is treated as the continuation
 263 of a line.
 264 This means that the anchors
 265 .Qq \&^ ,
 266 .Qq [[:<:]] ,
 267 and
 268 .Qq \e<
 269 do not match before it; but see
 270 .Dv REG_STARTEND
 271 below.
 272 This does not affect the behavior of newlines under
 273 .Dv REG_NEWLINE .
 274 .It Dv REG_NOTEOL
 275 The NUL terminating the string does not end a line, so the
 276 .Qq \&$
 277 anchor does not match before it.
 278 This does not affect the behavior of newlines under
 279 .Dv REG_NEWLINE .
 280 .It Dv REG_STARTEND
 281 The string is considered to start at
 282 .Fa string No +
 283 .Fa pmatch Ns [0]. Ns Fa rm_so
 284 and to end before the byte located at
 285 .Fa string No +
 286 .Fa pmatch Ns [0]. Ns Fa rm_eo ,
 287 regardless of the value of
 288 .Fa nmatch .
 289 See below for the definition of
 290 .Fa pmatch
 291 and
 292 .Fa nmatch .
 293 This is an extension, compatible with but not specified by
 294 .St -p1003.2 ,
 295 and should be used with caution in software intended to be portable to other
 296 systems.
 297 .Pp
 298 Without
 299 .Dv REG_NOTBOL ,
 300 the position
 301 .Fa rm_so
 302 is considered the beginning of a line, such that
 303 .Qq \&^
 304 matches before it, and the beginning of a word if there is a word character at
 305 this position, such that
 306 .Qq [[:<:]]
 307 and
 308 .Qq \e<
 309 match before it.
 310 .Pp
 311 With
 312 .Dv REG_NOTBOL ,
 313 the character at position
 314 .Fa rm_so
 315 is treated as the continuation of a line, and if
 316 .Fa rm_so
 317 is greater than 0, the preceding character is taken into consideration.
 318 If the preceding character is a newline and the regular expression was compiled
 319 with
 320 .Dv REG_NEWLINE ,
 321 .Qq ^
 322 matches before the string; if the preceding character is not a word character
 323 but the string starts with a word character,
 324 .Qq [[:<:]]
 325 and
 326 .Qq \e<
 327 match before the string.
 328 .El
 329 .Pp
 330 See
 331 .Xr regex 5
 332 for a discussion of what is matched in situations where an RE or a portion
 333 thereof could match any of several substrings of
 334 .Fa string .
 335 .Pp
 336 If
 337 .Dv REG_NOSUB
 338 was specified in the compilation of the RE, or if
 339 .Fa nmatch
 340 is 0,
 341 .Fn regexec
 342 ignores the
 343 .Fa pmatch
 344 argument
 345 .Po but see below for the case where
 346 .Dv REG_STARTEND
 347 is specified
 348 .Pc .
 349 Otherwise,
 350 .Fa pmatch
 351 points to an array of
 352 .Fa nmatch
 353 structures of type
 354 .Ft regmatch_t .
 355 Such a structure has at least the members
 356 .Va rm_so
 357 and
 358 .Va rm_eo ,
 359 both of type
 360 .Ft regoff_t
 361 .Po a signed arithmetic type at least as large as an
 362 .Ft off_t
 363 and a
 364 .Ft ssize_t
 365 .Pc ,
 366 containing respectively the offset of the first character of a substring
 367 and the offset of the first character after the end of the substring.
 368 Offsets are measured from the beginning of the
 369 .Fa string
 370 argument given to
 371 .Fn regexec .
 372 An empty substring is denoted by equal offsets, both indicating the character
 373 following the empty substring.
 374 .Pp
 375 The 0th member of the
 376 .Fa pmatch
 377 array is filled in to indicate what substring of
 378 .Fa string
 379 was matched by the entire RE.
 380 Remaining members report what substring was matched by parenthesized
 381 subexpressions within the RE; member
 382 .Va i
 383 reports subexpression
 384 .Va i ,
 385 with subexpressions counted
 386 .Pq starting at 1
 387 by the order of their opening parentheses in the RE, left to right.
 388 Unused entries in the array
 389 .Po corresponding either to subexpressions that did not participate in the match
 390 at all, or to subexpressions that do not exist in the RE
 391 .Po that is,
 392 .Va i
 393 >
 394 .Fa preg Ns -> Ns Va re_nsub
 395 .Pc
 396 .Pc
 397 have both
 398 .Va rm_so
 399 and
 400 .Va rm_eo
 401 set to -1.
 402 If a subexpression participated in the match several times,
 403 the reported substring is the last one it matched.
 404 .Po Note, as an example in particular, that when the RE
 405 .Qq (b*)+
 406 matches
 407 .Qq bbb ,
 408 the parenthesized subexpression matches each of the three
 409 .So Li b Sc Ns s
 410 and then an infinite number of empty strings following the last
 411 .Qq b ,
 412 so the reported substring is one of the empties.
 413 .Pc
 414 .Pp
 415 If
 416 .Dv REG_STARTEND
 417 is specified,
 418 .Fa pmatch
 419 must point to at least one
 420 .Ft regmatch_t
 421 .Po even if
 422 .Fa nmatch
 423 is 0 or
 424 .Dv REG_NOSUB
 425 was specified
 426 .Pc ,
 427 to hold the input offsets for
 428 .Dv REG_STARTEND .
 429 Use for output is still entirely controlled by
 430 .Fa nmatch ;
 431 if
 432 .Fa nmatch
 433 is 0 or
 434 .Dv REG_NOSUB
 435 was specified,
 436 the value of
 437 .Fa pmatch Ns [0]
 438 will not be changed by a successful
 439 .Fn regexec .
 440 .Ss Fn regerror
 441 The
 442 .Fn regerror
 443 function maps a non-zero
 444 .Fa errcode
 445 from either
 446 .Fn regcomp
 447 or
 448 .Fn regexec
 449 to a human-readable, printable message.
 450 If
 451 .Fa preg
 452 is non-NULL, the error code should have arisen from use of the
 453 .Ft regex_t
 454 pointed to by
 455 .Fa preg ,
 456 and if the error code came from
 457 .Fn regcomp ,
 458 it should have been the result from the most recent
 459 .Fn regcomp
 460 using that
 461 .Ft regex_t .
  462 .Po
  463 The
  464 .Fn regerror
  465 function may be able to supply a more detailed message using information
  466 from the
  467 .Ft regex_t .
  468 .Pc
 469 The
 470 .Fn regerror
 471 function places the NUL-terminated message into the buffer pointed to by
 472 .Fa errbuf ,
 473 limiting the length
 474 .Pq including the NUL
 475 to at most
 476 .Fa errbuf_size
 477 bytes.
 478 If the whole message will not fit, as much of it as will fit before the
 479 terminating NUL is supplied.
 480 In any case, the returned value is the size of buffer needed to hold the whole
 481 message
 482 .Pq including terminating NUL .
 483 If
 484 .Fa errbuf_size
 485 is 0,
 486 .Fa errbuf
 487 is ignored but the return value is still correct.
 488 .Ss Fn regfree
 489 The
 490 .Fn regfree
 491 function frees any dynamically-allocated storage associated with the compiled RE
 492 pointed to by
 493 .Fa preg .
 494 The remaining
 495 .Ft regex_t
 496 is no longer a valid compiled RE and the effect of supplying it to
 497 .Fn regexec
 498 or
 499 .Fn regerror
 500 is undefined.
 501 .Sh RETURN VALUES
 502 On successful completion, the
 503 .Fn regcomp
 504 function returns 0.
 505 Otherwise, it returns an integer value indicating an error as described in
 506 .In regex.h ,
 507 and the content of preg is undefined.
 508 .Pp
 509 On successful completion, the
 510 .Fn regexec
 511 function returns 0.
  512 Otherwise, it returns
 513 .Dv REG_NOMATCH
 514 to indicate no match.
 515 .Pp
 516 Upon successful completion, the
 517 .Fn regerror
 518 function returns the number of bytes needed to hold the entire generated string.
 519 .Pp
 520 The
 521 .Fn regfree
 522 function returns no value.
 523 .Pp
 524 The following constants are defined as error return values:
 525 .Pp
 526 .Bl -tag -width "REG_ECOLLATE" -compact
 527 .It Dv REG_NOMATCH
 528 The
 529 .Fn regexec
 530 function failed to match.
 531 .It Dv REG_BADPAT
 532 Invalid regular expression.
 533 .It Dv REG_ECOLLATE
 534 Invalid collating element referenced.
 535 .It Dv REG_ECTYPE
 536 Invalid character class type referenced.
 537 .It Dv REG_EESCAPE
 538 Trailing
 539 .Qq \&\e
 540 in pattern.
 541 .It Dv REG_ESUBREG
 542 Number in
 543 .Qq \&\e Ns Em digit
 544 invalid or in error.
 545 .It Dv REG_EBRACK
 546 .Qq []
 547 imbalance.
 548 .It Dv REG_ENOSYS
 549 The function is not supported.
 550 .It Dv REG_EPAREN
 551 .Qq \e(\e)
 552 or
 553 .Qq ()
 554 imbalance.
 555 .It Dv REG_EBRACE
 556 .Qq \e{\e}
 557 imbalance.
 558 .It Dv REG_BADBR
 559 Content of
 560 .Qq \e{\e}
 561 invalid: not a number, number too large, more than two
 562 numbers, first larger than second.
 563 .It Dv REG_ERANGE
 564 Invalid endpoint in range expression.
 565 .It Dv REG_ESPACE
 566 Out of memory.
 567 .It Dv REG_BADRPT
 568 .Qq \&? ,
 569 .Qq *
 570 or
 571 .Qq +
 572 not preceded by valid regular expression.
 573 .It Dv REG_EMPTY
 574 Empty (sub)expression.
 575 .It Dv REG_INVARG
 576 Invalid argument, e.g. negative-length string.
 577 .El
 578 .Sh USAGE
 579 An application could use:
 580 .Bd -literal -offset Ds
 581 regerror(code, preg, (char *)NULL, (size_t)0)
 582 .Ed
 583 .Pp
 584 to find out how big a buffer is needed for the generated string,
 585 .Fn malloc
 586 a buffer to hold the string, and then call
 587 .Fn regerror
 588 again to get the string
 589 .Po see
 590 .Xr malloc 3C
 591 .Pc .
  592 Alternatively, it could allocate a fixed, static buffer that is big enough to hold
  593 most strings, and then use
  594 .Fn malloc
  595 to allocate a larger buffer if it finds that this is too small.
 596 .Sh EXAMPLES
 597 Matching string against the extended regular expression in pattern.
 598 .Bd -literal -offset Ds
 599 #include <regex.h>
 600 
 601 /*
 602 * Match string against the extended regular expression in
 603 * pattern, treating errors as no match.
 604 *
 605 * return 1 for match, 0 for no match
 606 */
 607 int
 608 match(const char *string, char *pattern)
 609 {
 610         int status;
 611         regex_t re;
 612 
 613         if (regcomp(&re, pattern, REG_EXTENDED\||\|REG_NOSUB) != 0) {
 614                 return(0);      /* report error */
 615         }
 616         status = regexec(&re, string, (size_t) 0, NULL, 0);
 617         regfree(&re);
 618         if (status != 0) {
 619                 return(0);      /* report error */
 620         }
 621         return(1);
 622 }
 623 .Ed
 624 .Pp
 625 The following demonstrates how the
 626 .Dv REG_NOTBOL
 627 flag could be used with
 628 .Fn regexec
 629 to find all substrings in a line that match a pattern supplied by a user.
 630 .Pq For simplicity of the example, very little error checking is done.
 631 .Bd -literal -offset Ds
 632 (void) regcomp(&re, pattern, 0);
 633 /* this call to regexec() finds the first match on the line */
 634 error = regexec(&re, &buffer[0], 1, &pm, 0);
 635 while (error == 0) {    /* while matches found */
 636         /* substring found between pm.rm_so and pm.rm_eo */
 637         /* This call to regexec() finds the next match */
 638         error = regexec(&re, buffer + pm.rm_eo, 1, &pm, REG_NOTBOL);
 639 }
 640 .Ed
 641 .Sh ERRORS
 642 No errors are defined.
 643 .Sh CODE SET INDEPENDENCE
 644 .Sy Enabled
 645 .Sh INTERFACE STABILITY
 646 .Sy Standard
 647 .Sh MT-LEVEL
 648 .Sy MT-Safe with exceptions
 649 .Pp
 650 The
 651 .Fn regcomp
 652 function can be used safely in a multithreaded application as long as
 653 .Xr setlocale 3C
 654 is not being called to change the locale.
 655 .Sh SEE ALSO
 656 .Xr attributes 5 ,
 657 .Xr regex 5 ,
 658 .Xr standards 5
 659 .Pp
 660 .St -p1003.2 ,
 661 sections 2.8
 662 .Pq Regular Expression Notation
 663 and
 664 B.5
 665 .Pq C Binding for Regular Expression Matching .