1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * grep - pattern matching program - combined grep, egrep, and fgrep.
29 * Based on MKS grep command, with XCU & Solaris mods.
30 */
31
32 /*
33 * Copyright 1985, 1992 by Mortice Kern Systems Inc. All rights reserved.
34 *
35 */
36
37 /* Copyright 2012 Nexenta Systems, Inc. All rights reserved. */
38
39 /*
40 * Copyright 2013 Damian Bogel. All rights reserved.
41 */
42
43 #include <string.h>
44 #include <stdlib.h>
45 #include <ctype.h>
46 #include <stdarg.h>
47 #include <regex.h>
48 #include <limits.h>
49 #include <sys/types.h>
50 #include <sys/stat.h>
51 #include <fcntl.h>
52 #include <stdio.h>
53 #include <locale.h>
54 #include <wchar.h>
55 #include <errno.h>
56 #include <unistd.h>
57 #include <wctype.h>
58 #include <ftw.h>
59 #include <sys/param.h>
60
61 #define STDIN_FILENAME gettext("(standard input)")
62
63 #define BSIZE 512 /* Size of block for -b */
64 #define BUFSIZE 8192 /* Input buffer size */
65 #define MAX_DEPTH 1000 /* how deep to recurse */
66
67 #define AFTER 1 /* 'After' Context */
68 #define BEFORE 2 /* 'Before' Context */
69 #define CONTEXT (AFTER|BEFORE) /* Full Context */
70
71 #define M_CSETSIZE 256 /* singlebyte chars */
72 static int bmglen; /* length of BMG pattern */
73 static char *bmgpat; /* BMG pattern */
74 static int bmgtab[M_CSETSIZE]; /* BMG delta1 table */
75
76 typedef struct _PATTERN {
77 char *pattern; /* original pattern */
78 wchar_t *wpattern; /* wide, lowercased pattern */
79 struct _PATTERN *next;
80 regex_t re; /* compiled pattern */
81 } PATTERN;
82
83 static PATTERN *patterns;
84 static char errstr[128]; /* regerror string buffer */
85 static int regflags = 0; /* regcomp options */
86 static int matched = 0; /* return of the grep() */
87 static int errors = 0; /* count of errors */
88 static uchar_t fgrep = 0; /* Invoked as fgrep */
89 static uchar_t egrep = 0; /* Invoked as egrep */
90 static uchar_t nvflag = 1; /* Print matching lines */
91 static uchar_t cflag; /* Count of matches */
92 static uchar_t iflag; /* Case insensitve matching */
93 static uchar_t Hflag; /* Precede lines by file name */
94 static uchar_t hflag; /* Supress printing of filename */
95 static uchar_t lflag; /* Print file names of matches */
96 static uchar_t nflag; /* Precede lines by line number */
97 static uchar_t rflag; /* Search directories recursively */
98 static uchar_t bflag; /* Preccede matches by block number */
99 static uchar_t sflag; /* Suppress file error messages */
100 static uchar_t qflag; /* Suppress standard output */
101 static uchar_t wflag; /* Search for expression as a word */
102 static uchar_t xflag; /* Anchoring */
103 static uchar_t Eflag; /* Egrep or -E flag */
104 static uchar_t Fflag; /* Fgrep or -F flag */
105 static uchar_t Rflag; /* Like rflag, but follow symlinks */
106 static uchar_t outfn; /* Put out file name */
107 static uchar_t conflag; /* show context of matches */
108 static char *cmdname;
109
110 static int use_wchar, use_bmg, mblocale;
111
112 static size_t outbuflen, prntbuflen, conbuflen;
113 static unsigned long conalen, conblen, conmatches;
114 static char *prntbuf, *conbuf;
115 static wchar_t *outline;
116
117 static void addfile(const char *fn);
118 static void addpattern(char *s);
119 static void fixpatterns(void);
120 static void usage(void);
121 static int grep(int, const char *);
122 static void bmgcomp(char *, int);
123 static char *bmgexec(char *, char *);
124 static int recursive(const char *, const struct stat *, int, struct FTW *);
125 static void process_path(const char *);
126 static void process_file(const char *, int);
127
128 /*
129 * mainline for grep
130 */
131 int
132 main(int argc, char **argv)
133 {
134 char *ap, *test;
135 int c;
136 int fflag = 0;
137 int i, n_pattern = 0, n_file = 0;
138 char **pattern_list = NULL;
139 char **file_list = NULL;
140
141 (void) setlocale(LC_ALL, "");
142 #if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */
143 #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it weren't */
144 #endif
145 (void) textdomain(TEXT_DOMAIN);
146
147 /*
148 * true if this is running on the multibyte locale
149 */
150 mblocale = (MB_CUR_MAX > 1);
151 /*
152 * Skip leading slashes
153 */
154 cmdname = argv[0];
155 if (ap = strrchr(cmdname, '/'))
156 cmdname = ap + 1;
157
158 ap = cmdname;
159 /*
160 * Detect egrep/fgrep via command name, map to -E and -F options.
161 */
162 if (*ap == 'e' || *ap == 'E') {
163 regflags |= REG_EXTENDED;
164 egrep++;
165 } else {
166 if (*ap == 'f' || *ap == 'F') {
167 fgrep++;
168 }
169 }
170
171 /* check for non-standard "-line-count" option */
172 for (i = 1; i < argc; i++) {
173 if (strcmp(argv[i], "--") == 0)
174 break;
175
176 if ((argv[i][0] == '-') && isdigit(argv[i][1])) {
177 if (strlen(&argv[i][1]) !=
178 strspn(&argv[i][1], "0123456789")) {
179 (void) fprintf(stderr, gettext(
180 "%s: Bad number flag\n"), argv[0]);
181 usage();
182 }
183
184 conalen = conblen = strtoul(&argv[i][1], (char **)NULL,
185 10);
186
187 /* isdigit() check prevents negative arguments */
188 if (conalen >= ULONG_MAX) {
189 (void) fprintf(stderr, gettext(
190 "%s: Bad context argument\n"), argv[0]);
191 }
192
193 if (conalen)
194 conflag = CONTEXT;
195
196 while (i < argc) {
197 argv[i] = argv[i + 1];
198 i++;
199 }
200 argc--;
201 }
202 }
203
204 while ((c = getopt(argc, argv, "vwchHilnrbse:f:qxEFIRA:B:C:")) != EOF) {
205 unsigned long tval;
206 switch (c) {
207 case 'v': /* POSIX: negate matches */
208 nvflag = 0;
209 break;
210
211 case 'c': /* POSIX: write count */
212 cflag++;
213 break;
214
215 case 'i': /* POSIX: ignore case */
216 iflag++;
217 regflags |= REG_ICASE;
218 break;
219
220 case 'l': /* POSIX: Write filenames only */
221 lflag++;
222 break;
223
224 case 'n': /* POSIX: Write line numbers */
225 nflag++;
226 break;
227
228 case 'r': /* Solaris: search recursively */
229 rflag++;
230 break;
231
232 case 'b': /* Solaris: Write file block numbers */
233 bflag++;
234 break;
235
236 case 's': /* POSIX: No error msgs for files */
237 sflag++;
238 break;
239
240 case 'e': /* POSIX: pattern list */
241 n_pattern++;
242 pattern_list = realloc(pattern_list,
243 sizeof (char *) * n_pattern);
244 if (pattern_list == NULL) {
245 (void) fprintf(stderr,
246 gettext("%s: out of memory\n"),
247 cmdname);
248 exit(2);
249 }
250 *(pattern_list + n_pattern - 1) = optarg;
251 break;
252
253 case 'f': /* POSIX: pattern file */
254 fflag = 1;
255 n_file++;
256 file_list = realloc(file_list,
257 sizeof (char *) * n_file);
258 if (file_list == NULL) {
259 (void) fprintf(stderr,
260 gettext("%s: out of memory\n"),
261 cmdname);
262 exit(2);
263 }
264 *(file_list + n_file - 1) = optarg;
265 break;
266
267 /* based on options order h or H is set as in GNU grep */
268 case 'h': /* Solaris: supress printing of file name */
269 hflag = 1;
270 Hflag = 0;
271 break;
272 /* Solaris: precede every matching with file name */
273 case 'H':
274 Hflag = 1;
275 hflag = 0;
276 break;
277
278 case 'q': /* POSIX: quiet: status only */
279 qflag++;
280 break;
281
282 case 'w': /* Solaris: treat pattern as word */
283 wflag++;
284 break;
285
286 case 'x': /* POSIX: full line matches */
287 xflag++;
288 regflags |= REG_ANCHOR;
289 break;
290
291 case 'E': /* POSIX: Extended RE's */
292 regflags |= REG_EXTENDED;
293 Eflag++;
294 break;
295
296 case 'F': /* POSIX: strings, not RE's */
297 Fflag++;
298 break;
299
300 case 'R': /* Solaris: like rflag, but follow symlinks */
301 Rflag++;
302 rflag++;
303 break;
304
305 case 'A': /* print N lines after each match */
306 conalen = strtoul(optarg, &test, 10);
307 /* *test will be non-null if optarg is negative */
308 if (*test != '\0' || conalen >= ULONG_MAX) {
309 (void) fprintf(stderr, gettext(
310 "%s: Bad context argument\n"), argv[0]);
311 exit(2);
312 }
313 if (conalen)
314 conflag |= AFTER;
315 else
316 conflag &= ~AFTER;
317 break;
318 case 'B': /* print N lines before each match */
319 conblen = strtoul(optarg, &test, 10);
320 /* *test will be non-null if optarg is negative */
321 if (*test != '\0' || conblen >= ULONG_MAX) {
322 (void) fprintf(stderr, gettext(
323 "%s: Bad context argument\n"), argv[0]);
324 exit(2);
325 }
326 if (conblen)
327 conflag |= BEFORE;
328 else
329 conflag &= ~BEFORE;
330 break;
331 case 'C': /* print N lines around each match */
332 tval = strtoul(optarg, &test, 10);
333 /* *test will be non-null if optarg is negative */
334 if (*test != '\0' || tval >= ULONG_MAX) {
335 (void) fprintf(stderr, gettext(
336 "%s: Bad context argument\n"), argv[0]);
337 exit(2);
338 }
339 if (tval) {
340 if (!(conflag & BEFORE))
341 conblen = tval;
342 if (!(conflag & AFTER))
343 conalen = tval;
344 conflag = CONTEXT;
345 }
346 break;
347
348 default:
349 usage();
350 }
351 }
352 /*
353 * If we're invoked as egrep or fgrep we need to do some checks
354 */
355
356 if (egrep || fgrep) {
357 /*
358 * Use of -E or -F with egrep or fgrep is illegal
359 */
360 if (Eflag || Fflag)
361 usage();
362 /*
363 * Don't allow use of wflag with egrep / fgrep
364 */
365 if (wflag)
366 usage();
367 /*
368 * For Solaris the -s flag is equivalent to XCU -q
369 */
370 if (sflag)
371 qflag++;
372 /*
373 * done with above checks - set the appropriate flags
374 */
375 if (egrep)
376 Eflag++;
377 else /* Else fgrep */
378 Fflag++;
379 }
380
381 if (wflag && (Eflag || Fflag)) {
382 /*
383 * -w cannot be specified with grep -F
384 */
385 usage();
386 }
387
388 /*
389 * -E and -F flags are mutually exclusive - check for this
390 */
391 if (Eflag && Fflag)
392 usage();
393
394 /*
395 * -l overrides -H like in GNU grep
396 */
397 if (lflag)
398 Hflag = 0;
399
400 /*
401 * -c, -l and -q flags are mutually exclusive
402 * We have -c override -l like in Solaris.
403 * -q overrides -l & -c programmatically in grep() function.
404 */
405 if (cflag && lflag)
406 lflag = 0;
407
408 argv += optind - 1;
409 argc -= optind - 1;
410
411 /*
412 * Now handling -e and -f option
413 */
414 if (pattern_list) {
415 for (i = 0; i < n_pattern; i++) {
416 addpattern(pattern_list[i]);
417 }
418 free(pattern_list);
419 }
420 if (file_list) {
421 for (i = 0; i < n_file; i++) {
422 addfile(file_list[i]);
423 }
424 free(file_list);
425 }
426
427 /*
428 * No -e or -f? Make sure there is one more arg, use it as the pattern.
429 */
430 if (patterns == NULL && !fflag) {
431 if (argc < 2)
432 usage();
433 addpattern(argv[1]);
434 argc--;
435 argv++;
436 }
437
438 /*
439 * If -x flag is not specified or -i flag is specified
440 * with fgrep in a multibyte locale, need to use
441 * the wide character APIs. Otherwise, byte-oriented
442 * process will be done.
443 */
444 use_wchar = Fflag && mblocale && (!xflag || iflag);
445
446 /*
447 * Compile Patterns and also decide if BMG can be used
448 */
449 fixpatterns();
450
451 /* Process all files: stdin, or rest of arg list */
452 if (argc < 2) {
453 matched = grep(0, STDIN_FILENAME);
454 } else {
455 if (Hflag || (argc > 2 && hflag == 0))
456 outfn = 1; /* Print filename on match line */
457 for (argv++; *argv != NULL; argv++) {
458 process_path(*argv);
459 }
460 }
461 /*
462 * Return() here is used instead of exit
463 */
464
465 (void) fflush(stdout);
466
467 if (errors)
468 return (2);
469 return (matched ? 0 : 1);
470 }
471
472 static void
473 process_path(const char *path)
474 {
475 struct stat st;
476 int walkflags = FTW_CHDIR;
477 char *buf = NULL;
478
479 if (rflag) {
480 if (stat(path, &st) != -1 &&
481 (st.st_mode & S_IFMT) == S_IFDIR) {
482 outfn = 1; /* Print filename */
483
484 /*
485 * Add trailing slash if arg
486 * is directory, to resolve symlinks.
487 */
488 if (path[strlen(path) - 1] != '/') {
489 (void) asprintf(&buf, "%s/", path);
490 if (buf != NULL)
491 path = buf;
492 }
493
494 /*
495 * Search through subdirs if path is directory.
496 * Don't follow symlinks if Rflag is not set.
497 */
498 if (!Rflag)
499 walkflags |= FTW_PHYS;
500
501 if (nftw(path, recursive, MAX_DEPTH, walkflags) != 0) {
502 if (!sflag)
503 (void) fprintf(stderr,
504 gettext("%s: can't open \"%s\"\n"),
505 cmdname, path);
506 errors = 1;
507 }
508 return;
509 }
510 }
511 process_file(path, 0);
512 }
513
514 /*
515 * Read and process all files in directory recursively.
516 */
517 static int
518 recursive(const char *name, const struct stat *statp, int info, struct FTW *ftw)
519 {
520 /*
521 * Process files and follow symlinks if Rflag set.
522 */
523 if (info != FTW_F) {
524 /* Report broken symlinks and unreadable files */
525 if (!sflag &&
526 (info == FTW_SLN || info == FTW_DNR || info == FTW_NS)) {
527 (void) fprintf(stderr,
528 gettext("%s: can't open \"%s\"\n"), cmdname, name);
529 }
530 return (0);
531 }
532
533
534 /* Skip devices and pipes if Rflag is not set */
535 if (!Rflag && !S_ISREG(statp->st_mode))
536 return (0);
537 /* Pass offset to relative name from FTW_CHDIR */
538 process_file(name, ftw->base);
539 return (0);
540 }
541
542 /*
543 * Opens file and call grep function.
544 */
545 static void
546 process_file(const char *name, int base)
547 {
548 int fd;
549
550 if ((fd = open(name + base, O_RDONLY)) == -1) {
551 errors = 1;
552 if (!sflag) /* Silent mode */
553 (void) fprintf(stderr, gettext(
554 "%s: can't open \"%s\"\n"),
555 cmdname, name);
556 return;
557 }
558 matched |= grep(fd, name);
559 (void) close(fd);
560
561 if (ferror(stdout)) {
562 (void) fprintf(stderr, gettext(
563 "%s: error writing to stdout\n"),
564 cmdname);
565 (void) fflush(stdout);
566 exit(2);
567 }
568
569 }
570
571 /*
572 * Add a file of strings to the pattern list.
573 */
574 static void
575 addfile(const char *fn)
576 {
577 FILE *fp;
578 char *inbuf;
579 char *bufp;
580 size_t bufsiz, buflen, bufused;
581
582 /*
583 * Open the pattern file
584 */
585 if ((fp = fopen(fn, "r")) == NULL) {
586 (void) fprintf(stderr, gettext("%s: can't open \"%s\"\n"),
587 cmdname, fn);
588 exit(2);
589 }
590 bufsiz = BUFSIZE;
591 if ((inbuf = malloc(bufsiz)) == NULL) {
592 (void) fprintf(stderr,
593 gettext("%s: out of memory\n"), cmdname);
594 exit(2);
595 }
596 bufp = inbuf;
597 bufused = 0;
598 /*
599 * Read in the file, reallocing as we need more memory
600 */
601 while (fgets(bufp, bufsiz - bufused, fp) != NULL) {
602 buflen = strlen(bufp);
603 bufused += buflen;
604 if (bufused + 1 == bufsiz && bufp[buflen - 1] != '\n') {
605 /*
606 * if this line does not fit to the buffer,
607 * realloc larger buffer
608 */
609 bufsiz += BUFSIZE;
610 if ((inbuf = realloc(inbuf, bufsiz)) == NULL) {
611 (void) fprintf(stderr,
612 gettext("%s: out of memory\n"),
613 cmdname);
614 exit(2);
615 }
616 bufp = inbuf + bufused;
617 continue;
618 }
619 if (bufp[buflen - 1] == '\n') {
620 bufp[--buflen] = '\0';
621 }
622 addpattern(inbuf);
623
624 bufp = inbuf;
625 bufused = 0;
626 }
627 free(inbuf);
628 free(prntbuf);
629 free(conbuf);
630 (void) fclose(fp);
631 }
632
633 /*
634 * Add a string to the pattern list.
635 */
636 static void
637 addpattern(char *s)
638 {
639 PATTERN *pp;
640 char *wordbuf;
641 char *np;
642
643 for (; ; ) {
644 np = strchr(s, '\n');
645 if (np != NULL)
646 *np = '\0';
647 if ((pp = malloc(sizeof (PATTERN))) == NULL) {
648 (void) fprintf(stderr, gettext(
649 "%s: out of memory\n"),
650 cmdname);
651 exit(2);
652 }
653 if (wflag) {
654 /*
655 * Solaris wflag support: Add '<' '>' to pattern to
656 * select it as a word. Doesn't make sense with -F
657 * but we're Libertarian.
658 */
659 size_t slen, wordlen;
660
661 slen = strlen(s);
662 wordlen = slen + 5; /* '\\' '<' s '\\' '>' '\0' */
663 if ((wordbuf = malloc(wordlen)) == NULL) {
664 (void) fprintf(stderr,
665 gettext("%s: out of memory\n"),
666 cmdname);
667 exit(2);
668 }
669 (void) strcpy(wordbuf, "\\<");
670 (void) strcpy(wordbuf + 2, s);
671 (void) strcpy(wordbuf + 2 + slen, "\\>");
672 } else {
673 if ((wordbuf = strdup(s)) == NULL) {
674 (void) fprintf(stderr,
675 gettext("%s: out of memory\n"),
676 cmdname);
677 exit(2);
678 }
679 }
680 pp->pattern = wordbuf;
681 pp->next = patterns;
682 patterns = pp;
683 if (np == NULL)
684 break;
685 s = np + 1;
686 }
687 }
688
689 /*
690 * Fix patterns.
691 * Must do after all arguments read, in case later -i option.
692 */
693 static void
694 fixpatterns(void)
695 {
696 PATTERN *pp;
697 int rv, fix_pattern, npatterns;
698
699 /*
700 * As REG_ANCHOR flag is not supported in the current Solaris,
701 * need to fix the specified pattern if -x is specified with
702 * grep or egrep
703 */
704 fix_pattern = !Fflag && xflag;
705
706 for (npatterns = 0, pp = patterns; pp != NULL; pp = pp->next) {
707 npatterns++;
708 if (fix_pattern) {
709 char *cp, *cq;
710 size_t plen, nplen;
711
712 plen = strlen(pp->pattern);
713 /* '^' pattern '$' */
714 nplen = 1 + plen + 1 + 1;
715 if ((cp = malloc(nplen)) == NULL) {
716 (void) fprintf(stderr,
717 gettext("%s: out of memory\n"),
718 cmdname);
719 exit(2);
720 }
721 cq = cp;
722 *cq++ = '^';
723 cq = strcpy(cq, pp->pattern) + plen;
724 *cq++ = '$';
725 *cq = '\0';
726 free(pp->pattern);
727 pp->pattern = cp;
728 }
729
730 if (Fflag) {
731 if (use_wchar) {
732 /*
733 * Fflag && mblocale && iflag
734 * Fflag && mblocale && !xflag
735 */
736 size_t n;
737 n = strlen(pp->pattern) + 1;
738 if ((pp->wpattern =
739 malloc(sizeof (wchar_t) * n)) == NULL) {
740 (void) fprintf(stderr,
741 gettext("%s: out of memory\n"),
742 cmdname);
743 exit(2);
744 }
745 if (mbstowcs(pp->wpattern, pp->pattern, n) ==
746 (size_t)-1) {
747 (void) fprintf(stderr,
748 gettext("%s: failed to convert "
749 "\"%s\" to wide-characters\n"),
750 cmdname, pp->pattern);
751 exit(2);
752 }
753 if (iflag) {
754 wchar_t *wp;
755 for (wp = pp->wpattern; *wp != L'\0';
756 wp++) {
757 *wp = towlower((wint_t)*wp);
758 }
759 }
760 free(pp->pattern);
761 } else {
762 /*
763 * Fflag && mblocale && !iflag
764 * Fflag && !mblocale && iflag
765 * Fflag && !mblocale && !iflag
766 */
767 if (iflag) {
768 unsigned char *cp;
769 for (cp = (unsigned char *)pp->pattern;
770 *cp != '\0'; cp++) {
771 *cp = tolower(*cp);
772 }
773 }
774 }
775 /*
776 * fgrep: No regular expressions.
777 */
778 continue;
779 }
780
781 /*
782 * For non-fgrep, compile the regular expression,
783 * give an informative error message, and exit if
784 * it didn't compile.
785 */
786 if ((rv = regcomp(&pp->re, pp->pattern, regflags)) != 0) {
787 (void) regerror(rv, &pp->re, errstr, sizeof (errstr));
788 (void) fprintf(stderr,
789 gettext("%s: RE error in %s: %s\n"),
790 cmdname, pp->pattern, errstr);
791 exit(2);
792 }
793 free(pp->pattern);
794 }
795
796 /*
797 * Decide if we are able to run the Boyer-Moore-Gosper algorithm.
798 * Use the Boyer-Moore-Gosper algorithm if:
799 * - fgrep (Fflag)
800 * - singlebyte locale (!mblocale)
801 * - no ignoring case (!iflag)
802 * - no printing line numbers (!nflag)
803 * - no negating the output (nvflag)
804 * - only one pattern (npatterns == 1)
805 * - non zero length pattern (strlen(patterns->pattern) != 0)
806 * - no context required (!conflag)
807 *
808 * It's guaranteed patterns->pattern is still alive
809 * when Fflag && !mblocale.
810 */
811 use_bmg = Fflag && !mblocale && !iflag && !nflag && nvflag &&
812 (npatterns == 1) && (strlen(patterns->pattern) != 0) && !conflag;
813 }
814
815 /*
816 * Search a newline from the beginning of the string
817 */
818 static char *
819 find_nl(const char *ptr, size_t len)
820 {
821 while (len-- != 0) {
822 if (*ptr++ == '\n') {
823 return ((char *)--ptr);
824 }
825 }
826 return (NULL);
827 }
828
829 /*
830 * Search a newline from the end of the string
831 */
832 static char *
833 rfind_nl(const char *ptr, size_t len)
834 {
835 const char *uptr = ptr + len;
836 while (len--) {
837 if (*--uptr == '\n') {
838 return ((char *)uptr);
839 }
840 }
841 return (NULL);
842 }
843
844 /*
845 * Duplicate the specified string converting each character
846 * into a lower case.
847 */
848 static char *
849 istrdup(const char *s1)
850 {
851 static size_t ibuflen = 0;
852 static char *ibuf = NULL;
853 size_t slen;
854 char *p;
855
856 slen = strlen(s1);
857 if (slen >= ibuflen) {
858 /* ibuf does not fit to s1 */
859 ibuflen = slen + 1;
860 ibuf = realloc(ibuf, ibuflen);
861 if (ibuf == NULL) {
862 (void) fprintf(stderr,
863 gettext("%s: out of memory\n"), cmdname);
864 exit(2);
865 }
866 }
867 p = ibuf;
868 do {
869 *p++ = tolower(*s1);
870 } while (*s1++ != '\0');
871 return (ibuf);
872 }
873
874 /*
875 * Do grep on a single file.
876 * Return true in any lines matched.
877 *
878 * We have two strategies:
879 * The fast one is used when we have a single pattern with
880 * a string known to occur in the pattern. We can then
881 * do a BMG match on the whole buffer.
882 * This is an order of magnitude faster.
883 * Otherwise we split the buffer into lines,
884 * and check for a match on each line.
885 */
886 static int
887 grep(int fd, const char *fn)
888 {
889 PATTERN *pp;
890 off_t data_len; /* length of the data chunk */
891 off_t line_len; /* length of the current line */
892 off_t line_offset; /* current line's offset from the beginning */
893 off_t blkoffset; /* line_offset but context-compatible */
894 long long lineno, linenum;
895 long long matches = 0; /* Number of matching lines */
896 long long conacnt = 0, conbcnt = 0; /* context line count */
897 int newlinep; /* 0 if the last line of file has no newline */
898 char *ptr, *ptrend, *prntptr, *prntptrend;
899 char *nextptr = NULL, *nextend = NULL;
900 char *conptr = NULL, *conptrend = NULL;
901 char *matchptr = NULL;
902 int conaprnt = 0, conbprnt = 0, lastmatch = 0;
903 int nearmatch = conmatches ? 1 : 0; /* w/in N+1 of last match */
904 size_t prntlen;
905
906 if (patterns == NULL)
907 return (0); /* no patterns to match -- just return */
908
909 pp = patterns;
910
911 if (use_bmg) {
912 bmgcomp(pp->pattern, strlen(pp->pattern));
913 }
914
915 if (use_wchar && outline == NULL) {
916 outbuflen = BUFSIZE + 1;
917 outline = malloc(sizeof (wchar_t) * outbuflen);
918 if (outline == NULL) {
919 (void) fprintf(stderr, gettext("%s: out of memory\n"),
920 cmdname);
921 exit(2);
922 }
923 }
924
925 if (prntbuf == NULL) {
926 prntbuflen = BUFSIZE;
927 if ((prntbuf = malloc(prntbuflen + 1)) == NULL) {
928 (void) fprintf(stderr, gettext("%s: out of memory\n"),
929 cmdname);
930 exit(2);
931 }
932 }
933
934 if (conflag && (conbuf == NULL)) {
935 conbuflen = BUFSIZE;
936 if ((conbuf = malloc(BUFSIZE+1)) == NULL) {
937 (void) fprintf(stderr, gettext("%s: out of memory\n"),
938 cmdname);
939 exit(2);
940 }
941 }
942
943 blkoffset = line_offset = 0;
944 lineno = 0;
945 linenum = 1;
946 newlinep = 1;
947 data_len = 0;
948 for (; ; ) {
949 long count;
950 off_t offset = 0;
951 int eof = 0, rv = REG_NOMATCH;
952 char separate;
953
954 if (data_len == 0) {
955 /*
956 * If no data in the buffer, reset ptr
957 */
958 ptr = prntbuf;
959 if (conptr == NULL)
960 conptrend = conptr = conbuf;
961 }
962 if (ptr == prntbuf) {
963 /*
964 * The current data chunk starts from prntbuf.
965 * This means either the buffer has no data
966 * or the buffer has no newline.
967 * So, read more data from input.
968 */
969 count = read(fd, ptr + data_len, prntbuflen - data_len);
970 if (count < 0) {
971 /* read error */
972 if (cflag) {
973 if (outfn && !rflag) {
974 (void) fprintf(stdout,
975 "%s:", fn);
976 }
977 if (!qflag && !rflag) {
978 (void) fprintf(stdout, "%lld\n",
979 matches);
980 }
981 }
982 return (0);
983 } else if (count == 0) {
984 /* no new data */
985 eof = 1;
986
987 /* we never want to match EOF */
988 pp = (PATTERN *) !nvflag;
989
990 if (data_len == 0) {
991 /* end of file already reached */
992 if (conflag) {
993 *conptrend = '\n';
994 goto L_next_line;
995 } else {
996 goto out;
997 }
998 }
999 /* last line of file has no newline */
1000 ptrend = ptr + data_len;
1001 newlinep = 0;
1002 goto L_start_process;
1003 }
1004 offset = data_len;
1005 data_len += count;
1006 }
1007
1008 /*
1009 * Look for newline in the chunk
1010 * between ptr + offset and ptr + data_len - offset.
1011 */
1012 ptrend = find_nl(ptr + offset, data_len - offset);
1013 if (ptrend == NULL) {
1014 /* no newline found in this chunk */
1015 if (ptr > prntbuf) {
1016 /*
1017 * Move remaining data to the beginning
1018 * of the buffer.
1019 * Remaining data lie from ptr for
1020 * data_len bytes.
1021 */
1022 (void) memmove(prntbuf, ptr, data_len);
1023 }
1024 if (data_len == prntbuflen) {
1025 /*
1026 * Not enough room in the buffer
1027 */
1028 prntbuflen += BUFSIZE;
1029 prntbuf = realloc(prntbuf, prntbuflen + 1);
1030 if (prntbuf == NULL) {
1031 (void) fprintf(stderr,
1032 gettext("%s: out of memory\n"),
1033 cmdname);
1034 exit(2);
1035 }
1036 }
1037 ptr = prntbuf;
1038 /* read the next input */
1039 continue;
1040 }
1041 L_start_process:
1042
1043 /*
1044 * Beginning of the chunk: ptr
1045 * End of the chunk: ptr + data_len
1046 * Beginning of the line: ptr
1047 * End of the line: ptrend
1048 */
1049
1050 if (use_bmg) {
1051 /*
1052 * Use Boyer-Moore-Gosper algorithm to find out if
1053 * this chunk (not this line) contains the specified
1054 * pattern. If not, restart from the last line
1055 * of this chunk.
1056 */
1057 char *bline;
1058 bline = bmgexec(ptr, ptr + data_len);
1059 if (bline == NULL) {
1060 /*
1061 * No pattern found in this chunk.
1062 * Need to find the last line
1063 * in this chunk.
1064 */
1065 ptrend = rfind_nl(ptr, data_len);
1066
1067 /*
1068 * When this chunk does not contain newline,
1069 * ptrend becomes NULL, which should happen
1070 * when the last line of file does not end
1071 * with a newline. At such a point,
1072 * newlinep should have been set to 0.
1073 * Therefore, just after jumping to
1074 * L_skip_line, the main for-loop quits,
1075 * and the line_len value won't be
1076 * used.
1077 */
1078 line_len = ptrend - ptr;
1079 goto L_skip_line;
1080 }
1081 if (bline > ptrend) {
1082 /*
1083 * Pattern found not in the first line
1084 * of this chunk.
1085 * Discard the first line.
1086 */
1087 line_len = ptrend - ptr;
1088 goto L_skip_line;
1089 }
1090 /*
1091 * Pattern found in the first line of this chunk.
1092 * Using this result.
1093 */
1094 *ptrend = '\0';
1095 line_len = ptrend - ptr;
1096
1097 /*
1098 * before jumping to L_next_line,
1099 * need to handle xflag if specified
1100 */
1101 if (xflag && (line_len != bmglen ||
1102 strcmp(bmgpat, ptr) != 0)) {
1103 /* didn't match */
1104 pp = NULL;
1105 } else {
1106 pp = patterns; /* to make it happen */
1107 }
1108 goto L_next_line;
1109 }
1110 lineno++;
1111 /*
1112 * Line starts from ptr and ends at ptrend.
1113 * line_len will be the length of the line.
1114 */
1115 *ptrend = '\0';
1116 line_len = ptrend - ptr;
1117
1118 /*
1119 * From now, the process will be performed based
1120 * on the line from ptr to ptrend.
1121 */
1122 if (use_wchar) {
1123 size_t len;
1124
1125 if (line_len >= outbuflen) {
1126 outbuflen = line_len + 1;
1127 outline = realloc(outline,
1128 sizeof (wchar_t) * outbuflen);
1129 if (outline == NULL) {
1130 (void) fprintf(stderr,
1131 gettext("%s: out of memory\n"),
1132 cmdname);
1133 exit(2);
1134 }
1135 }
1136
1137 len = mbstowcs(outline, ptr, line_len);
1138 if (len == (size_t)-1) {
1139 (void) fprintf(stderr, gettext(
1140 "%s: input file \"%s\": line %lld: invalid multibyte character\n"),
1141 cmdname, fn, lineno);
1142 /* never match a line with invalid sequence */
1143 goto L_skip_line;
1144 }
1145 outline[len] = L'\0';
1146
1147 if (iflag) {
1148 wchar_t *cp;
1149 for (cp = outline; *cp != '\0'; cp++) {
1150 *cp = towlower((wint_t)*cp);
1151 }
1152 }
1153
1154 if (xflag) {
1155 for (pp = patterns; pp; pp = pp->next) {
1156 if (outline[0] == pp->wpattern[0] &&
1157 wcscmp(outline,
1158 pp->wpattern) == 0) {
1159 /* matched */
1160 rv = REG_OK;
1161 break;
1162 }
1163 }
1164 } else {
1165 for (pp = patterns; pp; pp = pp->next) {
1166 if (wcswcs(outline, pp->wpattern)
1167 != NULL) {
1168 /* matched */
1169 rv = REG_OK;
1170 break;
1171 }
1172 }
1173 }
1174 } else if (Fflag) {
1175 /* fgrep in byte-oriented handling */
1176 char *fptr;
1177 if (iflag) {
1178 fptr = istrdup(ptr);
1179 } else {
1180 fptr = ptr;
1181 }
1182 if (xflag) {
1183 /* fgrep -x */
1184 for (pp = patterns; pp; pp = pp->next) {
1185 if (fptr[0] == pp->pattern[0] &&
1186 strcmp(fptr, pp->pattern) == 0) {
1187 /* matched */
1188 rv = REG_OK;
1189 break;
1190 }
1191 }
1192 } else {
1193 for (pp = patterns; pp; pp = pp->next) {
1194 if (strstr(fptr, pp->pattern) != NULL) {
1195 /* matched */
1196 rv = REG_OK;
1197 break;
1198 }
1199 }
1200 }
1201 } else {
1202 /* grep or egrep */
1203 for (pp = patterns; pp; pp = pp->next) {
1204 rv = regexec(&pp->re, ptr, 0, NULL, 0);
1205 if (rv == REG_OK) {
1206 /* matched */
1207 break;
1208 }
1209
1210 switch (rv) {
1211 case REG_NOMATCH:
1212 break;
1213 case REG_ECHAR:
1214 (void) fprintf(stderr, gettext(
1215 "%s: input file \"%s\": line %lld: invalid multibyte character\n"),
1216 cmdname, fn, lineno);
1217 break;
1218 default:
1219 (void) regerror(rv, &pp->re, errstr,
1220 sizeof (errstr));
1221 (void) fprintf(stderr, gettext(
1222 "%s: input file \"%s\": line %lld: %s\n"),
1223 cmdname, fn, lineno, errstr);
1224 exit(2);
1225 }
1226 }
1227 }
1228
1229 /*
1230 * Context is set up as follows:
1231 * For a 'Before' context, we maintain a set of pointers
1232 * containing 'N' lines of context. If the current number of
1233 * lines contained is greater than N, and N isn't a match, the
1234 * start pointer is moved forward to the next newline.
1235 *
1236 * If we ever find a match, we print out immediately.
1237 * 'nearmatch' tells us if we're within N+1 lines of the last
1238 * match ; if we are, and we find another match, we don't
1239 * separate the matches. 'nearmatch' becomes false when
1240 * a line gets rotated out of the context.
1241 *
1242 * For an 'After' context, we simply wait until we've found a
1243 * match, then create a context N+1 lines big. If we don't find
1244 * a match within the context, we print out the current context.
1245 * Otherwise, we save a reference to the new matching line,
1246 * print out the other context, and reset our context pointers
1247 * to the new matching line.
1248 *
1249 * 'nearmatch' becomes false when we find a non-matching line
1250 * that isn't a part of any context.
1251 *
1252 * A full-context is implemented as a combination of the
1253 * 'Before' and 'After' context logic. Before we find a match,
1254 * we follow the Before logic. When we find a match, we
1255 * follow the After logic. 'nearmatch' is handled by the Before
1256 * logic.
1257 */
1258
1259 if (!conflag)
1260 goto L_next_line;
1261
1262 if (line_len + (conptrend - conbuf) > conbuflen) {
1263 char *oldconbuf = conbuf;
1264 char *oldconptr = conptr;
1265 long tmp = matchptr - conptr;
1266
1267 conbuflen += BUFSIZE;
1268 conbuf = realloc(conbuf, conbuflen + 1);
1269 if (conbuf == NULL) {
1270 (void) fprintf(stderr,
1271 gettext("%s: out of memory\n"),
1272 cmdname);
1273 exit(2);
1274 }
1275
1276 conptr = conbuf + (conptr - oldconbuf);
1277 conptrend = conptr + (conptrend - oldconptr);
1278 if (matchptr)
1279 matchptr = conptr + tmp;
1280 }
1281 (void) memcpy((conptrend > conptr) ?
1282 conptrend + 1 : conptrend, ptr, line_len);
1283 conptrend += line_len + (conptrend > conptr);
1284 *conptrend = '\n';
1285
1286 if (!nvflag == rv) {
1287 /* matched */
1288 if (lastmatch) {
1289 if (conflag & AFTER) {
1290 conaprnt = 1;
1291 nextend = conptrend;
1292 conptrend = conptr + lastmatch;
1293 nextptr = conptrend + 1;
1294 *nextend = '\n';
1295 }
1296 } else {
1297 if (conflag == AFTER) {
1298 conptr = conptrend - (line_len);
1299 linenum = lineno;
1300 blkoffset = line_offset;
1301 }
1302 blkoffset = line_offset -
1303 (conptrend - conptr - line_len);
1304 }
1305
1306 if (conflag == BEFORE)
1307 conbprnt = 1;
1308
1309 lastmatch = conptrend - conptr;
1310 goto L_next_line;
1311 }
1312
1313 if (!lastmatch) {
1314 if (conflag & BEFORE) {
1315 if (conbcnt >= conblen) {
1316 char *tmp = conptr;
1317 conptr = find_nl(conptr,
1318 conptrend - conptr) + 1;
1319 if (bflag)
1320 blkoffset += conptr - tmp;
1321 linenum++;
1322 nearmatch = 1;
1323 } else {
1324 conbcnt++;
1325 }
1326 }
1327 if (conflag == AFTER)
1328 nearmatch = 1;
1329 } else {
1330 if (++conacnt >= conalen && !conaprnt && conalen)
1331 conaprnt = 1;
1332 else
1333 lastmatch = conptrend - conptr;
1334 }
1335
1336 L_next_line:
1337 /*
1338 * Here, if pp points to non-NULL, something has been matched
1339 * to the pattern.
1340 */
1341 if (nvflag == (pp != NULL)) {
1342 matches++;
1343 if (!nextend)
1344 matchptr = conflag ? conptrend : ptrend;
1345 }
1346
1347 /*
1348 * Set up some print context so that we can treat
1349 * single-line matches as a zero-N context.
1350 * Apply CLI flags to each line of the context.
1351 *
1352 * For context, we only print if we both have a match and are
1353 * either at the end of the data stream, or we've previously
1354 * declared that we want to print for a particular context.
1355 */
1356 if (lastmatch && (eof || conaprnt || conbprnt)) {
1357
1358 /*
1359 * We'd normally do this earlier, but we had to
1360 * escape early because we reached the end of the data.
1361 */
1362 if (eof && nextptr)
1363 conptrend = nextend;
1364
1365 prntlen = conptrend - conptr + 1;
1366 prntptrend = prntptr = conptr;
1367 if (conmatches++ && nearmatch && !cflag)
1368 (void) fwrite("--\n", 1, 3, stdout);
1369 } else if (!conflag && nvflag == (pp != NULL)) {
1370 *ptrend = '\n';
1371 prntlen = line_len + 1;
1372 prntptrend = prntptr = ptr;
1373 linenum = lineno;
1374 blkoffset = line_offset;
1375 } else if (eof) {
1376 /* No match and no more data */
1377 goto out;
1378 } else {
1379 /* No match, or we're not done building context */
1380 goto L_skip_line;
1381 }
1382
1383 while ((prntptrend = find_nl(prntptrend+1, prntlen)) != NULL) {
1384
1385 /*
1386 * GNU grep uses '-' for context lines and ':' for
1387 * matching lines, so replicate that here.
1388 */
1389 if (prntptrend == matchptr) {
1390 if (eof && nextptr) {
1391 matchptr = nextend;
1392 nextptr = NULL;
1393 } else {
1394 matchptr = NULL;
1395 }
1396 separate = ':';
1397 } else {
1398 separate = '-';
1399 }
1400
1401 /*
1402 * Handle q, l, and c flags.
1403 */
1404 if (qflag) {
1405 /* no need to continue */
1406 /*
1407 * End of this line is ptrend.
1408 * We have read up to ptr + data_len.
1409 */
1410 off_t pos;
1411 pos = ptr + data_len - (ptrend + 1);
1412 (void) lseek(fd, -pos, SEEK_CUR);
1413 exit(0);
1414 }
1415 if (lflag) {
1416 (void) printf("%s\n", fn);
1417 goto out;
1418 }
1419 if (!cflag) {
1420 if (Hflag || outfn) {
1421 (void) printf("%s%c", fn, separate);
1422 }
1423 if (bflag) {
1424 (void) printf("%lld%c", (offset_t)
1425 (blkoffset / BSIZE), separate);
1426 }
1427 if (nflag) {
1428 (void) printf("%lld%c", linenum,
1429 separate);
1430 }
1431 (void) fwrite(prntptr, 1,
1432 prntptrend - prntptr + 1, stdout);
1433 }
1434 if (ferror(stdout)) {
1435 return (0);
1436 }
1437 linenum++;
1438 prntlen -= prntptrend - prntptr + 1;
1439 blkoffset += prntptrend - prntptr + 1;
1440 prntptr = prntptrend + 1;
1441 }
1442
1443 if (eof)
1444 goto out;
1445
1446 /*
1447 * Update context buffer and variables post-print
1448 */
1449 if (conflag) {
1450 conptr = conbuf;
1451 conaprnt = conbprnt = 0;
1452 nearmatch = 0;
1453 conacnt = conbcnt = 0;
1454
1455 if (nextptr) {
1456 (void) memmove(conbuf, nextptr,
1457 nextend - nextptr + 1);
1458 blkoffset += nextptr - conptrend - 1;
1459 conptrend = conptr + (nextend - nextptr);
1460 matchptr = conptrend;
1461 linenum = lineno;
1462 lastmatch = conptrend - conptr;
1463 } else {
1464 conptrend = conptr;
1465 conacnt = 0;
1466 lastmatch = 0;
1467 }
1468 nextptr = nextend = NULL;
1469 }
1470
1471 L_skip_line:
1472 if (!newlinep)
1473 break;
1474
1475 data_len -= line_len + 1;
1476 line_offset += line_len + 1;
1477 ptr = ptrend + 1;
1478 }
1479
1480 out:
1481 if (cflag) {
1482 if (Hflag || outfn) {
1483 (void) printf("%s:", fn);
1484 }
1485 if (!qflag) {
1486 (void) printf("%lld\n", matches);
1487 }
1488 }
1489 return (matches != 0);
1490 }
1491
1492 /*
1493 * usage message for grep
1494 */
1495 static void
1496 usage(void)
1497 {
1498 if (egrep || fgrep) {
1499 (void) fprintf(stderr, gettext("Usage:\t%s"), cmdname);
1500 (void) fprintf(stderr,
1501 gettext(" [-c|-l|-q] [-r|-R] [-A #|-B #|-C #|-#] "
1502 "[-bhHinsvx] pattern_list [file ...]\n"));
1503
1504 (void) fprintf(stderr, "\t%s", cmdname);
1505 (void) fprintf(stderr,
1506 gettext(" [-c|-l|-q] [-r|-R] [-A #|-B #|-C #|-#] "
1507 "[-bhHinsvx] [-e pattern_list]... "
1508 "[-f pattern_file]... [file...]\n"));
1509 } else {
1510 (void) fprintf(stderr, gettext("Usage:\t%s"), cmdname);
1511 (void) fprintf(stderr,
1512 gettext(" [-c|-l|-q] [-r|-R] [-A #|-B #|-C #|-#] "
1513 "[-bhHinsvx] pattern_list [file ...]\n"));
1514
1515 (void) fprintf(stderr, "\t%s", cmdname);
1516 (void) fprintf(stderr,
1517 gettext(" [-c|-l|-q] [-r|-R] [-A #|-B #|-C #|-#] "
1518 "[-bhHinsvx] [-e pattern_list]... "
1519 "[-f pattern_file]... [file...]\n"));
1520
1521 (void) fprintf(stderr, "\t%s", cmdname);
1522 (void) fprintf(stderr,
1523 gettext(" -E [-c|-l|-q] [-r|-R] [-A #|-B #|-C #|-#] "
1524 "[-bhHinsvx] pattern_list [file ...]\n"));
1525
1526 (void) fprintf(stderr, "\t%s", cmdname);
1527 (void) fprintf(stderr,
1528 gettext(" -E [-c|-l|-q] [-r|-R] [-A #|-B #|-C #|-#] "
1529 "[-bhHinsvx] [-e pattern_list]... "
1530 "[-f pattern_file]... [file...]\n"));
1531
1532 (void) fprintf(stderr, "\t%s", cmdname);
1533 (void) fprintf(stderr,
1534 gettext(" -F [-c|-l|-q] [-r|-R] [-A #|-B #|-C #|-#] "
1535 "[-bhHinsvx] pattern_list [file ...]\n"));
1536
1537 (void) fprintf(stderr, "\t%s", cmdname);
1538 (void) fprintf(stderr,
1539 gettext(" -F [-c|-l|-q] [-A #|-B #|-C #|-#] "
1540 "[-bhHinsvx] [-e pattern_list]... "
1541 "[-f pattern_file]... [file...]\n"));
1542 }
1543 exit(2);
1544 /* NOTREACHED */
1545 }
1546
1547 /*
1548 * Compile literal pattern into BMG tables
1549 */
1550 static void
1551 bmgcomp(char *pat, int len)
1552 {
1553 int i;
1554 int tlen;
1555 unsigned char *uc = (unsigned char *)pat;
1556
1557 bmglen = len;
1558 bmgpat = pat;
1559
1560 for (i = 0; i < M_CSETSIZE; i++) {
1561 bmgtab[i] = len;
1562 }
1563
1564 len--;
1565 for (tlen = len, i = 0; i <= len; i++, tlen--) {
1566 bmgtab[*uc++] = tlen;
1567 }
1568 }
1569
1570 /*
1571 * BMG search.
1572 */
1573 static char *
1574 bmgexec(char *str, char *end)
1575 {
1576 int t;
1577 char *k, *s, *p;
1578
1579 k = str + bmglen - 1;
1580 if (bmglen == 1) {
1581 return (memchr(str, bmgpat[0], end - str));
1582 }
1583 for (; ; ) {
1584 /* inner loop, should be most optimized */
1585 while (k < end && (t = bmgtab[(unsigned char)*k]) != 0) {
1586 k += t;
1587 }
1588 if (k >= end) {
1589 return (NULL);
1590 }
1591 for (s = k, p = bmgpat + bmglen - 1; *--s == *--p; ) {
1592 if (p == bmgpat) {
1593 return (s);
1594 }
1595 }
1596 k++;
1597 }
1598 /* NOTREACHED */
1599 }