1 /*
2 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
5
6 /* Copyright 1984, 1986, 1987, 1988, 1989 AT&T */
7 /* All Rights Reserved */
8
9 /*
10 * Copyright 1980 Regents of the University of California.
11 * All rights reserved. The Berkeley software License Agreement
12 * specifies the terms and conditions for redistribution.
13 */
14
15 #pragma ident "%Z%%M% %I% %E% SMI"
16
/*
 * Get name sections from manual pages.
 *	-t	build table-of-contents entries
 *	-i	build intro entries
 *	(neither option) build entries for the apropos database
 */
23
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <string.h>
28 #include <unistd.h>
29 #include <limits.h>
30 #include <locale.h>
31 #include <wchar.h>
32 #include <errno.h>
33 #include <sys/param.h>
34
/* Length of the "man" prefix on a manual section directory name. */
#define	PLEN	3

static char	path[MAXPATHLEN+1];	/* working directory, filled in by main() */
static int	tocrc;			/* nonzero when -t was given */
static int	intro;			/* nonzero when -i was given */
static char	*progname;		/* argv[0], used in diagnostics */

/* Line clean-up helpers. */
static void	trimln(char *cp);
static void	roff_trim(char *cp);

/* Output helpers. */
static void	doname(char *name);
static void	section(char *name, char *buf);
static void	split(char *line, char *name);
static void	dorefname(char *name);

/* Per-format page parsers. */
static void	troffpage(char *name);
static void	sgmlpage(char *name);
50
/*
 * Decide whether the file behind fp is an SGML manpage.
 *
 * The first bytes of the file are read directly from the underlying
 * descriptor and compared with "<!DOCTYPE"; the descriptor is then
 * rewound to offset 0.  Returns 1 on a match, and 0 on a mismatch, a
 * short read, or a failed rewind.
 */
static int
issgml(FILE *fp)
{
	static const char doctype[] = "<!DOCTYPE";
	const size_t want = sizeof (doctype) - 1;
	char head[sizeof (doctype)];
	int fd = fileno(fp);

	if (read(fd, head, want) != want)
		return (0);
	if (lseek(fd, 0, SEEK_SET) != 0)
		return (0);

	return (strncmp(doctype, head, want) == 0 ? 1 : 0);
}
67
68 int
69 main(int argc, char *argv[])
70 {
71 int c;
72
73 (void) setlocale(LC_ALL, "");
74
75 progname = argv[0];
76
77 while ((c = getopt(argc, argv, "it")) != EOF)
78 switch (c) {
79 case 't':
80 tocrc++;
81 break;
82 case 'i':
83 intro++;
84 break;
85 case '?':
86 default:
87 (void) fprintf(stderr,
88 "usage: %s [-i][-t] files..\n", progname);
89 exit(1);
90 }
91
92 if (getcwd(path, sizeof (path)) == NULL) {
93 (void) fprintf(stderr, "%s: getcwd: %s\n", progname, path);
94 exit(1);
95 }
96
97 for (; optind < argc; optind++) {
98 char *name = argv[optind];
99
100 if (freopen(name, "r", stdin) == 0) {
101 (void) fprintf(stderr,
102 "%s: %s: %s\n", progname, name, strerror(errno));
103 continue;
104 }
105
106 /*
107 * Most of the info we care about is in the first kbyte
108 */
109 (void) setvbuf(stdin, NULL, _IOFBF, 1024);
110
111 if (issgml(stdin))
112 sgmlpage(name);
113 else
114 troffpage(name);
115 }
116
117 return (0);
118 }
119
120 /*
121 * Parse a troff-format manpage
122 */
123 static void
124 troffpage(char *name)
125 {
126 char headbuf[BUFSIZ];
127 char linbuf[BUFSIZ];
128 char *strptr;
129 int i = 0;
130
131 for (;;) {
132 if (fgets(headbuf, sizeof (headbuf), stdin) == NULL)
133 return;
134 if (headbuf[0] != '.')
135 continue;
136 if (headbuf[1] == 'T' && headbuf[2] == 'H')
137 break;
138 if (headbuf[1] == 't' && headbuf[2] == 'h')
139 break;
140 }
141 for (;;) {
142 if (fgets(linbuf, sizeof (linbuf), stdin) == NULL)
143 return;
144 if (linbuf[0] != '.')
145 continue;
146 if (linbuf[1] == 'S' && linbuf[2] == 'H')
147 break;
148 if (linbuf[1] == 's' && linbuf[2] == 'h')
149 break;
150 }
151 trimln(headbuf);
152 if (tocrc)
153 doname(name);
154 if (!intro)
155 section(name, headbuf);
156 for (;;) {
157 if (fgets(linbuf, sizeof (linbuf), stdin) == NULL)
158 break;
159 if (linbuf[0] == '.') {
160 if (linbuf[1] == 'S' && linbuf[2] == 'H')
161 break;
162 if (linbuf[1] == 's' && linbuf[2] == 'h')
163 break;
164 if (linbuf[1] == '\\' && linbuf[2] == '"')
165 continue;
166 }
167 trimln(linbuf);
168 roff_trim(linbuf);
169 if (intro) {
170 split(linbuf, name);
171 continue;
172 }
173 if (i != 0)
174 (void) printf(" ");
175 i++;
176 (void) printf("%s", linbuf);
177 }
178 (void) printf("\n");
179 }
180
181
182 /*
183 * Substitute section defined in page with new section spec
184 * of the form xx/yy where xx is the section suffix of the
185 * directory and yy is the filename extension (unless xx
186 * and yy are equal, in which case xx is the section).
187 * Pages should be placed in their proper directory with the
188 * proper name to simplify things.
189 *
190 * For example take the following names:
191 * man1/ar.1v (1/1V)
192 * man1/find.1 (1)
193 * man1/loco (1/)
194 *
195 */
196 static void
197 section(char *name, char *buf)
198 {
199 char scratch[MAXPATHLEN+1];
200 char *p = buf;
201 char *dir, *fname;
202 char *dp, *np;
203 int i;
204 int plen = PLEN;
205
206 /*
207 * split dirname and filename
208 */
209 (void) strcpy(scratch, name);
210 if ((fname = strrchr(scratch, '/')) == NULL) {
211 fname = name;
212 dir = path;
213 } else {
214 dir = scratch;
215 *fname = 0;
216 fname++;
217 }
218 dp = strrchr(dir, '/');
219
220 if (*(dp+1) == 's')
221 plen = PLEN + 1;
222
223 if (dp != NULL) {
224 dp = dp+plen+1;
225 } else {
226 dp = dir+plen;
227 }
228 np = strrchr(fname, '.');
229 if (np != NULL) {
230 ++np;
231 } else {
232 np = "";
233 }
234 for (i = 0; i < 2; i++) {
235 while (*p && *p != ' ' && *p != '\t')
236 p++;
237 if (!*p)
238 break;
239 while (*p && (*p == ' ' || *p == '\t'))
240 p++;
241 if (!*p)
242 break;
243 }
244 *p++ = 0;
245 (void) printf("%s", buf);
246 if (strcmp(np, dp) == 0)
247 (void) printf("%s", dp);
248 else
249 (void) printf("%s/%s", dp, np);
250 while (*p && *p != ' ' && *p != '\t')
251 p++;
252 (void) printf("%s\t", p);
253 }
254
/*
 * Remove one trailing newline from the string cp, in place.
 * A string without a trailing newline, including the empty string,
 * is left untouched.
 */
static void
trimln(char *cp)
{
	size_t len = strlen(cp);

	/* the length test keeps an empty string from reading cp[-1] */
	if (len > 0 && cp[len - 1] == '\n')
		cp[len - 1] = '\0';
}
263
/*
 * Strip roff markup from the line cp, in place.
 *
 * When the line starts with '.', everything up to and including the
 * first blank is removed; a request line without any blank becomes
 * empty.  In the rest of the line the font escapes \fN (N a digit) and
 * \f(XX are removed.  Other escapes, including \fB and friends, are
 * left alone.
 *
 * The line is compacted with a read and a write cursor, so no
 * overlapping copy is needed and every escape is examined exactly once
 * against the original text.
 */
static void
roff_trim(char *cp)
{
	char *src = cp;
	char *dst = cp;

	if (*src == '.') {
		while (*src != ' ' && *src != '\0')
			src++;
		if (*src == ' ')
			src++;
	}

	while (*src != '\0') {
		if (src[0] == '\\' && src[1] == 'f') {
			if (src[2] >= '0' && src[2] <= '9') {
				src += 3;
				continue;
			}
			if (src[2] == '(') {
				size_t skip = 3;

				/* \f(XX is five long unless the line ends */
				while (skip < 5 && src[skip] != '\0')
					skip++;
				src += skip;
				continue;
			}
		}
		*dst++ = *src++;
	}
	*dst = '\0';
}
285
/*
 * Print name on stdout in "page(ext) " form: the part of name in front
 * of its last '.', then the part after that '.' in parentheses, then a
 * blank.  A name without any '.' is printed whole, followed by "() ".
 */
static void
doname(char *name)
{
	const char *dot = strrchr(name, '.');
	const char *stop = (dot != NULL) ? dot : name + strlen(name);

	(void) fwrite(name, 1, (size_t)(stop - name), stdout);
	(void) putchar('(');
	if (dot != NULL)
		(void) fputs(dot + 1, stdout);
	(void) fputs(") ", stdout);
}
308
/*
 * Break a "name1, name2 \- description" line into one entry per name.
 *
 * line is cut at its first '-': blanks, tabs and backslashes in front
 * of the '-' are dropped from the name list, and blanks and tabs after
 * it are dropped from the description.  The name list is then split at
 * commas.  For each name the output is
 *	name <tab> reference name <tab> description
 * where the reference name is produced by dorefname(name).  Entries are
 * separated by newlines; no newline follows the last one.  A line
 * without '-' produces no output.  line is modified in place.
 *
 * Both backward scans are bounded by the start of the text they trim,
 * so a leading '-' or ',' cannot walk in front of the buffer.
 */
static void
split(char *line, char *name)
{
	char *cp, *dp;
	char *sp, *sep;

	cp = strchr(line, '-');
	if (cp == NULL)
		return;
	sp = cp + 1;

	/* end the name list after its last significant character */
	while (cp > line &&
	    (cp[-1] == ' ' || cp[-1] == '\t' || cp[-1] == '\\'))
		cp--;
	*cp = '\0';

	while (*sp == ' ' || *sp == '\t')
		sp++;

	for (sep = "", dp = line; dp && *dp; dp = cp, sep = "\n") {
		cp = strchr(dp, ',');
		if (cp != NULL) {
			char *tp = cp;

			/* trim the name, then step to the next one */
			while (tp > dp && (tp[-1] == ' ' || tp[-1] == '\t'))
				tp--;
			*tp = '\0';
			for (++cp; *cp == ' ' || *cp == '\t'; cp++)
				;
		}
		(void) printf("%s%s\t", sep, dp);
		dorefname(name);
		(void) printf("\t%s", sp);
	}
}
340
/*
 * Print name on stdout as a reference name: the part of name in front
 * of its last '.', a '.', and the part after it.  A name without any
 * '.' is printed whole, followed by a '.'.
 */
static void
dorefname(char *name)
{
	const char *dot = strrchr(name, '.');
	const char *stop = (dot != NULL) ? dot : name + strlen(name);

	(void) fwrite(name, 1, (size_t)(stop - name), stdout);
	(void) putchar('.');
	if (dot != NULL)
		(void) fputs(dot + 1, stdout);
}
361
362 /*
363 * The rest of the routines in the file form a simplistic parser
364 * for SGML manpages. We assume the input is syntactically correct
365 * SGML, and that the fields occur in the input file in order.
366 */
367
/*
 * A growable wide character string and the constants that size it.
 */
typedef struct {
	wchar_t	*str;	/* heap buffer holding the characters */
	size_t	size;	/* number of bytes allocated for str */
	long	index;	/* element of str that putwstring() fills next */
} string_t;

#define	DEF_STR_SIZE	16	/* smallest initial allocation, in bytes */
#define	DEF_STR_GROWTH	16	/* bytes added when putwstring() grows str */
380
381 static void
382 outofspace(char *where)
383 {
384 (void) fprintf(stderr, "%s: '%s' - out of memory\n", progname, where);
385 exit(1);
386 }
387
388 static string_t *
389 newstring(size_t initial)
390 {
391 string_t *s = malloc(sizeof (*s));
392
393 if (s == NULL)
394 outofspace("new s");
395
396 initial *= sizeof (wchar_t);
397 if (initial < DEF_STR_SIZE)
398 initial = DEF_STR_SIZE;
399
400 s->str = malloc(initial);
401 if (s->str == NULL)
402 outofspace("new str");
403
404 s->size = initial;
405 s->index = 0;
406 *s->str = L'\0';
407 return (s);
408 }
409
410 static void
411 delstring(string_t **s)
412 {
413 free((*s)->str);
414 (*s)->str = NULL;
415 free(*s);
416 *s = NULL;
417 }
418
419 static wchar_t *
420 getwstring(string_t *s)
421 {
422 static const wchar_t wnull = L'\0';
423
424 if (s)
425 return (s->str);
426 return ((wchar_t *)&wnull);
427 }
428
429 static char *
430 getcstring(string_t *s)
431 {
432 size_t len = (wcslen(s->str) + 1) * MB_CUR_MAX;
433 char *cstr = malloc(len);
434 char *p = cstr;
435 wchar_t *wp = s->str;
436
437 if (p == NULL)
438 outofspace("getc");
439 while (*wp)
440 p += wctomb(p, *wp++);
441 *p = '\0';
442 return (cstr);
443 }
444
445 static void
446 appendwstring(string_t *s, const wchar_t *str)
447 {
448 size_t len = wcslen(str) + 1;
449
450 s->size += sizeof (wchar_t) * len;
451 s->str = realloc(s->str, s->size);
452 if (s->str == NULL)
453 outofspace("appendw");
454 (void) wcscat(s->str, str);
455 s->index = wcslen(s->str) + 1;
456 }
457
458 static void
459 putwstring(string_t *s, wchar_t wc)
460 {
461 if ((s->index + 1) * sizeof (wchar_t) >= s->size) {
462 s->size += DEF_STR_GROWTH;
463 s->str = realloc(s->str, s->size);
464 if (s->str == NULL)
465 outofspace("put");
466 }
467 s->str[s->index++] = wc;
468 }
469
/*
 * Consume stdin up to the '>' that closes an SGML comment block whose
 * opening '<' has already been read.  Every further '<' must be paired
 * with its own '>' before the block counts as closed.  Input is read
 * as wide characters; end of input stops the scan.
 */
static void
eatcomments(void)
{
	int depth = 1;
	wint_t wc;

	while (depth != 0) {
		wc = getwchar();
		if (wc == WEOF)
			return;
		if (wc == L'<')
			depth++;
		else if (wc == L'>')
			depth--;
	}
}
493
/*
 * Find the next token on stdin.
 *
 * Input is skipped up to the next '<'.  A "<!" block is consumed with
 * eatcomments() and the search starts over.  Otherwise the bytes up to
 * the closing '>' are stored in tokbuf, and any newlines directly after
 * the '>' are removed from the stream.
 *
 * tokbuf receives at most tokbuflen - 1 bytes plus a terminating null;
 * a longer token is silently truncated.  Returns 1 when a token was
 * stored and 0 on end of input or on an illegal byte sequence.
 *
 * NOTE(review): the '<' search uses getwchar() while the token body
 * uses getchar() on the same stream.  ISO C does not allow byte input
 * on a wide-oriented stream, so this relies on the C library
 * tolerating the mix - confirm on the target platform.
 */
static int
find_token(char *tokbuf, size_t tokbuflen)
{
	int c;
	wint_t wc;
	size_t len = 0;

	for (;;) {
		while ((wc = getwchar()) != WEOF)
			if (wc == L'<')
				break;

		if (wc == WEOF && errno == EILSEQ)
			return (0);

		c = getchar();
		if (c == EOF)
			return (0);
		if (c != '!') {
			(void) ungetc(c, stdin);
			break;
		}
		eatcomments();
	}

	while ((c = getchar()) != EOF) {
		if (c == '>') {
			while ((c = getchar()) != EOF)
				if (c != '\n') {
					(void) ungetc(c, stdin);
					break;
				}
			if (tokbuflen > 0)
				tokbuf[len] = '\0';
			return (1);
		}
		/* always keep one byte in reserve for the terminator */
		if (len + 1 < tokbuflen)
			tokbuf[len++] = (char)c;
	}

	return (0);
}
543
544 /*
545 * This structure is filled out during the parsing of each page we encounter
546 */
547 typedef struct {
548 char *name;
549 string_t *title;
550 string_t *volnum;
551 string_t *date;
552 string_t *names;
553 string_t *purpose;
554 } manpage_t;
555
556 static void
557 warning(manpage_t *m, const char *fmt, ...)
558 {
559 va_list ap;
560 va_start(ap, fmt);
561 (void) fprintf(stderr, "%s: %s - ", progname, m->name);
562 (void) vfprintf(stderr, fmt, ap);
563 va_end(ap);
564 }
565
566 /*
567 * Fetch a string from stdin, terminated by the endtoken.
568 * These strings may be localized, so do this with wide characters.
569 * Hack: skip over (completely ignore) all other tokens
570 * Hack: map all &blort; constructs to spaces.
571 */
572 static string_t *
573 filestring(manpage_t *m, size_t initial, char *endtoken)
574 {
575 char tokbuf[BUFSIZ * MB_LEN_MAX];
576 string_t *s = newstring(initial);
577 wint_t wc;
578
579 while ((wc = getwchar()) != WEOF)
580 switch (wc) {
581 case L'\n':
582 if ((wc = getwchar()) != WEOF)
583 (void) ungetwc(wc, stdin);
584 if (wc != L'<')
585 putwstring(s, L' ');
586 break;
587 case L'<':
588 (void) ungetwc(wc, stdin);
589 if (!find_token(tokbuf, sizeof (tokbuf)) ||
590 strcasecmp(endtoken, tokbuf) == 0)
591 goto done;
592 break;
593 case L'&':
594 while ((wc = getwchar()) != WEOF)
595 if (wc == L';')
596 break;
597 wc = L' ';
598 /* FALLTHROUGH */
599 default:
600 putwstring(s, wc);
601 break;
602 }
603
604 if (errno == EILSEQ)
605 warning(m, "%s while parsing %s\n", strerror(errno), endtoken);
606 done:
607 putwstring(s, L'\0');
608 return (s);
609 }
610
611 /*
612 * <refentrytitle> TITLE </refentrytitle>
613 */
614 static int
615 refentrytitle(manpage_t *m)
616 {
617 if (m->title != NULL)
618 warning(m, "repeated refentrytitle\n");
619 m->title = filestring(m, 8, "/refentrytitle");
620 return (1);
621 }
622
623 /*
624 * <manvolnum> MANVOLNUM </manvolnum>
625 */
626 static int
627 manvolnum(manpage_t *m)
628 {
629 if (m->volnum != NULL)
630 warning(m, "repeated manvolnum\n");
631 m->volnum = filestring(m, 3, "/manvolnum");
632 return (1);
633 }
634
635 /*
636 * <refmiscinfo class="date"> DATE </refmiscinfo>
637 */
638 static int
639 refmiscinfo_date(manpage_t *m)
640 {
641 if (m->date != NULL)
642 warning(m, "repeated date\n");
643 m->date = filestring(m, 11, "/refmiscinfo");
644 return (1);
645 }
646
647 /*
648 * .. </refmeta>
649 */
650 static int
651 print_refmeta(manpage_t *m)
652 {
653 char headbuf[BUFSIZ];
654
655 (void) snprintf(headbuf, sizeof (headbuf), ".TH %ws %ws \"%ws\"",
656 getwstring(m->title), getwstring(m->volnum), getwstring(m->date));
657
658 trimln(headbuf);
659 if (tocrc)
660 doname(m->name);
661 if (!intro)
662 section(m->name, headbuf);
663
664 if (m->title)
665 delstring(&m->title);
666 if (m->volnum)
667 delstring(&m->volnum);
668 if (m->date)
669 delstring(&m->date);
670
671 return (1);
672 }
673
674 static int
675 appendname(manpage_t *m, char *term)
676 {
677 string_t *r = filestring(m, 0, term);
678
679 if (m->names) {
680 appendwstring(m->names, L", ");
681 appendwstring(m->names, getwstring(r));
682 delstring(&r);
683 } else
684 m->names = r;
685 return (1);
686 }
687
688 /*
689 * <refdescriptor> REFDESCRIPTOR </refdescriptor>
690 */
691 static int
692 refdescriptor(manpage_t *m)
693 {
694 return (appendname(m, "/refdescriptor"));
695 }
696
697 /*
698 * <refname> REFNAME </refname>
699 */
700 static int
701 refname(manpage_t *m)
702 {
703 return (appendname(m, "/refname"));
704 }
705
706 /*
707 * <refpurpose> PURPOSE </refpurpose>
708 */
709 static int
710 refpurpose(manpage_t *m)
711 {
712 if (m->purpose != NULL)
713 warning(m, "repeated refpurpose\n");
714 m->purpose = filestring(m, 0, "/refpurpose");
715 return (1);
716 }
717
718 /*
719 * .. </refnamediv> - this is our chance to bail out.
720 */
721 static int
722 terminate(manpage_t *m)
723 {
724 if (m->names) {
725 appendwstring(m->names, L" \\- ");
726 appendwstring(m->names, getwstring(m->purpose));
727 if (intro) {
728 char *buf = getcstring(m->names);
729 split(buf, m->name);
730 free(buf);
731 } else
732 (void) printf("%ws", getwstring(m->names));
733 }
734
735 if (m->names)
736 delstring(&m->names);
737 if (m->purpose)
738 delstring(&m->purpose);
739
740 (void) printf("\n");
741 return (0);
742 }
743
744
745 /*
746 * Basic control structure of the SGML "parser".
747 * It's very simplistic - when named tags are encountered in the
748 * input stream, control is transferred to the corresponding routine.
749 * No checking is done for correct pairing of tags. A few other hacks
750 * are sneaked into the lexical routines above.
751 * Output is generated after seeing the /refmeta and /refnamediv
752 * closing tags.
753 */
754 static const struct {
755 char *name;
756 int (*action)(manpage_t *);
757 } acts[] = {
758 { "refentrytitle", refentrytitle },
759 { "manvolnum", manvolnum },
760 { "refmiscinfo class=\"date\"", refmiscinfo_date },
761 { "/refmeta", print_refmeta },
762 { "refdescriptor", refdescriptor },
763 { "refname", refname },
764 { "refpurpose", refpurpose },
765 { "/refnamediv", terminate },
766 { 0 }
767 };
768
769 static void
770 sgmlpage(char *name)
771 {
772 int rc = 1, a;
773 char tokbuf[BUFSIZ];
774 manpage_t manpage, *m = &manpage;
775
776 (void) memset(m, 0, sizeof (*m));
777 m->name = name;
778
779 do {
780 if (!find_token(tokbuf, sizeof (tokbuf)))
781 break;
782 for (a = 0; acts[a].name; a++) {
783 if (strcasecmp(acts[a].name, tokbuf) != 0)
784 continue;
785 rc = acts[a].action(m);
786 break;
787 }
788 } while (rc);
789 }