/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Copyright 1980 Regents of the University of California.
 * All rights reserved. The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Get name sections from manual pages.
 *	-t	for building toc
 *	-i	for building intro entries
 *	other	apropos database
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <limits.h>
#include <locale.h>
#include <wchar.h>
#include <errno.h>
#include <sys/param.h>

#define	PLEN	3	/* prefix length "man" */

static char path[MAXPATHLEN+1];
static int tocrc;
static int intro;
static char *progname;

static void trimln(char *);
static void roff_trim(char *cp);
static void doname(char *);
static void section(char *, char *);
static void split(char *, char *);
static void dorefname(char *);
static void troffpage(char *);
static void sgmlpage(char *);

/*
 * Test to see if this is an SGML manpage or a regular manpage
 * Unless the first line begins with <!DOCTYPE, we assume it isn't.
54 */ 55 static int 56 issgml(FILE *fp) 57 { 58 static const char magic[] = "<!DOCTYPE"; 59 char buf[sizeof (magic)]; 60 size_t n = sizeof (magic) - 1; 61 62 if (read(fileno(fp), buf, n) != n || 63 lseek(fileno(fp), 0, SEEK_SET) != 0) 64 return (0); 65 return (strncmp(magic, buf, n) == 0); 66 } 67 68 int 69 main(int argc, char *argv[]) 70 { 71 int c; 72 73 (void) setlocale(LC_ALL, ""); 74 75 progname = argv[0]; 76 77 while ((c = getopt(argc, argv, "it")) != EOF) 78 switch (c) { 79 case 't': 80 tocrc++; 81 break; 82 case 'i': 83 intro++; 84 break; 85 case '?': 86 default: 87 (void) fprintf(stderr, 88 "usage: %s [-i][-t] files..\n", progname); 89 exit(1); 90 } 91 92 if (getcwd(path, sizeof (path)) == NULL) { 93 (void) fprintf(stderr, "%s: getcwd: %s\n", progname, path); 94 exit(1); 95 } 96 97 for (; optind < argc; optind++) { 98 char *name = argv[optind]; 99 100 if (freopen(name, "r", stdin) == 0) { 101 (void) fprintf(stderr, 102 "%s: %s: %s\n", progname, name, strerror(errno)); 103 continue; 104 } 105 106 /* 107 * Most of the info we care about is in the first kbyte 108 */ 109 (void) setvbuf(stdin, NULL, _IOFBF, 1024); 110 111 if (issgml(stdin)) 112 sgmlpage(name); 113 else 114 troffpage(name); 115 } 116 117 return (0); 118 } 119 120 /* 121 * Parse a troff-format manpage 122 */ 123 static void 124 troffpage(char *name) 125 { 126 char headbuf[BUFSIZ]; 127 char linbuf[BUFSIZ]; 128 char *strptr; 129 int i = 0; 130 131 for (;;) { 132 if (fgets(headbuf, sizeof (headbuf), stdin) == NULL) 133 return; 134 if (headbuf[0] != '.') 135 continue; 136 if (headbuf[1] == 'T' && headbuf[2] == 'H') 137 break; 138 if (headbuf[1] == 't' && headbuf[2] == 'h') 139 break; 140 } 141 for (;;) { 142 if (fgets(linbuf, sizeof (linbuf), stdin) == NULL) 143 return; 144 if (linbuf[0] != '.') 145 continue; 146 if (linbuf[1] == 'S' && linbuf[2] == 'H') 147 break; 148 if (linbuf[1] == 's' && linbuf[2] == 'h') 149 break; 150 } 151 trimln(headbuf); 152 if (tocrc) 153 doname(name); 154 if (!intro) 155 
section(name, headbuf); 156 for (;;) { 157 if (fgets(linbuf, sizeof (linbuf), stdin) == NULL) 158 break; 159 if (linbuf[0] == '.') { 160 if (linbuf[1] == 'S' && linbuf[2] == 'H') 161 break; 162 if (linbuf[1] == 's' && linbuf[2] == 'h') 163 break; 164 if (linbuf[1] == '\\' && linbuf[2] == '"') 165 continue; 166 } 167 trimln(linbuf); 168 roff_trim(linbuf); 169 if (intro) { 170 split(linbuf, name); 171 continue; 172 } 173 if (i != 0) 174 (void) printf(" "); 175 i++; 176 (void) printf("%s", linbuf); 177 } 178 (void) printf("\n"); 179 } 180 181 182 /* 183 * Substitute section defined in page with new section spec 184 * of the form xx/yy where xx is the section suffix of the 185 * directory and yy is the filename extension (unless xx 186 * and yy are equal, in which case xx is the section). 187 * Pages should be placed in their proper directory with the 188 * proper name to simplify things. 189 * 190 * For example take the following names: 191 * man1/ar.1v (1/1V) 192 * man1/find.1 (1) 193 * man1/loco (1/) 194 * 195 */ 196 static void 197 section(char *name, char *buf) 198 { 199 char scratch[MAXPATHLEN+1]; 200 char *p = buf; 201 char *dir, *fname; 202 char *dp, *np; 203 int i; 204 int plen = PLEN; 205 206 /* 207 * split dirname and filename 208 */ 209 (void) strcpy(scratch, name); 210 if ((fname = strrchr(scratch, '/')) == NULL) { 211 fname = name; 212 dir = path; 213 } else { 214 dir = scratch; 215 *fname = 0; 216 fname++; 217 } 218 dp = strrchr(dir, '/'); 219 220 if (*(dp+1) == 's') 221 plen = PLEN + 1; 222 223 if (dp != NULL) { 224 dp = dp+plen+1; 225 } else { 226 dp = dir+plen; 227 } 228 np = strrchr(fname, '.'); 229 if (np != NULL) { 230 ++np; 231 } else { 232 np = ""; 233 } 234 for (i = 0; i < 2; i++) { 235 while (*p && *p != ' ' && *p != '\t') 236 p++; 237 if (!*p) 238 break; 239 while (*p && (*p == ' ' || *p == '\t')) 240 p++; 241 if (!*p) 242 break; 243 } 244 *p++ = 0; 245 (void) printf("%s", buf); 246 if (strcmp(np, dp) == 0) 247 (void) printf("%s", dp); 248 
else 249 (void) printf("%s/%s", dp, np); 250 while (*p && *p != ' ' && *p != '\t') 251 p++; 252 (void) printf("%s\t", p); 253 } 254 255 static void 256 trimln(char *cp) 257 { 258 while (*cp) 259 cp++; 260 if (*--cp == '\n') 261 *cp = 0; 262 } 263 264 static void 265 roff_trim(char *cp) 266 { 267 if (*cp == '.') { 268 while ((*cp != ' ') && (*cp != '\0')) { 269 strcpy(cp, cp+1); 270 } 271 strcpy(cp, cp+1); 272 } 273 while (*cp) { 274 if (strncmp(cp, "\\f", 2) == 0) { 275 if ((*(cp+2) >= 48) && (*(cp+2) <= 57)) { 276 strcpy(cp, cp+3); 277 } 278 if (*(cp+2) == '(') { 279 strcpy(cp, cp+5); 280 } 281 } 282 cp++; 283 } 284 } 285 286 static void 287 doname(char *name) 288 { 289 char *dp = name, *ep; 290 291 again: 292 while (*dp && *dp != '.') 293 (void) putchar(*dp++); 294 if (*dp) 295 for (ep = dp+1; *ep; ep++) 296 if (*ep == '.') { 297 (void) putchar(*dp++); 298 goto again; 299 } 300 (void) putchar('('); 301 if (*dp) 302 dp++; 303 while (*dp) 304 (void) putchar(*dp++); 305 (void) putchar(')'); 306 (void) putchar(' '); 307 } 308 309 static void 310 split(char *line, char *name) 311 { 312 char *cp, *dp; 313 char *sp, *sep; 314 315 cp = strchr(line, '-'); 316 if (cp == 0) 317 return; 318 sp = cp + 1; 319 for (--cp; *cp == ' ' || *cp == '\t' || *cp == '\\'; cp--) 320 ; 321 *++cp = '\0'; 322 while (*sp && (*sp == ' ' || *sp == '\t')) 323 sp++; 324 for (sep = "", dp = line; dp && *dp; dp = cp, sep = "\n") { 325 cp = strchr(dp, ','); 326 if (cp) { 327 char *tp; 328 329 for (tp = cp - 1; *tp == ' ' || *tp == '\t'; tp--) 330 ; 331 *++tp = '\0'; 332 for (++cp; *cp == ' ' || *cp == '\t'; cp++) 333 ; 334 } 335 (void) printf("%s%s\t", sep, dp); 336 dorefname(name); 337 (void) printf("\t%s", sp); 338 } 339 } 340 341 static void 342 dorefname(char *name) 343 { 344 char *dp = name, *ep; 345 346 again: 347 while (*dp && *dp != '.') 348 (void) putchar(*dp++); 349 if (*dp) 350 for (ep = dp+1; *ep; ep++) 351 if (*ep == '.') { 352 (void) putchar(*dp++); 353 goto again; 354 } 355 (void) 
putchar('.'); 356 if (*dp) 357 dp++; 358 while (*dp) 359 (void) putchar(*dp++); 360 } 361 362 /* 363 * The rest of the routines in the file form a simplistic parser 364 * for SGML manpages. We assume the input is syntactically correct 365 * SGML, and that the fields occur in the input file in order. 366 */ 367 368 /* 369 * Some utilities for constructing arbitrary length wide character strings 370 */ 371 372 typedef struct { 373 wchar_t *str; 374 size_t size; 375 long index; 376 } string_t; 377 378 #define DEF_STR_SIZE 16 379 #define DEF_STR_GROWTH 16 380 381 static void 382 outofspace(char *where) 383 { 384 (void) fprintf(stderr, "%s: '%s' - out of memory\n", progname, where); 385 exit(1); 386 } 387 388 static string_t * 389 newstring(size_t initial) 390 { 391 string_t *s = malloc(sizeof (*s)); 392 393 if (s == NULL) 394 outofspace("new s"); 395 396 initial *= sizeof (wchar_t); 397 if (initial < DEF_STR_SIZE) 398 initial = DEF_STR_SIZE; 399 400 s->str = malloc(initial); 401 if (s->str == NULL) 402 outofspace("new str"); 403 404 s->size = initial; 405 s->index = 0; 406 *s->str = L'\0'; 407 return (s); 408 } 409 410 static void 411 delstring(string_t **s) 412 { 413 free((*s)->str); 414 (*s)->str = NULL; 415 free(*s); 416 *s = NULL; 417 } 418 419 static wchar_t * 420 getwstring(string_t *s) 421 { 422 static const wchar_t wnull = L'\0'; 423 424 if (s) 425 return (s->str); 426 return ((wchar_t *)&wnull); 427 } 428 429 static char * 430 getcstring(string_t *s) 431 { 432 size_t len = (wcslen(s->str) + 1) * MB_CUR_MAX; 433 char *cstr = malloc(len); 434 char *p = cstr; 435 wchar_t *wp = s->str; 436 437 if (p == NULL) 438 outofspace("getc"); 439 while (*wp) 440 p += wctomb(p, *wp++); 441 *p = '\0'; 442 return (cstr); 443 } 444 445 static void 446 appendwstring(string_t *s, const wchar_t *str) 447 { 448 size_t len = wcslen(str) + 1; 449 450 s->size += sizeof (wchar_t) * len; 451 s->str = realloc(s->str, s->size); 452 if (s->str == NULL) 453 outofspace("appendw"); 454 (void) 
wcscat(s->str, str); 455 s->index = wcslen(s->str) + 1; 456 } 457 458 static void 459 putwstring(string_t *s, wchar_t wc) 460 { 461 if ((s->index + 1) * sizeof (wchar_t) >= s->size) { 462 s->size += DEF_STR_GROWTH; 463 s->str = realloc(s->str, s->size); 464 if (s->str == NULL) 465 outofspace("put"); 466 } 467 s->str[s->index++] = wc; 468 } 469 470 /* 471 * Find the closing > of an SGML comment block 472 * (allowing for multibyte, embedded, comments) 473 */ 474 static void 475 eatcomments(void) 476 { 477 int pending = 1; 478 479 while (pending) 480 switch (getwchar()) { 481 default: 482 break; 483 case L'<': 484 pending++; 485 break; 486 case L'>': 487 pending--; 488 break; 489 case WEOF: 490 return; 491 } 492 } 493 494 /* 495 * Find the next token on stdin. 496 * Handles nested comment strings, and removes any trailing newlines 497 * from the stream after the closing '>'. 498 */ 499 static int 500 find_token(char *tokbuf, size_t tokbuflen) 501 { 502 int c; 503 wint_t wc; 504 char *tokp; 505 506 top: 507 while ((wc = getwchar()) != WEOF) 508 if (wc == L'<') 509 break; 510 511 if (wc == WEOF && errno == EILSEQ) 512 return (0); 513 514 switch (c = getchar()) { 515 case EOF: 516 return (0); 517 default: 518 (void) ungetc(c, stdin); 519 break; 520 case '!': 521 eatcomments(); 522 goto top; 523 } 524 525 tokp = tokbuf; 526 527 while ((c = getchar()) != EOF) { 528 if (c == '>') { 529 while ((c = getchar()) != EOF) 530 if (c != '\n') { 531 (void) ungetc(c, stdin); 532 break; 533 } 534 *tokp = '\0'; 535 return (1); 536 } 537 if (tokp - tokbuf < tokbuflen) 538 *tokp++ = (char)c; 539 } 540 541 return (0); 542 } 543 544 /* 545 * This structure is filled out during the parsing of each page we encounter 546 */ 547 typedef struct { 548 char *name; 549 string_t *title; 550 string_t *volnum; 551 string_t *date; 552 string_t *names; 553 string_t *purpose; 554 } manpage_t; 555 556 static void 557 warning(manpage_t *m, const char *fmt, ...) 
558 { 559 va_list ap; 560 va_start(ap, fmt); 561 (void) fprintf(stderr, "%s: %s - ", progname, m->name); 562 (void) vfprintf(stderr, fmt, ap); 563 va_end(ap); 564 } 565 566 /* 567 * Fetch a string from stdin, terminated by the endtoken. 568 * These strings may be localized, so do this with wide characters. 569 * Hack: skip over (completely ignore) all other tokens 570 * Hack: map all &blort; constructs to spaces. 571 */ 572 static string_t * 573 filestring(manpage_t *m, size_t initial, char *endtoken) 574 { 575 char tokbuf[BUFSIZ * MB_LEN_MAX]; 576 string_t *s = newstring(initial); 577 wint_t wc; 578 579 while ((wc = getwchar()) != WEOF) 580 switch (wc) { 581 case L'\n': 582 if ((wc = getwchar()) != WEOF) 583 (void) ungetwc(wc, stdin); 584 if (wc != L'<') 585 putwstring(s, L' '); 586 break; 587 case L'<': 588 (void) ungetwc(wc, stdin); 589 if (!find_token(tokbuf, sizeof (tokbuf)) || 590 strcasecmp(endtoken, tokbuf) == 0) 591 goto done; 592 break; 593 case L'&': 594 while ((wc = getwchar()) != WEOF) 595 if (wc == L';') 596 break; 597 wc = L' '; 598 /* FALLTHROUGH */ 599 default: 600 putwstring(s, wc); 601 break; 602 } 603 604 if (errno == EILSEQ) 605 warning(m, "%s while parsing %s\n", strerror(errno), endtoken); 606 done: 607 putwstring(s, L'\0'); 608 return (s); 609 } 610 611 /* 612 * <refentrytitle> TITLE </refentrytitle> 613 */ 614 static int 615 refentrytitle(manpage_t *m) 616 { 617 if (m->title != NULL) 618 warning(m, "repeated refentrytitle\n"); 619 m->title = filestring(m, 8, "/refentrytitle"); 620 return (1); 621 } 622 623 /* 624 * <manvolnum> MANVOLNUM </manvolnum> 625 */ 626 static int 627 manvolnum(manpage_t *m) 628 { 629 if (m->volnum != NULL) 630 warning(m, "repeated manvolnum\n"); 631 m->volnum = filestring(m, 3, "/manvolnum"); 632 return (1); 633 } 634 635 /* 636 * <refmiscinfo class="date"> DATE </refmiscinfo> 637 */ 638 static int 639 refmiscinfo_date(manpage_t *m) 640 { 641 if (m->date != NULL) 642 warning(m, "repeated date\n"); 643 m->date = 
filestring(m, 11, "/refmiscinfo"); 644 return (1); 645 } 646 647 /* 648 * .. </refmeta> 649 */ 650 static int 651 print_refmeta(manpage_t *m) 652 { 653 char headbuf[BUFSIZ]; 654 655 (void) snprintf(headbuf, sizeof (headbuf), ".TH %ws %ws \"%ws\"", 656 getwstring(m->title), getwstring(m->volnum), getwstring(m->date)); 657 658 trimln(headbuf); 659 if (tocrc) 660 doname(m->name); 661 if (!intro) 662 section(m->name, headbuf); 663 664 if (m->title) 665 delstring(&m->title); 666 if (m->volnum) 667 delstring(&m->volnum); 668 if (m->date) 669 delstring(&m->date); 670 671 return (1); 672 } 673 674 static int 675 appendname(manpage_t *m, char *term) 676 { 677 string_t *r = filestring(m, 0, term); 678 679 if (m->names) { 680 appendwstring(m->names, L", "); 681 appendwstring(m->names, getwstring(r)); 682 delstring(&r); 683 } else 684 m->names = r; 685 return (1); 686 } 687 688 /* 689 * <refdescriptor> REFDESCRIPTOR </refdescriptor> 690 */ 691 static int 692 refdescriptor(manpage_t *m) 693 { 694 return (appendname(m, "/refdescriptor")); 695 } 696 697 /* 698 * <refname> REFNAME </refname> 699 */ 700 static int 701 refname(manpage_t *m) 702 { 703 return (appendname(m, "/refname")); 704 } 705 706 /* 707 * <refpurpose> PURPOSE </refpurpose> 708 */ 709 static int 710 refpurpose(manpage_t *m) 711 { 712 if (m->purpose != NULL) 713 warning(m, "repeated refpurpose\n"); 714 m->purpose = filestring(m, 0, "/refpurpose"); 715 return (1); 716 } 717 718 /* 719 * .. </refnamediv> - this is our chance to bail out. 
720 */ 721 static int 722 terminate(manpage_t *m) 723 { 724 if (m->names) { 725 appendwstring(m->names, L" \\- "); 726 appendwstring(m->names, getwstring(m->purpose)); 727 if (intro) { 728 char *buf = getcstring(m->names); 729 split(buf, m->name); 730 free(buf); 731 } else 732 (void) printf("%ws", getwstring(m->names)); 733 } 734 735 if (m->names) 736 delstring(&m->names); 737 if (m->purpose) 738 delstring(&m->purpose); 739 740 (void) printf("\n"); 741 return (0); 742 } 743 744 745 /* 746 * Basic control structure of the SGML "parser". 747 * It's very simplistic - when named tags are encountered in the 748 * input stream, control is transferred to the corresponding routine. 749 * No checking is done for correct pairing of tags. A few other hacks 750 * are sneaked into the lexical routines above. 751 * Output is generated after seeing the /refmeta and /refnamediv 752 * closing tags. 753 */ 754 static const struct { 755 char *name; 756 int (*action)(manpage_t *); 757 } acts[] = { 758 { "refentrytitle", refentrytitle }, 759 { "manvolnum", manvolnum }, 760 { "refmiscinfo class=\"date\"", refmiscinfo_date }, 761 { "/refmeta", print_refmeta }, 762 { "refdescriptor", refdescriptor }, 763 { "refname", refname }, 764 { "refpurpose", refpurpose }, 765 { "/refnamediv", terminate }, 766 { 0 } 767 }; 768 769 static void 770 sgmlpage(char *name) 771 { 772 int rc = 1, a; 773 char tokbuf[BUFSIZ]; 774 manpage_t manpage, *m = &manpage; 775 776 (void) memset(m, 0, sizeof (*m)); 777 m->name = name; 778 779 do { 780 if (!find_token(tokbuf, sizeof (tokbuf))) 781 break; 782 for (a = 0; acts[a].name; a++) { 783 if (strcasecmp(acts[a].name, tokbuf) != 0) 784 continue; 785 rc = acts[a].action(m); 786 break; 787 } 788 } while (rc); 789 }