1 /*
   2  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
   3  * Use is subject to license terms.
   4  */
   5 
   6 /*      Copyright 1984, 1986, 1987, 1988, 1989 AT&T */
   7 /*      All Rights Reserved   */
   8 
   9 /*
  10  * Copyright 1980 Regents of the University of California.
  11  * All rights reserved.  The Berkeley software License Agreement
  12  * specifies the terms and conditions for redistribution.
  13  */
  14 
  15 #pragma ident   "%Z%%M% %I%     %E% SMI"
  16 
  17 /*
  18  * Get name sections from manual pages.
  19  *      -t      for building toc
  20  *      -i      for building intro entries
  21  *      other   apropos database
  22  */
  23 
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <stdarg.h>
  27 #include <string.h>
  28 #include <unistd.h>
  29 #include <limits.h>
  30 #include <locale.h>
  31 #include <wchar.h>
  32 #include <errno.h>
  33 #include <sys/param.h>
  34 
#define PLEN    3       /* prefix length "man" */

/* Current working directory, filled in by getcwd() in main() */
static char path[MAXPATHLEN+1];
/* Non-zero when -t was given; each page is then prefixed via doname() */
static int tocrc;
/* Non-zero when -i was given; NAME text is then passed through split() */
static int intro;
/* argv[0], used as the prefix of diagnostics */
static char *progname;

/* Forward declarations of file-local helpers defined further down */
static void trimln(char *);
static void roff_trim(char *cp);
static void doname(char *);
static void section(char *, char *);
static void split(char *, char *);
static void dorefname(char *);
static void troffpage(char *);
static void sgmlpage(char *);
  50 
  51 /*
  52  * Test to see if this is an SGML manpage or a regular manpage
  53  * Unless the first line begins with <!DOCTYPE, we assume it isn't.
  54  */
  55 static int
  56 issgml(FILE *fp)
  57 {
  58         static const char magic[] = "<!DOCTYPE";
  59         char buf[sizeof (magic)];
  60         size_t n = sizeof (magic) - 1;
  61 
  62         if (read(fileno(fp), buf, n) != n ||
  63             lseek(fileno(fp), 0, SEEK_SET) != 0)
  64                 return (0);
  65         return (strncmp(magic, buf, n) == 0);
  66 }
  67 
  68 int
  69 main(int argc, char *argv[])
  70 {
  71         int c;
  72 
  73         (void) setlocale(LC_ALL, "");
  74 
  75         progname = argv[0];
  76 
  77         while ((c = getopt(argc, argv, "it")) != EOF)
  78                 switch (c) {
  79                 case 't':
  80                         tocrc++;
  81                         break;
  82                 case 'i':
  83                         intro++;
  84                         break;
  85                 case '?':
  86                 default:
  87                         (void) fprintf(stderr,
  88                             "usage: %s [-i][-t] files..\n", progname);
  89                         exit(1);
  90                 }
  91 
  92         if (getcwd(path, sizeof (path)) == NULL) {
  93                 (void) fprintf(stderr, "%s: getcwd: %s\n", progname, path);
  94                 exit(1);
  95         }
  96 
  97         for (; optind < argc; optind++) {
  98                 char *name = argv[optind];
  99 
 100                 if (freopen(name, "r", stdin) == 0) {
 101                         (void) fprintf(stderr,
 102                             "%s: %s: %s\n", progname, name, strerror(errno));
 103                         continue;
 104                 }
 105 
 106                 /*
 107                  * Most of the info we care about is in the first kbyte
 108                  */
 109                 (void) setvbuf(stdin, NULL, _IOFBF, 1024);
 110 
 111                 if (issgml(stdin))
 112                         sgmlpage(name);
 113                 else
 114                         troffpage(name);
 115         }
 116 
 117         return (0);
 118 }
 119 
 120 /*
 121  * Parse a troff-format manpage
 122  */
 123 static void
 124 troffpage(char *name)
 125 {
 126         char headbuf[BUFSIZ];
 127         char linbuf[BUFSIZ];
 128         char *strptr;
 129         int i = 0;
 130 
 131         for (;;) {
 132                 if (fgets(headbuf, sizeof (headbuf), stdin) == NULL)
 133                         return;
 134                 if (headbuf[0] != '.')
 135                         continue;
 136                 if (headbuf[1] == 'T' && headbuf[2] == 'H')
 137                         break;
 138                 if (headbuf[1] == 't' && headbuf[2] == 'h')
 139                         break;
 140         }
 141         for (;;) {
 142                 if (fgets(linbuf, sizeof (linbuf), stdin) == NULL)
 143                         return;
 144                 if (linbuf[0] != '.')
 145                         continue;
 146                 if (linbuf[1] == 'S' && linbuf[2] == 'H')
 147                         break;
 148                 if (linbuf[1] == 's' && linbuf[2] == 'h')
 149                         break;
 150         }
 151         trimln(headbuf);
 152         if (tocrc)
 153                 doname(name);
 154         if (!intro)
 155                 section(name, headbuf);
 156         for (;;) {
 157                 if (fgets(linbuf, sizeof (linbuf), stdin) == NULL)
 158                         break;
 159                 if (linbuf[0] == '.') {
 160                         if (linbuf[1] == 'S' && linbuf[2] == 'H')
 161                                 break;
 162                         if (linbuf[1] == 's' && linbuf[2] == 'h')
 163                                 break;
 164                         if (linbuf[1] == '\\' && linbuf[2] == '"')
 165                                 continue;
 166                 }
 167                 trimln(linbuf);
 168                 roff_trim(linbuf);
 169                 if (intro) {
 170                         split(linbuf, name);
 171                         continue;
 172                 }
 173                 if (i != 0)
 174                         (void) printf(" ");
 175                 i++;
 176                 (void) printf("%s", linbuf);
 177         }
 178         (void) printf("\n");
 179 }
 180 
 181 
 182 /*
 183  * Substitute section defined in page with new section spec
 184  * of the form xx/yy where xx is the section suffix of the
 185  * directory and yy is the filename extension (unless xx
 186  * and yy are equal, in which case xx is the section).
 187  * Pages should be placed in their proper directory with the
 188  * proper name to simplify things.
 189  *
 190  * For example take the following names:
 191  *    man1/ar.1v        (1/1V)
 192  *    man1/find.1       (1)
 193  *    man1/loco         (1/)
 194  *
 195  */
 196 static void
 197 section(char *name, char *buf)
 198 {
 199         char scratch[MAXPATHLEN+1];
 200         char *p = buf;
 201         char *dir, *fname;
 202         char *dp, *np;
 203         int i;
 204         int plen = PLEN;
 205 
 206         /*
 207          * split dirname and filename
 208          */
 209         (void) strcpy(scratch, name);
 210         if ((fname = strrchr(scratch, '/')) == NULL) {
 211                 fname = name;
 212                 dir = path;
 213         } else {
 214                 dir = scratch;
 215                 *fname = 0;
 216                 fname++;
 217         }
 218         dp = strrchr(dir, '/');
 219 
 220         if (*(dp+1) == 's')
 221                 plen = PLEN + 1;
 222 
 223         if (dp != NULL) {
 224                 dp = dp+plen+1;
 225         } else {
 226                 dp = dir+plen;
 227         }
 228         np = strrchr(fname, '.');
 229         if (np != NULL) {
 230                 ++np;
 231         } else {
 232                 np = "";
 233         }
 234         for (i = 0; i < 2; i++) {
 235                 while (*p && *p != ' ' && *p != '\t')
 236                         p++;
 237                 if (!*p)
 238                         break;
 239                 while (*p && (*p == ' ' || *p == '\t'))
 240                         p++;
 241                 if (!*p)
 242                         break;
 243         }
 244         *p++ = 0;
 245         (void) printf("%s", buf);
 246         if (strcmp(np, dp) == 0)
 247                 (void) printf("%s", dp);
 248         else
 249                 (void) printf("%s/%s", dp, np);
 250         while (*p && *p != ' ' && *p != '\t')
 251                 p++;
 252         (void) printf("%s\t", p);
 253 }
 254 
 255 static void
 256 trimln(char *cp)
 257 {
 258         while (*cp)
 259                 cp++;
 260         if (*--cp == '\n')
 261                 *cp = 0;
 262 }
 263 
 264 static void
 265 roff_trim(char *cp)
 266 {
 267         if (*cp == '.') {
 268                 while ((*cp != ' ') && (*cp != '\0')) {
 269                         strcpy(cp, cp+1);
 270                 }
 271                 strcpy(cp, cp+1);
 272         }
 273         while (*cp) {
 274                 if (strncmp(cp, "\\f", 2) == 0) {
 275                         if ((*(cp+2) >= 48) && (*(cp+2) <= 57)) {
 276                                 strcpy(cp, cp+3);
 277                         }
 278                         if (*(cp+2) == '(') {
 279                                 strcpy(cp, cp+5);
 280                         }
 281                 }
 282                 cp++;
 283         }
 284 }
 285 
 286 static void
 287 doname(char *name)
 288 {
 289         char *dp = name, *ep;
 290 
 291 again:
 292         while (*dp && *dp != '.')
 293                 (void) putchar(*dp++);
 294         if (*dp)
 295                 for (ep = dp+1; *ep; ep++)
 296                         if (*ep == '.') {
 297                                 (void) putchar(*dp++);
 298                                 goto again;
 299                         }
 300         (void) putchar('(');
 301         if (*dp)
 302                 dp++;
 303         while (*dp)
 304                 (void) putchar(*dp++);
 305         (void) putchar(')');
 306         (void) putchar(' ');
 307 }
 308 
 309 static void
 310 split(char *line, char *name)
 311 {
 312         char *cp, *dp;
 313         char *sp, *sep;
 314 
 315         cp = strchr(line, '-');
 316         if (cp == 0)
 317                 return;
 318         sp = cp + 1;
 319         for (--cp; *cp == ' ' || *cp == '\t' || *cp == '\\'; cp--)
 320                 ;
 321         *++cp = '\0';
 322         while (*sp && (*sp == ' ' || *sp == '\t'))
 323                 sp++;
 324         for (sep = "", dp = line; dp && *dp; dp = cp, sep = "\n") {
 325                 cp = strchr(dp, ',');
 326                 if (cp) {
 327                         char *tp;
 328 
 329                         for (tp = cp - 1; *tp == ' ' || *tp == '\t'; tp--)
 330                                 ;
 331                         *++tp = '\0';
 332                         for (++cp; *cp == ' ' || *cp == '\t'; cp++)
 333                                 ;
 334                 }
 335                 (void) printf("%s%s\t", sep, dp);
 336                 dorefname(name);
 337                 (void) printf("\t%s", sp);
 338         }
 339 }
 340 
 341 static void
 342 dorefname(char *name)
 343 {
 344         char *dp = name, *ep;
 345 
 346 again:
 347         while (*dp && *dp != '.')
 348                 (void) putchar(*dp++);
 349         if (*dp)
 350                 for (ep = dp+1; *ep; ep++)
 351                         if (*ep == '.') {
 352                                 (void) putchar(*dp++);
 353                                 goto again;
 354                         }
 355         (void) putchar('.');
 356         if (*dp)
 357                 dp++;
 358         while (*dp)
 359                 (void) putchar(*dp++);
 360 }
 361 
 362 /*
 363  * The rest of the routines in the file form a simplistic parser
 364  * for SGML manpages.  We assume the input is syntactically correct
 365  * SGML, and that the fields occur in the input file in order.
 366  */
 367 
 368 /*
 369  * Some utilities for constructing arbitrary length wide character strings
 370  */
 371 
 372 typedef struct {
 373         wchar_t *str;
 374         size_t size;
 375         long index;
 376 } string_t;
 377 
 378 #define DEF_STR_SIZE    16
 379 #define DEF_STR_GROWTH  16
 380 
 381 static void
 382 outofspace(char *where)
 383 {
 384         (void) fprintf(stderr, "%s: '%s' - out of memory\n", progname, where);
 385         exit(1);
 386 }
 387 
 388 static string_t *
 389 newstring(size_t initial)
 390 {
 391         string_t *s = malloc(sizeof (*s));
 392 
 393         if (s == NULL)
 394                 outofspace("new s");
 395 
 396         initial *= sizeof (wchar_t);
 397         if (initial < DEF_STR_SIZE)
 398                 initial = DEF_STR_SIZE;
 399 
 400         s->str = malloc(initial);
 401         if (s->str == NULL)
 402                 outofspace("new str");
 403 
 404         s->size = initial;
 405         s->index = 0;
 406         *s->str = L'\0';
 407         return (s);
 408 }
 409 
 410 static void
 411 delstring(string_t **s)
 412 {
 413         free((*s)->str);
 414         (*s)->str = NULL;
 415         free(*s);
 416         *s = NULL;
 417 }
 418 
 419 static wchar_t *
 420 getwstring(string_t *s)
 421 {
 422         static const wchar_t wnull = L'\0';
 423 
 424         if (s)
 425                 return (s->str);
 426         return ((wchar_t *)&wnull);
 427 }
 428 
 429 static char *
 430 getcstring(string_t *s)
 431 {
 432         size_t len = (wcslen(s->str) + 1) * MB_CUR_MAX;
 433         char *cstr = malloc(len);
 434         char *p = cstr;
 435         wchar_t *wp = s->str;
 436 
 437         if (p == NULL)
 438                 outofspace("getc");
 439         while (*wp)
 440                 p += wctomb(p, *wp++);
 441         *p = '\0';
 442         return (cstr);
 443 }
 444 
 445 static void
 446 appendwstring(string_t *s, const wchar_t *str)
 447 {
 448         size_t len = wcslen(str) + 1;
 449 
 450         s->size += sizeof (wchar_t) * len;
 451         s->str = realloc(s->str, s->size);
 452         if (s->str == NULL)
 453                 outofspace("appendw");
 454         (void) wcscat(s->str, str);
 455         s->index = wcslen(s->str) + 1;
 456 }
 457 
 458 static void
 459 putwstring(string_t *s, wchar_t wc)
 460 {
 461         if ((s->index + 1) * sizeof (wchar_t) >= s->size) {
 462                 s->size += DEF_STR_GROWTH;
 463                 s->str = realloc(s->str, s->size);
 464                 if (s->str == NULL)
 465                         outofspace("put");
 466         }
 467         s->str[s->index++] = wc;
 468 }
 469 
 470 /*
 471  * Find the closing > of an SGML comment block
 472  * (allowing for multibyte, embedded, comments)
 473  */
 474 static void
 475 eatcomments(void)
 476 {
 477         int pending = 1;
 478 
 479         while (pending)
 480                 switch (getwchar()) {
 481                 default:
 482                         break;
 483                 case L'<':
 484                         pending++;
 485                         break;
 486                 case L'>':
 487                         pending--;
 488                         break;
 489                 case WEOF:
 490                         return;
 491                 }
 492 }
 493 
 494 /*
 495  * Find the next token on stdin.
 496  * Handles nested comment strings, and removes any trailing newlines
 497  * from the stream after the closing '>'.
 498  */
 499 static int
 500 find_token(char *tokbuf, size_t tokbuflen)
 501 {
 502         int c;
 503         wint_t wc;
 504         char *tokp;
 505 
 506 top:
 507         while ((wc = getwchar()) != WEOF)
 508                 if (wc == L'<')
 509                         break;
 510 
 511         if (wc == WEOF && errno == EILSEQ)
 512                 return (0);
 513 
 514         switch (c = getchar()) {
 515         case EOF:
 516                 return (0);
 517         default:
 518                 (void) ungetc(c, stdin);
 519                 break;
 520         case '!':
 521                 eatcomments();
 522                 goto top;
 523         }
 524 
 525         tokp = tokbuf;
 526 
 527         while ((c = getchar()) != EOF) {
 528                 if (c == '>') {
 529                         while ((c = getchar()) != EOF)
 530                                 if (c != '\n') {
 531                                         (void) ungetc(c, stdin);
 532                                         break;
 533                                 }
 534                         *tokp = '\0';
 535                         return (1);
 536                 }
 537                 if (tokp - tokbuf < tokbuflen)
 538                         *tokp++ = (char)c;
 539         }
 540 
 541         return (0);
 542 }
 543 
 544 /*
 545  * This structure is filled out during the parsing of each page we encounter
 546  */
 547 typedef struct {
 548         char *name;
 549         string_t *title;
 550         string_t *volnum;
 551         string_t *date;
 552         string_t *names;
 553         string_t *purpose;
 554 } manpage_t;
 555 
 556 static void
 557 warning(manpage_t *m, const char *fmt, ...)
 558 {
 559         va_list ap;
 560         va_start(ap, fmt);
 561         (void) fprintf(stderr, "%s: %s - ", progname, m->name);
 562         (void) vfprintf(stderr, fmt, ap);
 563         va_end(ap);
 564 }
 565 
 566 /*
 567  * Fetch a string from stdin, terminated by the endtoken.
 568  * These strings may be localized, so do this with wide characters.
 569  * Hack: skip over (completely ignore) all other tokens
 570  * Hack: map all &blort; constructs to spaces.
 571  */
 572 static string_t *
 573 filestring(manpage_t *m, size_t initial, char *endtoken)
 574 {
 575         char tokbuf[BUFSIZ * MB_LEN_MAX];
 576         string_t *s = newstring(initial);
 577         wint_t wc;
 578 
 579         while ((wc = getwchar()) != WEOF)
 580                 switch (wc) {
 581                 case L'\n':
 582                         if ((wc = getwchar()) != WEOF)
 583                                 (void) ungetwc(wc, stdin);
 584                         if (wc != L'<')
 585                                 putwstring(s, L' ');
 586                         break;
 587                 case L'<':
 588                         (void) ungetwc(wc, stdin);
 589                         if (!find_token(tokbuf, sizeof (tokbuf)) ||
 590                             strcasecmp(endtoken, tokbuf) == 0)
 591                                 goto done;
 592                         break;
 593                 case L'&':
 594                         while ((wc = getwchar()) != WEOF)
 595                                 if (wc == L';')
 596                                         break;
 597                         wc = L' ';
 598                         /* FALLTHROUGH */
 599                 default:
 600                         putwstring(s, wc);
 601                         break;
 602                 }
 603 
 604         if (errno == EILSEQ)
 605                 warning(m, "%s while parsing %s\n", strerror(errno), endtoken);
 606 done:
 607         putwstring(s, L'\0');
 608         return (s);
 609 }
 610 
 611 /*
 612  * <refentrytitle> TITLE </refentrytitle>
 613  */
 614 static int
 615 refentrytitle(manpage_t *m)
 616 {
 617         if (m->title != NULL)
 618                 warning(m, "repeated refentrytitle\n");
 619         m->title = filestring(m, 8, "/refentrytitle");
 620         return (1);
 621 }
 622 
 623 /*
 624  * <manvolnum> MANVOLNUM </manvolnum>
 625  */
 626 static int
 627 manvolnum(manpage_t *m)
 628 {
 629         if (m->volnum != NULL)
 630                 warning(m, "repeated manvolnum\n");
 631         m->volnum = filestring(m, 3, "/manvolnum");
 632         return (1);
 633 }
 634 
 635 /*
 636  * <refmiscinfo class="date"> DATE </refmiscinfo>
 637  */
 638 static int
 639 refmiscinfo_date(manpage_t *m)
 640 {
 641         if (m->date != NULL)
 642                 warning(m, "repeated date\n");
 643         m->date = filestring(m, 11, "/refmiscinfo");
 644         return (1);
 645 }
 646 
 647 /*
 648  * .. </refmeta>
 649  */
 650 static int
 651 print_refmeta(manpage_t *m)
 652 {
 653         char headbuf[BUFSIZ];
 654 
 655         (void) snprintf(headbuf, sizeof (headbuf), ".TH %ws %ws \"%ws\"",
 656             getwstring(m->title), getwstring(m->volnum), getwstring(m->date));
 657 
 658         trimln(headbuf);
 659         if (tocrc)
 660                 doname(m->name);
 661         if (!intro)
 662                 section(m->name, headbuf);
 663 
 664         if (m->title)
 665                 delstring(&m->title);
 666         if (m->volnum)
 667                 delstring(&m->volnum);
 668         if (m->date)
 669                 delstring(&m->date);
 670 
 671         return (1);
 672 }
 673 
 674 static int
 675 appendname(manpage_t *m, char *term)
 676 {
 677         string_t *r = filestring(m, 0, term);
 678 
 679         if (m->names) {
 680                 appendwstring(m->names, L", ");
 681                 appendwstring(m->names, getwstring(r));
 682                 delstring(&r);
 683         } else
 684                 m->names = r;
 685         return (1);
 686 }
 687 
 688 /*
 689  * <refdescriptor> REFDESCRIPTOR </refdescriptor>
 690  */
 691 static int
 692 refdescriptor(manpage_t *m)
 693 {
 694         return (appendname(m, "/refdescriptor"));
 695 }
 696 
 697 /*
 698  * <refname> REFNAME </refname>
 699  */
 700 static int
 701 refname(manpage_t *m)
 702 {
 703         return (appendname(m, "/refname"));
 704 }
 705 
 706 /*
 707  * <refpurpose> PURPOSE </refpurpose>
 708  */
 709 static int
 710 refpurpose(manpage_t *m)
 711 {
 712         if (m->purpose != NULL)
 713                 warning(m, "repeated refpurpose\n");
 714         m->purpose = filestring(m, 0, "/refpurpose");
 715         return (1);
 716 }
 717 
 718 /*
 719  * .. </refnamediv> - this is our chance to bail out.
 720  */
 721 static int
 722 terminate(manpage_t *m)
 723 {
 724         if (m->names) {
 725                 appendwstring(m->names, L" \\- ");
 726                 appendwstring(m->names, getwstring(m->purpose));
 727                 if (intro) {
 728                         char *buf = getcstring(m->names);
 729                         split(buf, m->name);
 730                         free(buf);
 731                 } else
 732                         (void) printf("%ws", getwstring(m->names));
 733         }
 734 
 735         if (m->names)
 736                 delstring(&m->names);
 737         if (m->purpose)
 738                 delstring(&m->purpose);
 739 
 740         (void) printf("\n");
 741         return (0);
 742 }
 743 
 744 
 745 /*
 746  * Basic control structure of the SGML "parser".
 747  * It's very simplistic - when named tags are encountered in the
 748  * input stream, control is transferred to the corresponding routine.
 749  * No checking is done for correct pairing of tags.  A few other hacks
 750  * are sneaked into the lexical routines above.
 751  * Output is generated after seeing the /refmeta and /refnamediv
 752  * closing tags.
 753  */
 754 static const struct {
 755         char *name;
 756         int (*action)(manpage_t *);
 757 } acts[] = {
 758         { "refentrytitle",              refentrytitle },
 759         { "manvolnum",                  manvolnum },
 760         { "refmiscinfo class=\"date\"", refmiscinfo_date },
 761         { "/refmeta",                   print_refmeta },
 762         { "refdescriptor",              refdescriptor },
 763         { "refname",                    refname },
 764         { "refpurpose",                 refpurpose },
 765         { "/refnamediv",                terminate },
 766         { 0 }
 767 };
 768 
 769 static void
 770 sgmlpage(char *name)
 771 {
 772         int rc = 1, a;
 773         char tokbuf[BUFSIZ];
 774         manpage_t manpage, *m = &manpage;
 775 
 776         (void) memset(m, 0, sizeof (*m));
 777         m->name = name;
 778 
 779         do {
 780                 if (!find_token(tokbuf, sizeof (tokbuf)))
 781                         break;
 782                 for (a = 0; acts[a].name; a++) {
 783                         if (strcasecmp(acts[a].name, tokbuf) != 0)
 784                                 continue;
 785                         rc = acts[a].action(m);
 786                         break;
 787                 }
 788         } while (rc);
 789 }