1 /*      $Id: html.c,v 1.150 2011/10/05 21:35:17 kristaps Exp $ */
   2 /*
   3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   4  * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #ifdef HAVE_CONFIG_H
  19 #include "config.h"
  20 #endif
  21 
  22 #include <sys/types.h>
  23 
  24 #include <assert.h>
  25 #include <ctype.h>
  26 #include <stdarg.h>
  27 #include <stdio.h>
  28 #include <stdint.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31 #include <unistd.h>
  32 
  33 #include "mandoc.h"
  34 #include "libmandoc.h"
  35 #include "out.h"
  36 #include "html.h"
  37 #include "main.h"
  38 
  39 struct  htmldata {
  40         const char       *name;
  41         int               flags;
  42 #define HTML_CLRLINE     (1 << 0)
  43 #define HTML_NOSTACK     (1 << 1)
  44 #define HTML_AUTOCLOSE   (1 << 2) /* Tag has auto-closure. */
  45 };
  46 
/*
 * Static properties of every HTML element this back-end can emit.
 * The table is indexed with an enum htmltag value (see print_otag()
 * and print_ctag()), so the row order has to follow that enum; the
 * trailing comment on each row names the enumerator the row is meant
 * for.  The enum itself is declared outside this file.
 */
static  const struct htmldata htmltags[TAG_MAX] = {
        {"html",        HTML_CLRLINE}, /* TAG_HTML */
        {"head",        HTML_CLRLINE}, /* TAG_HEAD */
        {"body",        HTML_CLRLINE}, /* TAG_BODY */
        {"meta",        HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
        {"title",       HTML_CLRLINE}, /* TAG_TITLE */
        {"div",         HTML_CLRLINE}, /* TAG_DIV */
        {"h1",          0}, /* TAG_H1 */
        {"h2",          0}, /* TAG_H2 */
        {"span",        0}, /* TAG_SPAN */
        {"link",        HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
        {"br",          HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
        {"a",           0}, /* TAG_A */
        {"table",       HTML_CLRLINE}, /* TAG_TABLE */
        {"tbody",       HTML_CLRLINE}, /* TAG_TBODY */
        {"col",         HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
        {"tr",          HTML_CLRLINE}, /* TAG_TR */
        {"td",          HTML_CLRLINE}, /* TAG_TD */
        {"li",          HTML_CLRLINE}, /* TAG_LI */
        {"ul",          HTML_CLRLINE}, /* TAG_UL */
        {"ol",          HTML_CLRLINE}, /* TAG_OL */
        {"dl",          HTML_CLRLINE}, /* TAG_DL */
        {"dt",          HTML_CLRLINE}, /* TAG_DT */
        {"dd",          HTML_CLRLINE}, /* TAG_DD */
        {"blockquote",  HTML_CLRLINE}, /* TAG_BLOCKQUOTE */
        {"p",           HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */
        {"pre",         HTML_CLRLINE }, /* TAG_PRE */
        {"b",           0 }, /* TAG_B */
        {"i",           0 }, /* TAG_I */
        {"code",        0 }, /* TAG_CODE */
        {"small",       0 }, /* TAG_SMALL */
};
  79 
/*
 * Attribute names as they are written to the output.  print_otag()
 * indexes this table with the key member of each struct htmlpair, so
 * the order has to follow the ATTR_* enumeration named in the
 * trailing comments (declared outside this file).
 */
static  const char      *const htmlattrs[ATTR_MAX] = {
        "http-equiv", /* ATTR_HTTPEQUIV */
        "content", /* ATTR_CONTENT */
        "name", /* ATTR_NAME */
        "rel", /* ATTR_REL */
        "href", /* ATTR_HREF */
        "type", /* ATTR_TYPE */
        "media", /* ATTR_MEDIA */
        "class", /* ATTR_CLASS */
        "style", /* ATTR_STYLE */
        "width", /* ATTR_WIDTH */
        "id", /* ATTR_ID */
        "summary", /* ATTR_SUMMARY */
        "align", /* ATTR_ALIGN */
        "colspan", /* ATTR_COLSPAN */
};
  96 
/*
 * Unit suffix printed by bufcat_su() for each roff scaling unit; the
 * table is indexed with the unit member of a struct roffsu.  Several
 * units share a suffix ("em" or "ex").  Note that bufcat_su() divides
 * SCALE_MM values by 100 before printing them with the "em" suffix.
 */
static  const char      *const roffscales[SCALE_MAX] = {
        "cm", /* SCALE_CM */
        "in", /* SCALE_IN */
        "pc", /* SCALE_PC */
        "pt", /* SCALE_PT */
        "em", /* SCALE_EM */
        "em", /* SCALE_MM */
        "ex", /* SCALE_EN */
        "ex", /* SCALE_BU */
        "em", /* SCALE_VS */
        "ex", /* SCALE_FS */
};
 109 
/* Forward declarations of the helpers that are private to this file. */
static  void     bufncat(struct html *, const char *, size_t);
static  void     print_ctag(struct html *, enum htmltag);
static  int      print_encode(struct html *, const char *, int);
static  void     print_metaf(struct html *, enum mandoc_esc);
static  void     print_attr(struct html *, const char *, const char *);
static  void     *ml_alloc(char *, enum htmltype);
 116 
 117 static void *
 118 ml_alloc(char *outopts, enum htmltype type)
 119 {
 120         struct html     *h;
 121         const char      *toks[5];
 122         char            *v;
 123 
 124         toks[0] = "style";
 125         toks[1] = "man";
 126         toks[2] = "includes";
 127         toks[3] = "fragment";
 128         toks[4] = NULL;
 129 
 130         h = mandoc_calloc(1, sizeof(struct html));
 131 
 132         h->type = type;
 133         h->tags.head = NULL;
 134         h->symtab = mchars_alloc();
 135 
 136         while (outopts && *outopts)
 137                 switch (getsubopt(&outopts, UNCONST(toks), &v)) {
 138                 case (0):
 139                         h->style = v;
 140                         break;
 141                 case (1):
 142                         h->base_man = v;
 143                         break;
 144                 case (2):
 145                         h->base_includes = v;
 146                         break;
 147                 case (3):
 148                         h->oflags |= HTML_FRAGMENT;
 149                         break;
 150                 default:
 151                         break;
 152                 }
 153 
 154         return(h);
 155 }
 156 
 157 void *
 158 html_alloc(char *outopts)
 159 {
 160 
 161         return(ml_alloc(outopts, HTML_HTML_4_01_STRICT));
 162 }
 163 
 164 
 165 void *
 166 xhtml_alloc(char *outopts)
 167 {
 168 
 169         return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT));
 170 }
 171 
 172 
 173 void
 174 html_free(void *p)
 175 {
 176         struct tag      *tag;
 177         struct html     *h;
 178 
 179         h = (struct html *)p;
 180 
 181         while ((tag = h->tags.head) != NULL) {
 182                 h->tags.head = tag->next; 
 183                 free(tag);
 184         }
 185         
 186         if (h->symtab)
 187                 mchars_free(h->symtab);
 188 
 189         free(h);
 190 }
 191 
 192 
 193 void
 194 print_gen_head(struct html *h)
 195 {
 196         struct htmlpair  tag[4];
 197 
 198         tag[0].key = ATTR_HTTPEQUIV;
 199         tag[0].val = "Content-Type";
 200         tag[1].key = ATTR_CONTENT;
 201         tag[1].val = "text/html; charset=utf-8";
 202         print_otag(h, TAG_META, 2, tag);
 203 
 204         tag[0].key = ATTR_NAME;
 205         tag[0].val = "resource-type";
 206         tag[1].key = ATTR_CONTENT;
 207         tag[1].val = "document";
 208         print_otag(h, TAG_META, 2, tag);
 209 
 210         if (h->style) {
 211                 tag[0].key = ATTR_REL;
 212                 tag[0].val = "stylesheet";
 213                 tag[1].key = ATTR_HREF;
 214                 tag[1].val = h->style;
 215                 tag[2].key = ATTR_TYPE;
 216                 tag[2].val = "text/css";
 217                 tag[3].key = ATTR_MEDIA;
 218                 tag[3].val = "all";
 219                 print_otag(h, TAG_LINK, 4, tag);
 220         }
 221 }
 222 
 223 static void
 224 print_metaf(struct html *h, enum mandoc_esc deco)
 225 {
 226         enum htmlfont    font;
 227 
 228         switch (deco) {
 229         case (ESCAPE_FONTPREV):
 230                 font = h->metal;
 231                 break;
 232         case (ESCAPE_FONTITALIC):
 233                 font = HTMLFONT_ITALIC;
 234                 break;
 235         case (ESCAPE_FONTBOLD):
 236                 font = HTMLFONT_BOLD;
 237                 break;
 238         case (ESCAPE_FONT):
 239                 /* FALLTHROUGH */
 240         case (ESCAPE_FONTROMAN):
 241                 font = HTMLFONT_NONE;
 242                 break;
 243         default:
 244                 abort();
 245                 /* NOTREACHED */
 246         }
 247 
 248         if (h->metaf) {
 249                 print_tagq(h, h->metaf);
 250                 h->metaf = NULL;
 251         }
 252 
 253         h->metal = h->metac;
 254         h->metac = font;
 255 
 256         if (HTMLFONT_NONE != font)
 257                 h->metaf = HTMLFONT_BOLD == font ?
 258                         print_otag(h, TAG_B, 0, NULL) :
 259                         print_otag(h, TAG_I, 0, NULL);
 260 }
 261 
 262 int
 263 html_strlen(const char *cp)
 264 {
 265         int              ssz, sz;
 266         const char      *seq, *p;
 267 
 268         /*
 269          * Account for escaped sequences within string length
 270          * calculations.  This follows the logic in term_strlen() as we
 271          * must calculate the width of produced strings.
 272          * Assume that characters are always width of "1".  This is
 273          * hacky, but it gets the job done for approximation of widths.
 274          */
 275 
 276         sz = 0;
 277         while (NULL != (p = strchr(cp, '\\'))) {
 278                 sz += (int)(p - cp);
 279                 ++cp;
 280                 switch (mandoc_escape(&cp, &seq, &ssz)) {
 281                 case (ESCAPE_ERROR):
 282                         return(sz);
 283                 case (ESCAPE_UNICODE):
 284                         /* FALLTHROUGH */
 285                 case (ESCAPE_NUMBERED):
 286                         /* FALLTHROUGH */
 287                 case (ESCAPE_SPECIAL):
 288                         sz++;
 289                         break;
 290                 default:
 291                         break;
 292                 }
 293         }
 294 
 295         assert(sz >= 0);
 296         return(sz + strlen(cp));
 297 }
 298 
 299 static int
 300 print_encode(struct html *h, const char *p, int norecurse)
 301 {
 302         size_t           sz;
 303         int              c, len, nospace;
 304         const char      *seq;
 305         enum mandoc_esc  esc;
 306         static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
 307 
 308         nospace = 0;
 309 
 310         while ('\0' != *p) {
 311                 sz = strcspn(p, rejs);
 312 
 313                 fwrite(p, 1, sz, stdout);
 314                 p += (int)sz;
 315 
 316                 if ('\0' == *p)
 317                         break;
 318 
 319                 switch (*p++) {
 320                 case ('<'):
 321                         printf("&lt;");
 322                         continue;
 323                 case ('>'):
 324                         printf("&gt;");
 325                         continue;
 326                 case ('&'):
 327                         printf("&amp;");
 328                         continue;
 329                 case (ASCII_HYPH):
 330                         putchar('-');
 331                         continue;
 332                 default:
 333                         break;
 334                 }
 335 
 336                 esc = mandoc_escape(&p, &seq, &len);
 337                 if (ESCAPE_ERROR == esc)
 338                         break;
 339 
 340                 switch (esc) {
 341                 case (ESCAPE_UNICODE):
 342                         /* Skip passed "u" header. */
 343                         c = mchars_num2uc(seq + 1, len - 1);
 344                         if ('\0' != c)
 345                                 printf("&#x%x;", c);
 346                         break;
 347                 case (ESCAPE_NUMBERED):
 348                         c = mchars_num2char(seq, len);
 349                         if ('\0' != c)
 350                                 putchar(c);
 351                         break;
 352                 case (ESCAPE_SPECIAL):
 353                         c = mchars_spec2cp(h->symtab, seq, len);
 354                         if (c > 0)
 355                                 printf("&#%d;", c);
 356                         else if (-1 == c && 1 == len)
 357                                 putchar((int)*seq);
 358                         break;
 359                 case (ESCAPE_FONT):
 360                         /* FALLTHROUGH */
 361                 case (ESCAPE_FONTPREV):
 362                         /* FALLTHROUGH */
 363                 case (ESCAPE_FONTBOLD):
 364                         /* FALLTHROUGH */
 365                 case (ESCAPE_FONTITALIC):
 366                         /* FALLTHROUGH */
 367                 case (ESCAPE_FONTROMAN):
 368                         if (norecurse)
 369                                 break;
 370                         print_metaf(h, esc);
 371                         break;
 372                 case (ESCAPE_NOSPACE):
 373                         if ('\0' == *p)
 374                                 nospace = 1;
 375                         break;
 376                 default:
 377                         break;
 378                 }
 379         }
 380 
 381         return(nospace);
 382 }
 383 
 384 
 385 static void
 386 print_attr(struct html *h, const char *key, const char *val)
 387 {
 388         printf(" %s=\"", key);
 389         (void)print_encode(h, val, 1);
 390         putchar('\"');
 391 }
 392 
 393 
 394 struct tag *
 395 print_otag(struct html *h, enum htmltag tag, 
 396                 int sz, const struct htmlpair *p)
 397 {
 398         int              i;
 399         struct tag      *t;
 400 
 401         /* Push this tags onto the stack of open scopes. */
 402 
 403         if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
 404                 t = mandoc_malloc(sizeof(struct tag));
 405                 t->tag = tag;
 406                 t->next = h->tags.head;
 407                 h->tags.head = t;
 408         } else
 409                 t = NULL;
 410 
 411         if ( ! (HTML_NOSPACE & h->flags))
 412                 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
 413                         /* Manage keeps! */
 414                         if ( ! (HTML_KEEP & h->flags)) {
 415                                 if (HTML_PREKEEP & h->flags)
 416                                         h->flags |= HTML_KEEP;
 417                                 putchar(' ');
 418                         } else
 419                                 printf("&#160;");
 420                 }
 421 
 422         if ( ! (h->flags & HTML_NONOSPACE))
 423                 h->flags &= ~HTML_NOSPACE;
 424         else
 425                 h->flags |= HTML_NOSPACE;
 426 
 427         /* Print out the tag name and attributes. */
 428 
 429         printf("<%s", htmltags[tag].name);
 430         for (i = 0; i < sz; i++)
 431                 print_attr(h, htmlattrs[p[i].key], p[i].val);
 432 
 433         /* Add non-overridable attributes. */
 434 
 435         if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) {
 436                 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml");
 437                 print_attr(h, "xml:lang", "en");
 438                 print_attr(h, "lang", "en");
 439         }
 440 
 441         /* Accommodate for XML "well-formed" singleton escaping. */
 442 
 443         if (HTML_AUTOCLOSE & htmltags[tag].flags)
 444                 switch (h->type) {
 445                 case (HTML_XHTML_1_0_STRICT):
 446                         putchar('/');
 447                         break;
 448                 default:
 449                         break;
 450                 }
 451 
 452         putchar('>');
 453 
 454         h->flags |= HTML_NOSPACE;
 455 
 456         if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags)
 457                 putchar('\n');
 458 
 459         return(t);
 460 }
 461 
 462 
 463 static void
 464 print_ctag(struct html *h, enum htmltag tag)
 465 {
 466         
 467         printf("</%s>", htmltags[tag].name);
 468         if (HTML_CLRLINE & htmltags[tag].flags) {
 469                 h->flags |= HTML_NOSPACE;
 470                 putchar('\n');
 471         } 
 472 }
 473 
 474 void
 475 print_gen_decls(struct html *h)
 476 {
 477         const char      *doctype;
 478         const char      *dtd;
 479         const char      *name;
 480 
 481         switch (h->type) {
 482         case (HTML_HTML_4_01_STRICT):
 483                 name = "HTML";
 484                 doctype = "-//W3C//DTD HTML 4.01//EN";
 485                 dtd = "http://www.w3.org/TR/html4/strict.dtd";
 486                 break;
 487         default:
 488                 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
 489                 name = "html";
 490                 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN";
 491                 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
 492                 break;
 493         }
 494 
 495         printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 
 496                         name, doctype, dtd);
 497 }
 498 
 499 void
 500 print_text(struct html *h, const char *word)
 501 {
 502 
 503         if ( ! (HTML_NOSPACE & h->flags)) {
 504                 /* Manage keeps! */
 505                 if ( ! (HTML_KEEP & h->flags)) {
 506                         if (HTML_PREKEEP & h->flags)
 507                                 h->flags |= HTML_KEEP;
 508                         putchar(' ');
 509                 } else
 510                         printf("&#160;");
 511         }
 512 
 513         assert(NULL == h->metaf);
 514         if (HTMLFONT_NONE != h->metac)
 515                 h->metaf = HTMLFONT_BOLD == h->metac ?
 516                         print_otag(h, TAG_B, 0, NULL) :
 517                         print_otag(h, TAG_I, 0, NULL);
 518 
 519         assert(word);
 520         if ( ! print_encode(h, word, 0)) {
 521                 if ( ! (h->flags & HTML_NONOSPACE))
 522                         h->flags &= ~HTML_NOSPACE;
 523         } else
 524                 h->flags |= HTML_NOSPACE;
 525 
 526         if (h->metaf) {
 527                 print_tagq(h, h->metaf);
 528                 h->metaf = NULL;
 529         }
 530 
 531         h->flags &= ~HTML_IGNDELIM;
 532 }
 533 
 534 
 535 void
 536 print_tagq(struct html *h, const struct tag *until)
 537 {
 538         struct tag      *tag;
 539 
 540         while ((tag = h->tags.head) != NULL) {
 541                 /* 
 542                  * Remember to close out and nullify the current
 543                  * meta-font and table, if applicable.
 544                  */
 545                 if (tag == h->metaf)
 546                         h->metaf = NULL;
 547                 if (tag == h->tblt)
 548                         h->tblt = NULL;
 549                 print_ctag(h, tag->tag);
 550                 h->tags.head = tag->next;
 551                 free(tag);
 552                 if (until && tag == until)
 553                         return;
 554         }
 555 }
 556 
 557 
 558 void
 559 print_stagq(struct html *h, const struct tag *suntil)
 560 {
 561         struct tag      *tag;
 562 
 563         while ((tag = h->tags.head) != NULL) {
 564                 if (suntil && tag == suntil)
 565                         return;
 566                 /* 
 567                  * Remember to close out and nullify the current
 568                  * meta-font and table, if applicable.
 569                  */
 570                 if (tag == h->metaf)
 571                         h->metaf = NULL;
 572                 if (tag == h->tblt)
 573                         h->tblt = NULL;
 574                 print_ctag(h, tag->tag);
 575                 h->tags.head = tag->next;
 576                 free(tag);
 577         }
 578 }
 579 
 580 void
 581 bufinit(struct html *h)
 582 {
 583 
 584         h->buf[0] = '\0';
 585         h->buflen = 0;
 586 }
 587 
 588 void
 589 bufcat_style(struct html *h, const char *key, const char *val)
 590 {
 591 
 592         bufcat(h, key);
 593         bufcat(h, ":");
 594         bufcat(h, val);
 595         bufcat(h, ";");
 596 }
 597 
 598 void
 599 bufcat(struct html *h, const char *p)
 600 {
 601 
 602         h->buflen = strlcat(h->buf, p, BUFSIZ);
 603         assert(h->buflen < BUFSIZ);
 604 }
 605 
 606 void
 607 bufcat_fmt(struct html *h, const char *fmt, ...)
 608 {
 609         va_list          ap;
 610 
 611         va_start(ap, fmt);
 612         (void)vsnprintf(h->buf + (int)h->buflen, 
 613                         BUFSIZ - h->buflen - 1, fmt, ap);
 614         va_end(ap);
 615         h->buflen = strlen(h->buf);
 616 }
 617 
 618 static void
 619 bufncat(struct html *h, const char *p, size_t sz)
 620 {
 621 
 622         assert(h->buflen + sz + 1 < BUFSIZ);
 623         strncat(h->buf, p, sz);
 624         h->buflen += sz;
 625 }
 626 
 627 void
 628 buffmt_includes(struct html *h, const char *name)
 629 {
 630         const char      *p, *pp;
 631 
 632         pp = h->base_includes;
 633         
 634         bufinit(h);
 635         while (NULL != (p = strchr(pp, '%'))) {
 636                 bufncat(h, pp, (size_t)(p - pp));
 637                 switch (*(p + 1)) {
 638                 case('I'):
 639                         bufcat(h, name);
 640                         break;
 641                 default:
 642                         bufncat(h, p, 2);
 643                         break;
 644                 }
 645                 pp = p + 2;
 646         }
 647         if (pp)
 648                 bufcat(h, pp);
 649 }
 650 
 651 void
 652 buffmt_man(struct html *h, 
 653                 const char *name, const char *sec)
 654 {
 655         const char      *p, *pp;
 656 
 657         pp = h->base_man;
 658         
 659         bufinit(h);
 660         while (NULL != (p = strchr(pp, '%'))) {
 661                 bufncat(h, pp, (size_t)(p - pp));
 662                 switch (*(p + 1)) {
 663                 case('S'):
 664                         bufcat(h, sec ? sec : "1");
 665                         break;
 666                 case('N'):
 667                         bufcat_fmt(h, name);
 668                         break;
 669                 default:
 670                         bufncat(h, p, 2);
 671                         break;
 672                 }
 673                 pp = p + 2;
 674         }
 675         if (pp)
 676                 bufcat(h, pp);
 677 }
 678 
 679 void
 680 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
 681 {
 682         double           v;
 683 
 684         v = su->scale;
 685         if (SCALE_MM == su->unit && 0.0 == (v /= 100.0))
 686                 v = 1.0;
 687 
 688         bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]);
 689 }
 690 
 691 void
 692 bufcat_id(struct html *h, const char *src)
 693 {
 694 
 695         /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */
 696 
 697         while ('\0' != *src)
 698                 bufcat_fmt(h, "%.2x", *src++);
 699 }