1 /* $Id: html.c,v 1.150 2011/10/05 21:35:17 kristaps Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <stdarg.h> 27 #include <stdio.h> 28 #include <stdint.h> 29 #include <stdlib.h> 30 #include <string.h> 31 #include <unistd.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 #include "out.h" 36 #include "html.h" 37 #include "main.h" 38 39 struct htmldata { 40 const char *name; 41 int flags; 42 #define HTML_CLRLINE (1 << 0) 43 #define HTML_NOSTACK (1 << 1) 44 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 45 }; 46 47 static const struct htmldata htmltags[TAG_MAX] = { 48 {"html", HTML_CLRLINE}, /* TAG_HTML */ 49 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 50 {"body", HTML_CLRLINE}, /* TAG_BODY */ 51 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 52 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 53 {"div", HTML_CLRLINE}, /* TAG_DIV */ 54 {"h1", 0}, /* TAG_H1 */ 55 {"h2", 0}, /* TAG_H2 */ 56 {"span", 0}, /* TAG_SPAN */ 57 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 58 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 59 {"a", 0}, /* TAG_A */ 60 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 61 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 62 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 63 {"tr", HTML_CLRLINE}, /* TAG_TR */ 64 {"td", HTML_CLRLINE}, /* TAG_TD */ 65 {"li", HTML_CLRLINE}, /* TAG_LI */ 66 {"ul", HTML_CLRLINE}, /* TAG_UL */ 67 {"ol", HTML_CLRLINE}, /* TAG_OL */ 68 {"dl", HTML_CLRLINE}, /* TAG_DL */ 69 {"dt", HTML_CLRLINE}, /* TAG_DT */ 70 {"dd", HTML_CLRLINE}, /* TAG_DD */ 71 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 72 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 73 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 74 {"b", 0 }, /* TAG_B */ 75 {"i", 0 }, /* TAG_I */ 76 {"code", 0 }, /* TAG_CODE */ 77 {"small", 0 }, /* TAG_SMALL */ 78 }; 79 80 static const char *const htmlattrs[ATTR_MAX] = { 81 "http-equiv", /* ATTR_HTTPEQUIV */ 82 "content", /* ATTR_CONTENT */ 83 "name", /* ATTR_NAME */ 84 "rel", /* ATTR_REL */ 85 "href", /* ATTR_HREF */ 86 "type", /* ATTR_TYPE */ 87 "media", /* ATTR_MEDIA */ 88 "class", /* ATTR_CLASS */ 89 "style", /* ATTR_STYLE */ 90 "width", /* ATTR_WIDTH */ 91 "id", /* ATTR_ID */ 92 "summary", /* ATTR_SUMMARY */ 93 "align", /* ATTR_ALIGN */ 94 "colspan", /* ATTR_COLSPAN */ 95 }; 96 97 static const char *const roffscales[SCALE_MAX] = { 98 "cm", /* SCALE_CM */ 99 "in", /* SCALE_IN */ 100 "pc", /* SCALE_PC */ 101 "pt", /* SCALE_PT */ 102 "em", /* SCALE_EM */ 103 "em", /* SCALE_MM */ 104 "ex", /* SCALE_EN */ 105 "ex", /* SCALE_BU */ 106 "em", /* SCALE_VS */ 107 "ex", /* SCALE_FS */ 108 }; 109 110 static void bufncat(struct html *, const char *, size_t); 111 static void print_ctag(struct html *, enum htmltag); 112 static int print_encode(struct html *, const char *, int); 113 static void print_metaf(struct html *, enum mandoc_esc); 114 static void print_attr(struct html *, const char *, const char *); 115 static void *ml_alloc(char *, enum htmltype); 116 117 static void * 118 ml_alloc(char *outopts, enum htmltype type) 119 { 120 struct html *h; 121 const char *toks[5]; 122 char *v; 123 124 toks[0] = "style"; 125 toks[1] = "man"; 126 toks[2] = "includes"; 127 toks[3] = "fragment"; 128 toks[4] = NULL; 129 130 h = mandoc_calloc(1, sizeof(struct html)); 131 132 h->type = type; 133 h->tags.head = NULL; 134 h->symtab = mchars_alloc(); 135 136 while (outopts && *outopts) 137 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 138 case (0): 139 h->style = v; 140 break; 141 case (1): 142 h->base_man = v; 143 break; 144 case (2): 145 h->base_includes = v; 146 break; 147 case (3): 148 h->oflags |= HTML_FRAGMENT; 149 break; 150 default: 151 break; 152 } 153 154 return(h); 155 } 156 157 void * 158 html_alloc(char *outopts) 159 { 160 161 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 162 } 163 164 165 void * 166 xhtml_alloc(char *outopts) 167 { 168 169 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 170 } 171 172 173 void 174 html_free(void *p) 175 { 176 struct tag *tag; 177 struct html *h; 178 179 h = (struct html *)p; 180 181 while ((tag = h->tags.head) != NULL) { 182 h->tags.head = tag->next; 183 free(tag); 184 } 185 186 if (h->symtab) 187 mchars_free(h->symtab); 188 189 free(h); 190 } 191 192 193 void 194 print_gen_head(struct html *h) 195 { 196 struct htmlpair tag[4]; 197 198 tag[0].key = ATTR_HTTPEQUIV; 199 tag[0].val = "Content-Type"; 200 tag[1].key = ATTR_CONTENT; 201 tag[1].val = "text/html; charset=utf-8"; 202 print_otag(h, TAG_META, 2, tag); 203 204 tag[0].key = ATTR_NAME; 205 tag[0].val = "resource-type"; 206 tag[1].key = ATTR_CONTENT; 207 tag[1].val = "document"; 208 print_otag(h, TAG_META, 2, tag); 209 210 if (h->style) { 211 tag[0].key = ATTR_REL; 212 tag[0].val = "stylesheet"; 213 tag[1].key = ATTR_HREF; 214 tag[1].val = h->style; 215 tag[2].key = ATTR_TYPE; 216 tag[2].val = "text/css"; 217 tag[3].key = ATTR_MEDIA; 218 tag[3].val = "all"; 219 print_otag(h, TAG_LINK, 4, tag); 220 } 221 } 222 223 static void 224 print_metaf(struct html *h, enum mandoc_esc deco) 225 { 226 enum htmlfont font; 227 228 switch (deco) { 229 case (ESCAPE_FONTPREV): 230 font = h->metal; 231 break; 232 case (ESCAPE_FONTITALIC): 233 font = HTMLFONT_ITALIC; 234 break; 235 case (ESCAPE_FONTBOLD): 236 font = HTMLFONT_BOLD; 237 break; 238 case (ESCAPE_FONT): 239 /* FALLTHROUGH */ 240 case (ESCAPE_FONTROMAN): 241 font = HTMLFONT_NONE; 242 break; 243 default: 244 abort(); 245 /* NOTREACHED */ 246 } 247 248 if (h->metaf) { 249 print_tagq(h, h->metaf); 250 h->metaf = NULL; 251 } 252 253 h->metal = h->metac; 254 h->metac = font; 255 256 if (HTMLFONT_NONE != font) 257 h->metaf = HTMLFONT_BOLD == font ? 258 print_otag(h, TAG_B, 0, NULL) : 259 print_otag(h, TAG_I, 0, NULL); 260 } 261 262 int 263 html_strlen(const char *cp) 264 { 265 int ssz, sz; 266 const char *seq, *p; 267 268 /* 269 * Account for escaped sequences within string length 270 * calculations. This follows the logic in term_strlen() as we 271 * must calculate the width of produced strings. 272 * Assume that characters are always width of "1". This is 273 * hacky, but it gets the job done for approximation of widths. 274 */ 275 276 sz = 0; 277 while (NULL != (p = strchr(cp, '\\'))) { 278 sz += (int)(p - cp); 279 ++cp; 280 switch (mandoc_escape(&cp, &seq, &ssz)) { 281 case (ESCAPE_ERROR): 282 return(sz); 283 case (ESCAPE_UNICODE): 284 /* FALLTHROUGH */ 285 case (ESCAPE_NUMBERED): 286 /* FALLTHROUGH */ 287 case (ESCAPE_SPECIAL): 288 sz++; 289 break; 290 default: 291 break; 292 } 293 } 294 295 assert(sz >= 0); 296 return(sz + strlen(cp)); 297 } 298 299 static int 300 print_encode(struct html *h, const char *p, int norecurse) 301 { 302 size_t sz; 303 int c, len, nospace; 304 const char *seq; 305 enum mandoc_esc esc; 306 static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' }; 307 308 nospace = 0; 309 310 while ('\0' != *p) { 311 sz = strcspn(p, rejs); 312 313 fwrite(p, 1, sz, stdout); 314 p += (int)sz; 315 316 if ('\0' == *p) 317 break; 318 319 switch (*p++) { 320 case ('<'): 321 printf("<"); 322 continue; 323 case ('>'): 324 printf(">"); 325 continue; 326 case ('&'): 327 printf("&"); 328 continue; 329 case (ASCII_HYPH): 330 putchar('-'); 331 continue; 332 default: 333 break; 334 } 335 336 esc = mandoc_escape(&p, &seq, &len); 337 if (ESCAPE_ERROR == esc) 338 break; 339 340 switch (esc) { 341 case (ESCAPE_UNICODE): 342 /* Skip passed "u" header. */ 343 c = mchars_num2uc(seq + 1, len - 1); 344 if ('\0' != c) 345 printf("&#x%x;", c); 346 break; 347 case (ESCAPE_NUMBERED): 348 c = mchars_num2char(seq, len); 349 if ('\0' != c) 350 putchar(c); 351 break; 352 case (ESCAPE_SPECIAL): 353 c = mchars_spec2cp(h->symtab, seq, len); 354 if (c > 0) 355 printf("&#%d;", c); 356 else if (-1 == c && 1 == len) 357 putchar((int)*seq); 358 break; 359 case (ESCAPE_FONT): 360 /* FALLTHROUGH */ 361 case (ESCAPE_FONTPREV): 362 /* FALLTHROUGH */ 363 case (ESCAPE_FONTBOLD): 364 /* FALLTHROUGH */ 365 case (ESCAPE_FONTITALIC): 366 /* FALLTHROUGH */ 367 case (ESCAPE_FONTROMAN): 368 if (norecurse) 369 break; 370 print_metaf(h, esc); 371 break; 372 case (ESCAPE_NOSPACE): 373 if ('\0' == *p) 374 nospace = 1; 375 break; 376 default: 377 break; 378 } 379 } 380 381 return(nospace); 382 } 383 384 385 static void 386 print_attr(struct html *h, const char *key, const char *val) 387 { 388 printf(" %s=\"", key); 389 (void)print_encode(h, val, 1); 390 putchar('\"'); 391 } 392 393 394 struct tag * 395 print_otag(struct html *h, enum htmltag tag, 396 int sz, const struct htmlpair *p) 397 { 398 int i; 399 struct tag *t; 400 401 /* Push this tags onto the stack of open scopes. */ 402 403 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 404 t = mandoc_malloc(sizeof(struct tag)); 405 t->tag = tag; 406 t->next = h->tags.head; 407 h->tags.head = t; 408 } else 409 t = NULL; 410 411 if ( ! (HTML_NOSPACE & h->flags)) 412 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 413 /* Manage keeps! */ 414 if ( ! (HTML_KEEP & h->flags)) { 415 if (HTML_PREKEEP & h->flags) 416 h->flags |= HTML_KEEP; 417 putchar(' '); 418 } else 419 printf(" "); 420 } 421 422 if ( ! (h->flags & HTML_NONOSPACE)) 423 h->flags &= ~HTML_NOSPACE; 424 else 425 h->flags |= HTML_NOSPACE; 426 427 /* Print out the tag name and attributes. */ 428 429 printf("<%s", htmltags[tag].name); 430 for (i = 0; i < sz; i++) 431 print_attr(h, htmlattrs[p[i].key], p[i].val); 432 433 /* Add non-overridable attributes. */ 434 435 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 436 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 437 print_attr(h, "xml:lang", "en"); 438 print_attr(h, "lang", "en"); 439 } 440 441 /* Accommodate for XML "well-formed" singleton escaping. */ 442 443 if (HTML_AUTOCLOSE & htmltags[tag].flags) 444 switch (h->type) { 445 case (HTML_XHTML_1_0_STRICT): 446 putchar('/'); 447 break; 448 default: 449 break; 450 } 451 452 putchar('>'); 453 454 h->flags |= HTML_NOSPACE; 455 456 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 457 putchar('\n'); 458 459 return(t); 460 } 461 462 463 static void 464 print_ctag(struct html *h, enum htmltag tag) 465 { 466 467 printf("</%s>", htmltags[tag].name); 468 if (HTML_CLRLINE & htmltags[tag].flags) { 469 h->flags |= HTML_NOSPACE; 470 putchar('\n'); 471 } 472 } 473 474 void 475 print_gen_decls(struct html *h) 476 { 477 const char *doctype; 478 const char *dtd; 479 const char *name; 480 481 switch (h->type) { 482 case (HTML_HTML_4_01_STRICT): 483 name = "HTML"; 484 doctype = "-//W3C//DTD HTML 4.01//EN"; 485 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 486 break; 487 default: 488 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 489 name = "html"; 490 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 491 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 492 break; 493 } 494 495 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 496 name, doctype, dtd); 497 } 498 499 void 500 print_text(struct html *h, const char *word) 501 { 502 503 if ( ! (HTML_NOSPACE & h->flags)) { 504 /* Manage keeps! */ 505 if ( ! (HTML_KEEP & h->flags)) { 506 if (HTML_PREKEEP & h->flags) 507 h->flags |= HTML_KEEP; 508 putchar(' '); 509 } else 510 printf(" "); 511 } 512 513 assert(NULL == h->metaf); 514 if (HTMLFONT_NONE != h->metac) 515 h->metaf = HTMLFONT_BOLD == h->metac ? 516 print_otag(h, TAG_B, 0, NULL) : 517 print_otag(h, TAG_I, 0, NULL); 518 519 assert(word); 520 if ( ! print_encode(h, word, 0)) { 521 if ( ! (h->flags & HTML_NONOSPACE)) 522 h->flags &= ~HTML_NOSPACE; 523 } else 524 h->flags |= HTML_NOSPACE; 525 526 if (h->metaf) { 527 print_tagq(h, h->metaf); 528 h->metaf = NULL; 529 } 530 531 h->flags &= ~HTML_IGNDELIM; 532 } 533 534 535 void 536 print_tagq(struct html *h, const struct tag *until) 537 { 538 struct tag *tag; 539 540 while ((tag = h->tags.head) != NULL) { 541 /* 542 * Remember to close out and nullify the current 543 * meta-font and table, if applicable. 544 */ 545 if (tag == h->metaf) 546 h->metaf = NULL; 547 if (tag == h->tblt) 548 h->tblt = NULL; 549 print_ctag(h, tag->tag); 550 h->tags.head = tag->next; 551 free(tag); 552 if (until && tag == until) 553 return; 554 } 555 } 556 557 558 void 559 print_stagq(struct html *h, const struct tag *suntil) 560 { 561 struct tag *tag; 562 563 while ((tag = h->tags.head) != NULL) { 564 if (suntil && tag == suntil) 565 return; 566 /* 567 * Remember to close out and nullify the current 568 * meta-font and table, if applicable. 569 */ 570 if (tag == h->metaf) 571 h->metaf = NULL; 572 if (tag == h->tblt) 573 h->tblt = NULL; 574 print_ctag(h, tag->tag); 575 h->tags.head = tag->next; 576 free(tag); 577 } 578 } 579 580 void 581 bufinit(struct html *h) 582 { 583 584 h->buf[0] = '\0'; 585 h->buflen = 0; 586 } 587 588 void 589 bufcat_style(struct html *h, const char *key, const char *val) 590 { 591 592 bufcat(h, key); 593 bufcat(h, ":"); 594 bufcat(h, val); 595 bufcat(h, ";"); 596 } 597 598 void 599 bufcat(struct html *h, const char *p) 600 { 601 602 h->buflen = strlcat(h->buf, p, BUFSIZ); 603 assert(h->buflen < BUFSIZ); 604 } 605 606 void 607 bufcat_fmt(struct html *h, const char *fmt, ...) 608 { 609 va_list ap; 610 611 va_start(ap, fmt); 612 (void)vsnprintf(h->buf + (int)h->buflen, 613 BUFSIZ - h->buflen - 1, fmt, ap); 614 va_end(ap); 615 h->buflen = strlen(h->buf); 616 } 617 618 static void 619 bufncat(struct html *h, const char *p, size_t sz) 620 { 621 622 assert(h->buflen + sz + 1 < BUFSIZ); 623 strncat(h->buf, p, sz); 624 h->buflen += sz; 625 } 626 627 void 628 buffmt_includes(struct html *h, const char *name) 629 { 630 const char *p, *pp; 631 632 pp = h->base_includes; 633 634 bufinit(h); 635 while (NULL != (p = strchr(pp, '%'))) { 636 bufncat(h, pp, (size_t)(p - pp)); 637 switch (*(p + 1)) { 638 case('I'): 639 bufcat(h, name); 640 break; 641 default: 642 bufncat(h, p, 2); 643 break; 644 } 645 pp = p + 2; 646 } 647 if (pp) 648 bufcat(h, pp); 649 } 650 651 void 652 buffmt_man(struct html *h, 653 const char *name, const char *sec) 654 { 655 const char *p, *pp; 656 657 pp = h->base_man; 658 659 bufinit(h); 660 while (NULL != (p = strchr(pp, '%'))) { 661 bufncat(h, pp, (size_t)(p - pp)); 662 switch (*(p + 1)) { 663 case('S'): 664 bufcat(h, sec ? sec : "1"); 665 break; 666 case('N'): 667 bufcat_fmt(h, name); 668 break; 669 default: 670 bufncat(h, p, 2); 671 break; 672 } 673 pp = p + 2; 674 } 675 if (pp) 676 bufcat(h, pp); 677 } 678 679 void 680 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 681 { 682 double v; 683 684 v = su->scale; 685 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 686 v = 1.0; 687 688 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 689 } 690 691 void 692 bufcat_id(struct html *h, const char *src) 693 { 694 695 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 696 697 while ('\0' != *src) 698 bufcat_fmt(h, "%.2x", *src++); 699 }