1 /* $Id: html.c,v 1.152 2013/08/08 20:07:47 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <stdarg.h> 27 #include <stdio.h> 28 #include <stdint.h> 29 #include <stdlib.h> 30 #include <string.h> 31 #include <unistd.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 #include "out.h" 36 #include "html.h" 37 #include "main.h" 38 39 struct htmldata { 40 const char *name; 41 int flags; 42 #define HTML_CLRLINE (1 << 0) 43 #define HTML_NOSTACK (1 << 1) 44 #define HTML_AUTOCLOSE (1 << 2) /* Tag has auto-closure. */ 45 }; 46 47 static const struct htmldata htmltags[TAG_MAX] = { 48 {"html", HTML_CLRLINE}, /* TAG_HTML */ 49 {"head", HTML_CLRLINE}, /* TAG_HEAD */ 50 {"body", HTML_CLRLINE}, /* TAG_BODY */ 51 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */ 52 {"title", HTML_CLRLINE}, /* TAG_TITLE */ 53 {"div", HTML_CLRLINE}, /* TAG_DIV */ 54 {"h1", 0}, /* TAG_H1 */ 55 {"h2", 0}, /* TAG_H2 */ 56 {"span", 0}, /* TAG_SPAN */ 57 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */ 58 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */ 59 {"a", 0}, /* TAG_A */ 60 {"table", HTML_CLRLINE}, /* TAG_TABLE */ 61 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */ 62 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */ 63 {"tr", HTML_CLRLINE}, /* TAG_TR */ 64 {"td", HTML_CLRLINE}, /* TAG_TD */ 65 {"li", HTML_CLRLINE}, /* TAG_LI */ 66 {"ul", HTML_CLRLINE}, /* TAG_UL */ 67 {"ol", HTML_CLRLINE}, /* TAG_OL */ 68 {"dl", HTML_CLRLINE}, /* TAG_DL */ 69 {"dt", HTML_CLRLINE}, /* TAG_DT */ 70 {"dd", HTML_CLRLINE}, /* TAG_DD */ 71 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */ 72 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */ 73 {"pre", HTML_CLRLINE }, /* TAG_PRE */ 74 {"b", 0 }, /* TAG_B */ 75 {"i", 0 }, /* TAG_I */ 76 {"code", 0 }, /* TAG_CODE */ 77 {"small", 0 }, /* TAG_SMALL */ 78 }; 79 80 static const char *const htmlattrs[ATTR_MAX] = { 81 "http-equiv", /* ATTR_HTTPEQUIV */ 82 "content", /* ATTR_CONTENT */ 83 "name", /* ATTR_NAME */ 84 "rel", /* ATTR_REL */ 85 "href", /* ATTR_HREF */ 86 "type", /* ATTR_TYPE */ 87 "media", /* ATTR_MEDIA */ 88 "class", /* ATTR_CLASS */ 89 "style", /* ATTR_STYLE */ 90 "width", /* ATTR_WIDTH */ 91 "id", /* ATTR_ID */ 92 "summary", /* ATTR_SUMMARY */ 93 "align", /* ATTR_ALIGN */ 94 "colspan", /* ATTR_COLSPAN */ 95 }; 96 97 static const char *const roffscales[SCALE_MAX] = { 98 "cm", /* SCALE_CM */ 99 "in", /* SCALE_IN */ 100 "pc", /* SCALE_PC */ 101 "pt", /* SCALE_PT */ 102 "em", /* SCALE_EM */ 103 "em", /* SCALE_MM */ 104 "ex", /* SCALE_EN */ 105 "ex", /* SCALE_BU */ 106 "em", /* SCALE_VS */ 107 "ex", /* SCALE_FS */ 108 }; 109 110 static void bufncat(struct html *, const char *, size_t); 111 static void print_ctag(struct html *, enum htmltag); 112 static int print_encode(struct html *, const char *, int); 113 static void print_metaf(struct html *, enum mandoc_esc); 114 static void print_attr(struct html *, const char *, const char *); 115 static void *ml_alloc(char *, enum htmltype); 116 117 static void * 118 ml_alloc(char *outopts, enum htmltype type) 119 { 120 struct html *h; 121 const char *toks[5]; 122 char *v; 123 124 toks[0] = "style"; 125 toks[1] = "man"; 126 toks[2] = "includes"; 127 toks[3] = "fragment"; 128 toks[4] = NULL; 129 130 h = mandoc_calloc(1, sizeof(struct html)); 131 132 h->type = type; 133 h->tags.head = NULL; 134 h->symtab = mchars_alloc(); 135 136 while (outopts && *outopts) 137 switch (getsubopt(&outopts, UNCONST(toks), &v)) { 138 case (0): 139 h->style = v; 140 break; 141 case (1): 142 h->base_man = v; 143 break; 144 case (2): 145 h->base_includes = v; 146 break; 147 case (3): 148 h->oflags |= HTML_FRAGMENT; 149 break; 150 default: 151 break; 152 } 153 154 return(h); 155 } 156 157 void * 158 html_alloc(char *outopts) 159 { 160 161 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT)); 162 } 163 164 165 void * 166 xhtml_alloc(char *outopts) 167 { 168 169 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT)); 170 } 171 172 173 void 174 html_free(void *p) 175 { 176 struct tag *tag; 177 struct html *h; 178 179 h = (struct html *)p; 180 181 while ((tag = h->tags.head) != NULL) { 182 h->tags.head = tag->next; 183 free(tag); 184 } 185 186 if (h->symtab) 187 mchars_free(h->symtab); 188 189 free(h); 190 } 191 192 193 void 194 print_gen_head(struct html *h) 195 { 196 struct htmlpair tag[4]; 197 198 tag[0].key = ATTR_HTTPEQUIV; 199 tag[0].val = "Content-Type"; 200 tag[1].key = ATTR_CONTENT; 201 tag[1].val = "text/html; charset=utf-8"; 202 print_otag(h, TAG_META, 2, tag); 203 204 tag[0].key = ATTR_NAME; 205 tag[0].val = "resource-type"; 206 tag[1].key = ATTR_CONTENT; 207 tag[1].val = "document"; 208 print_otag(h, TAG_META, 2, tag); 209 210 if (h->style) { 211 tag[0].key = ATTR_REL; 212 tag[0].val = "stylesheet"; 213 tag[1].key = ATTR_HREF; 214 tag[1].val = h->style; 215 tag[2].key = ATTR_TYPE; 216 tag[2].val = "text/css"; 217 tag[3].key = ATTR_MEDIA; 218 tag[3].val = "all"; 219 print_otag(h, TAG_LINK, 4, tag); 220 } 221 } 222 223 static void 224 print_metaf(struct html *h, enum mandoc_esc deco) 225 { 226 enum htmlfont font; 227 228 switch (deco) { 229 case (ESCAPE_FONTPREV): 230 font = h->metal; 231 break; 232 case (ESCAPE_FONTITALIC): 233 font = HTMLFONT_ITALIC; 234 break; 235 case (ESCAPE_FONTBOLD): 236 font = HTMLFONT_BOLD; 237 break; 238 case (ESCAPE_FONTBI): 239 font = HTMLFONT_BI; 240 break; 241 case (ESCAPE_FONT): 242 /* FALLTHROUGH */ 243 case (ESCAPE_FONTROMAN): 244 font = HTMLFONT_NONE; 245 break; 246 default: 247 abort(); 248 /* NOTREACHED */ 249 } 250 251 if (h->metaf) { 252 print_tagq(h, h->metaf); 253 h->metaf = NULL; 254 } 255 256 h->metal = h->metac; 257 h->metac = font; 258 259 switch (font) { 260 case (HTMLFONT_ITALIC): 261 h->metaf = print_otag(h, TAG_I, 0, NULL); 262 break; 263 case (HTMLFONT_BOLD): 264 h->metaf = print_otag(h, TAG_B, 0, NULL); 265 break; 266 case (HTMLFONT_BI): 267 h->metaf = print_otag(h, TAG_B, 0, NULL); 268 print_otag(h, TAG_I, 0, NULL); 269 break; 270 default: 271 break; 272 } 273 } 274 275 int 276 html_strlen(const char *cp) 277 { 278 size_t rsz; 279 int skip, sz; 280 281 /* 282 * Account for escaped sequences within string length 283 * calculations. This follows the logic in term_strlen() as we 284 * must calculate the width of produced strings. 285 * Assume that characters are always width of "1". This is 286 * hacky, but it gets the job done for approximation of widths. 287 */ 288 289 sz = 0; 290 skip = 0; 291 while (1) { 292 rsz = strcspn(cp, "\\"); 293 if (rsz) { 294 cp += rsz; 295 if (skip) { 296 skip = 0; 297 rsz--; 298 } 299 sz += rsz; 300 } 301 if ('\0' == *cp) 302 break; 303 cp++; 304 switch (mandoc_escape(&cp, NULL, NULL)) { 305 case (ESCAPE_ERROR): 306 return(sz); 307 case (ESCAPE_UNICODE): 308 /* FALLTHROUGH */ 309 case (ESCAPE_NUMBERED): 310 /* FALLTHROUGH */ 311 case (ESCAPE_SPECIAL): 312 if (skip) 313 skip = 0; 314 else 315 sz++; 316 break; 317 case (ESCAPE_SKIPCHAR): 318 skip = 1; 319 break; 320 default: 321 break; 322 } 323 } 324 return(sz); 325 } 326 327 static int 328 print_encode(struct html *h, const char *p, int norecurse) 329 { 330 size_t sz; 331 int c, len, nospace; 332 const char *seq; 333 enum mandoc_esc esc; 334 static const char rejs[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' }; 335 336 nospace = 0; 337 338 while ('\0' != *p) { 339 if (HTML_SKIPCHAR & h->flags && '\\' != *p) { 340 h->flags &= ~HTML_SKIPCHAR; 341 p++; 342 continue; 343 } 344 345 sz = strcspn(p, rejs); 346 347 fwrite(p, 1, sz, stdout); 348 p += (int)sz; 349 350 if ('\0' == *p) 351 break; 352 353 switch (*p++) { 354 case ('<'): 355 printf("<"); 356 continue; 357 case ('>'): 358 printf(">"); 359 continue; 360 case ('&'): 361 printf("&"); 362 continue; 363 case (ASCII_HYPH): 364 putchar('-'); 365 continue; 366 default: 367 break; 368 } 369 370 esc = mandoc_escape(&p, &seq, &len); 371 if (ESCAPE_ERROR == esc) 372 break; 373 374 switch (esc) { 375 case (ESCAPE_FONT): 376 /* FALLTHROUGH */ 377 case (ESCAPE_FONTPREV): 378 /* FALLTHROUGH */ 379 case (ESCAPE_FONTBOLD): 380 /* FALLTHROUGH */ 381 case (ESCAPE_FONTITALIC): 382 /* FALLTHROUGH */ 383 case (ESCAPE_FONTBI): 384 /* FALLTHROUGH */ 385 case (ESCAPE_FONTROMAN): 386 if (0 == norecurse) 387 print_metaf(h, esc); 388 continue; 389 case (ESCAPE_SKIPCHAR): 390 h->flags |= HTML_SKIPCHAR; 391 continue; 392 default: 393 break; 394 } 395 396 if (h->flags & HTML_SKIPCHAR) { 397 h->flags &= ~HTML_SKIPCHAR; 398 continue; 399 } 400 401 switch (esc) { 402 case (ESCAPE_UNICODE): 403 /* Skip passed "u" header. */ 404 c = mchars_num2uc(seq + 1, len - 1); 405 if ('\0' != c) 406 printf("&#x%x;", c); 407 break; 408 case (ESCAPE_NUMBERED): 409 c = mchars_num2char(seq, len); 410 if ('\0' != c) 411 putchar(c); 412 break; 413 case (ESCAPE_SPECIAL): 414 c = mchars_spec2cp(h->symtab, seq, len); 415 if (c > 0) 416 printf("&#%d;", c); 417 else if (-1 == c && 1 == len) 418 putchar((int)*seq); 419 break; 420 case (ESCAPE_NOSPACE): 421 if ('\0' == *p) 422 nospace = 1; 423 break; 424 default: 425 break; 426 } 427 } 428 429 return(nospace); 430 } 431 432 433 static void 434 print_attr(struct html *h, const char *key, const char *val) 435 { 436 printf(" %s=\"", key); 437 (void)print_encode(h, val, 1); 438 putchar('\"'); 439 } 440 441 442 struct tag * 443 print_otag(struct html *h, enum htmltag tag, 444 int sz, const struct htmlpair *p) 445 { 446 int i; 447 struct tag *t; 448 449 /* Push this tags onto the stack of open scopes. */ 450 451 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) { 452 t = mandoc_malloc(sizeof(struct tag)); 453 t->tag = tag; 454 t->next = h->tags.head; 455 h->tags.head = t; 456 } else 457 t = NULL; 458 459 if ( ! (HTML_NOSPACE & h->flags)) 460 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) { 461 /* Manage keeps! */ 462 if ( ! (HTML_KEEP & h->flags)) { 463 if (HTML_PREKEEP & h->flags) 464 h->flags |= HTML_KEEP; 465 putchar(' '); 466 } else 467 printf(" "); 468 } 469 470 if ( ! (h->flags & HTML_NONOSPACE)) 471 h->flags &= ~HTML_NOSPACE; 472 else 473 h->flags |= HTML_NOSPACE; 474 475 /* Print out the tag name and attributes. */ 476 477 printf("<%s", htmltags[tag].name); 478 for (i = 0; i < sz; i++) 479 print_attr(h, htmlattrs[p[i].key], p[i].val); 480 481 /* Add non-overridable attributes. */ 482 483 if (TAG_HTML == tag && HTML_XHTML_1_0_STRICT == h->type) { 484 print_attr(h, "xmlns", "http://www.w3.org/1999/xhtml"); 485 print_attr(h, "xml:lang", "en"); 486 print_attr(h, "lang", "en"); 487 } 488 489 /* Accommodate for XML "well-formed" singleton escaping. */ 490 491 if (HTML_AUTOCLOSE & htmltags[tag].flags) 492 switch (h->type) { 493 case (HTML_XHTML_1_0_STRICT): 494 putchar('/'); 495 break; 496 default: 497 break; 498 } 499 500 putchar('>'); 501 502 h->flags |= HTML_NOSPACE; 503 504 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags) 505 putchar('\n'); 506 507 return(t); 508 } 509 510 511 static void 512 print_ctag(struct html *h, enum htmltag tag) 513 { 514 515 printf("</%s>", htmltags[tag].name); 516 if (HTML_CLRLINE & htmltags[tag].flags) { 517 h->flags |= HTML_NOSPACE; 518 putchar('\n'); 519 } 520 } 521 522 void 523 print_gen_decls(struct html *h) 524 { 525 const char *doctype; 526 const char *dtd; 527 const char *name; 528 529 switch (h->type) { 530 case (HTML_HTML_4_01_STRICT): 531 name = "HTML"; 532 doctype = "-//W3C//DTD HTML 4.01//EN"; 533 dtd = "http://www.w3.org/TR/html4/strict.dtd"; 534 break; 535 default: 536 puts("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); 537 name = "html"; 538 doctype = "-//W3C//DTD XHTML 1.0 Strict//EN"; 539 dtd = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 540 break; 541 } 542 543 printf("<!DOCTYPE %s PUBLIC \"%s\" \"%s\">\n", 544 name, doctype, dtd); 545 } 546 547 void 548 print_text(struct html *h, const char *word) 549 { 550 551 if ( ! (HTML_NOSPACE & h->flags)) { 552 /* Manage keeps! */ 553 if ( ! (HTML_KEEP & h->flags)) { 554 if (HTML_PREKEEP & h->flags) 555 h->flags |= HTML_KEEP; 556 putchar(' '); 557 } else 558 printf(" "); 559 } 560 561 assert(NULL == h->metaf); 562 switch (h->metac) { 563 case (HTMLFONT_ITALIC): 564 h->metaf = print_otag(h, TAG_I, 0, NULL); 565 break; 566 case (HTMLFONT_BOLD): 567 h->metaf = print_otag(h, TAG_B, 0, NULL); 568 break; 569 case (HTMLFONT_BI): 570 h->metaf = print_otag(h, TAG_B, 0, NULL); 571 print_otag(h, TAG_I, 0, NULL); 572 break; 573 default: 574 break; 575 } 576 577 assert(word); 578 if ( ! print_encode(h, word, 0)) { 579 if ( ! (h->flags & HTML_NONOSPACE)) 580 h->flags &= ~HTML_NOSPACE; 581 } else 582 h->flags |= HTML_NOSPACE; 583 584 if (h->metaf) { 585 print_tagq(h, h->metaf); 586 h->metaf = NULL; 587 } 588 589 h->flags &= ~HTML_IGNDELIM; 590 } 591 592 593 void 594 print_tagq(struct html *h, const struct tag *until) 595 { 596 struct tag *tag; 597 598 while ((tag = h->tags.head) != NULL) { 599 /* 600 * Remember to close out and nullify the current 601 * meta-font and table, if applicable. 602 */ 603 if (tag == h->metaf) 604 h->metaf = NULL; 605 if (tag == h->tblt) 606 h->tblt = NULL; 607 print_ctag(h, tag->tag); 608 h->tags.head = tag->next; 609 free(tag); 610 if (until && tag == until) 611 return; 612 } 613 } 614 615 616 void 617 print_stagq(struct html *h, const struct tag *suntil) 618 { 619 struct tag *tag; 620 621 while ((tag = h->tags.head) != NULL) { 622 if (suntil && tag == suntil) 623 return; 624 /* 625 * Remember to close out and nullify the current 626 * meta-font and table, if applicable. 627 */ 628 if (tag == h->metaf) 629 h->metaf = NULL; 630 if (tag == h->tblt) 631 h->tblt = NULL; 632 print_ctag(h, tag->tag); 633 h->tags.head = tag->next; 634 free(tag); 635 } 636 } 637 638 void 639 bufinit(struct html *h) 640 { 641 642 h->buf[0] = '\0'; 643 h->buflen = 0; 644 } 645 646 void 647 bufcat_style(struct html *h, const char *key, const char *val) 648 { 649 650 bufcat(h, key); 651 bufcat(h, ":"); 652 bufcat(h, val); 653 bufcat(h, ";"); 654 } 655 656 void 657 bufcat(struct html *h, const char *p) 658 { 659 660 h->buflen = strlcat(h->buf, p, BUFSIZ); 661 assert(h->buflen < BUFSIZ); 662 } 663 664 void 665 bufcat_fmt(struct html *h, const char *fmt, ...) 666 { 667 va_list ap; 668 669 va_start(ap, fmt); 670 (void)vsnprintf(h->buf + (int)h->buflen, 671 BUFSIZ - h->buflen - 1, fmt, ap); 672 va_end(ap); 673 h->buflen = strlen(h->buf); 674 } 675 676 static void 677 bufncat(struct html *h, const char *p, size_t sz) 678 { 679 680 assert(h->buflen + sz + 1 < BUFSIZ); 681 strncat(h->buf, p, sz); 682 h->buflen += sz; 683 } 684 685 void 686 buffmt_includes(struct html *h, const char *name) 687 { 688 const char *p, *pp; 689 690 pp = h->base_includes; 691 692 bufinit(h); 693 while (NULL != (p = strchr(pp, '%'))) { 694 bufncat(h, pp, (size_t)(p - pp)); 695 switch (*(p + 1)) { 696 case('I'): 697 bufcat(h, name); 698 break; 699 default: 700 bufncat(h, p, 2); 701 break; 702 } 703 pp = p + 2; 704 } 705 if (pp) 706 bufcat(h, pp); 707 } 708 709 void 710 buffmt_man(struct html *h, 711 const char *name, const char *sec) 712 { 713 const char *p, *pp; 714 715 pp = h->base_man; 716 717 bufinit(h); 718 while (NULL != (p = strchr(pp, '%'))) { 719 bufncat(h, pp, (size_t)(p - pp)); 720 switch (*(p + 1)) { 721 case('S'): 722 bufcat(h, sec ? sec : "1"); 723 break; 724 case('N'): 725 bufcat_fmt(h, name); 726 break; 727 default: 728 bufncat(h, p, 2); 729 break; 730 } 731 pp = p + 2; 732 } 733 if (pp) 734 bufcat(h, pp); 735 } 736 737 void 738 bufcat_su(struct html *h, const char *p, const struct roffsu *su) 739 { 740 double v; 741 742 v = su->scale; 743 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0)) 744 v = 1.0; 745 746 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]); 747 } 748 749 void 750 bufcat_id(struct html *h, const char *src) 751 { 752 753 /* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */ 754 755 while ('\0' != *src) 756 bufcat_fmt(h, "%.2x", *src++); 757 }