1 /* $Id: mandoc.c,v 1.62 2011/12/03 16:08:51 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <sys/types.h> 23 24 #include <assert.h> 25 #include <ctype.h> 26 #include <errno.h> 27 #include <limits.h> 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 #include <time.h> 32 33 #include "mandoc.h" 34 #include "libmandoc.h" 35 36 #define DATESIZE 32 37 38 static int a2time(time_t *, const char *, const char *); 39 static char *time2a(time_t); 40 static int numescape(const char *); 41 42 /* 43 * Pass over recursive numerical expressions. This context of this 44 * function is important: it's only called within character-terminating 45 * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial 46 * recursion: we don't care about what's in these blocks. 47 * This returns the number of characters skipped or -1 if an error 48 * occurs (the caller should bail). 49 */ 50 static int 51 numescape(const char *start) 52 { 53 int i; 54 size_t sz; 55 const char *cp; 56 57 i = 0; 58 59 /* The expression consists of a subexpression. */ 60 61 if ('\\' == start[i]) { 62 cp = &start[++i]; 63 /* 64 * Read past the end of the subexpression. 65 * Bail immediately on errors. 66 */ 67 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 68 return(-1); 69 return(i + cp - &start[i]); 70 } 71 72 if ('(' != start[i++]) 73 return(0); 74 75 /* 76 * A parenthesised subexpression. Read until the closing 77 * parenthesis, making sure to handle any nested subexpressions 78 * that might ruin our parse. 79 */ 80 81 while (')' != start[i]) { 82 sz = strcspn(&start[i], ")\\"); 83 i += (int)sz; 84 85 if ('\0' == start[i]) 86 return(-1); 87 else if ('\\' != start[i]) 88 continue; 89 90 cp = &start[++i]; 91 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) 92 return(-1); 93 i += cp - &start[i]; 94 } 95 96 /* Read past the terminating ')'. */ 97 return(++i); 98 } 99 100 enum mandoc_esc 101 mandoc_escape(const char **end, const char **start, int *sz) 102 { 103 char c, term, numeric; 104 int i, lim, ssz, rlim; 105 const char *cp, *rstart; 106 enum mandoc_esc gly; 107 108 cp = *end; 109 rstart = cp; 110 if (start) 111 *start = rstart; 112 i = lim = 0; 113 gly = ESCAPE_ERROR; 114 term = numeric = '\0'; 115 116 switch ((c = cp[i++])) { 117 /* 118 * First the glyphs. There are several different forms of 119 * these, but each eventually returns a substring of the glyph 120 * name. 121 */ 122 case ('('): 123 gly = ESCAPE_SPECIAL; 124 lim = 2; 125 break; 126 case ('['): 127 gly = ESCAPE_SPECIAL; 128 /* 129 * Unicode escapes are defined in groff as \[uXXXX] to 130 * \[u10FFFF], where the contained value must be a valid 131 * Unicode codepoint. Here, however, only check whether 132 * it's not a zero-width escape. 133 */ 134 if ('u' == cp[i] && ']' != cp[i + 1]) 135 gly = ESCAPE_UNICODE; 136 term = ']'; 137 break; 138 case ('C'): 139 if ('\'' != cp[i]) 140 return(ESCAPE_ERROR); 141 gly = ESCAPE_SPECIAL; 142 term = '\''; 143 break; 144 145 /* 146 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where 147 * 'X' is the trigger. These have opaque sub-strings. 148 */ 149 case ('F'): 150 /* FALLTHROUGH */ 151 case ('g'): 152 /* FALLTHROUGH */ 153 case ('k'): 154 /* FALLTHROUGH */ 155 case ('M'): 156 /* FALLTHROUGH */ 157 case ('m'): 158 /* FALLTHROUGH */ 159 case ('n'): 160 /* FALLTHROUGH */ 161 case ('V'): 162 /* FALLTHROUGH */ 163 case ('Y'): 164 gly = ESCAPE_IGNORE; 165 /* FALLTHROUGH */ 166 case ('f'): 167 if (ESCAPE_ERROR == gly) 168 gly = ESCAPE_FONT; 169 170 rstart= &cp[i]; 171 if (start) 172 *start = rstart; 173 174 switch (cp[i++]) { 175 case ('('): 176 lim = 2; 177 break; 178 case ('['): 179 term = ']'; 180 break; 181 default: 182 lim = 1; 183 i--; 184 break; 185 } 186 break; 187 188 /* 189 * These escapes are of the form \X'Y', where 'X' is the trigger 190 * and 'Y' is any string. These have opaque sub-strings. 191 */ 192 case ('A'): 193 /* FALLTHROUGH */ 194 case ('b'): 195 /* FALLTHROUGH */ 196 case ('D'): 197 /* FALLTHROUGH */ 198 case ('o'): 199 /* FALLTHROUGH */ 200 case ('R'): 201 /* FALLTHROUGH */ 202 case ('X'): 203 /* FALLTHROUGH */ 204 case ('Z'): 205 if ('\'' != cp[i++]) 206 return(ESCAPE_ERROR); 207 gly = ESCAPE_IGNORE; 208 term = '\''; 209 break; 210 211 /* 212 * These escapes are of the form \X'N', where 'X' is the trigger 213 * and 'N' resolves to a numerical expression. 214 */ 215 case ('B'): 216 /* FALLTHROUGH */ 217 case ('h'): 218 /* FALLTHROUGH */ 219 case ('H'): 220 /* FALLTHROUGH */ 221 case ('L'): 222 /* FALLTHROUGH */ 223 case ('l'): 224 gly = ESCAPE_NUMBERED; 225 /* FALLTHROUGH */ 226 case ('S'): 227 /* FALLTHROUGH */ 228 case ('v'): 229 /* FALLTHROUGH */ 230 case ('w'): 231 /* FALLTHROUGH */ 232 case ('x'): 233 if (ESCAPE_ERROR == gly) 234 gly = ESCAPE_IGNORE; 235 if ('\'' != cp[i++]) 236 return(ESCAPE_ERROR); 237 term = numeric = '\''; 238 break; 239 240 /* 241 * Special handling for the numbered character escape. 242 * XXX Do any other escapes need similar handling? 243 */ 244 case ('N'): 245 if ('\0' == cp[i]) 246 return(ESCAPE_ERROR); 247 *end = &cp[++i]; 248 if (isdigit((unsigned char)cp[i-1])) 249 return(ESCAPE_IGNORE); 250 while (isdigit((unsigned char)**end)) 251 (*end)++; 252 if (start) 253 *start = &cp[i]; 254 if (sz) 255 *sz = *end - &cp[i]; 256 if ('\0' != **end) 257 (*end)++; 258 return(ESCAPE_NUMBERED); 259 260 /* 261 * Sizes get a special category of their own. 262 */ 263 case ('s'): 264 gly = ESCAPE_IGNORE; 265 266 rstart = &cp[i]; 267 if (start) 268 *start = rstart; 269 270 /* See +/- counts as a sign. */ 271 c = cp[i]; 272 if ('+' == c || '-' == c || ASCII_HYPH == c) 273 ++i; 274 275 switch (cp[i++]) { 276 case ('('): 277 lim = 2; 278 break; 279 case ('['): 280 term = numeric = ']'; 281 break; 282 case ('\''): 283 term = numeric = '\''; 284 break; 285 default: 286 lim = 1; 287 i--; 288 break; 289 } 290 291 /* See +/- counts as a sign. */ 292 c = cp[i]; 293 if ('+' == c || '-' == c || ASCII_HYPH == c) 294 ++i; 295 296 break; 297 298 /* 299 * Anything else is assumed to be a glyph. 300 */ 301 default: 302 gly = ESCAPE_SPECIAL; 303 lim = 1; 304 i--; 305 break; 306 } 307 308 assert(ESCAPE_ERROR != gly); 309 310 rstart = &cp[i]; 311 if (start) 312 *start = rstart; 313 314 /* 315 * If a terminating block has been specified, we need to 316 * handle the case of recursion, which could have their 317 * own terminating blocks that mess up our parse. This, by the 318 * way, means that the "start" and "size" values will be 319 * effectively meaningless. 320 */ 321 322 ssz = 0; 323 if (numeric && -1 == (ssz = numescape(&cp[i]))) 324 return(ESCAPE_ERROR); 325 326 i += ssz; 327 rlim = -1; 328 329 /* 330 * We have a character terminator. Try to read up to that 331 * character. If we can't (i.e., we hit the nil), then return 332 * an error; if we can, calculate our length, read past the 333 * terminating character, and exit. 334 */ 335 336 if ('\0' != term) { 337 *end = strchr(&cp[i], term); 338 if ('\0' == *end) 339 return(ESCAPE_ERROR); 340 341 rlim = *end - &cp[i]; 342 if (sz) 343 *sz = rlim; 344 (*end)++; 345 goto out; 346 } 347 348 assert(lim > 0); 349 350 /* 351 * We have a numeric limit. If the string is shorter than that, 352 * stop and return an error. Else adjust our endpoint, length, 353 * and return the current glyph. 354 */ 355 356 if ((size_t)lim > strlen(&cp[i])) 357 return(ESCAPE_ERROR); 358 359 rlim = lim; 360 if (sz) 361 *sz = rlim; 362 363 *end = &cp[i] + lim; 364 365 out: 366 assert(rlim >= 0 && rstart); 367 368 /* Run post-processors. */ 369 370 switch (gly) { 371 case (ESCAPE_FONT): 372 /* 373 * Pretend that the constant-width font modes are the 374 * same as the regular font modes. 375 */ 376 if (2 == rlim && 'C' == *rstart) 377 rstart++; 378 else if (1 != rlim) 379 break; 380 381 switch (*rstart) { 382 case ('3'): 383 /* FALLTHROUGH */ 384 case ('B'): 385 gly = ESCAPE_FONTBOLD; 386 break; 387 case ('2'): 388 /* FALLTHROUGH */ 389 case ('I'): 390 gly = ESCAPE_FONTITALIC; 391 break; 392 case ('P'): 393 gly = ESCAPE_FONTPREV; 394 break; 395 case ('1'): 396 /* FALLTHROUGH */ 397 case ('R'): 398 gly = ESCAPE_FONTROMAN; 399 break; 400 } 401 break; 402 case (ESCAPE_SPECIAL): 403 if (1 != rlim) 404 break; 405 if ('c' == *rstart) 406 gly = ESCAPE_NOSPACE; 407 break; 408 default: 409 break; 410 } 411 412 return(gly); 413 } 414 415 void * 416 mandoc_calloc(size_t num, size_t size) 417 { 418 void *ptr; 419 420 ptr = calloc(num, size); 421 if (NULL == ptr) { 422 perror(NULL); 423 exit((int)MANDOCLEVEL_SYSERR); 424 } 425 426 return(ptr); 427 } 428 429 430 void * 431 mandoc_malloc(size_t size) 432 { 433 void *ptr; 434 435 ptr = malloc(size); 436 if (NULL == ptr) { 437 perror(NULL); 438 exit((int)MANDOCLEVEL_SYSERR); 439 } 440 441 return(ptr); 442 } 443 444 445 void * 446 mandoc_realloc(void *ptr, size_t size) 447 { 448 449 ptr = realloc(ptr, size); 450 if (NULL == ptr) { 451 perror(NULL); 452 exit((int)MANDOCLEVEL_SYSERR); 453 } 454 455 return(ptr); 456 } 457 458 char * 459 mandoc_strndup(const char *ptr, size_t sz) 460 { 461 char *p; 462 463 p = mandoc_malloc(sz + 1); 464 memcpy(p, ptr, sz); 465 p[(int)sz] = '\0'; 466 return(p); 467 } 468 469 char * 470 mandoc_strdup(const char *ptr) 471 { 472 char *p; 473 474 p = strdup(ptr); 475 if (NULL == p) { 476 perror(NULL); 477 exit((int)MANDOCLEVEL_SYSERR); 478 } 479 480 return(p); 481 } 482 483 /* 484 * Parse a quoted or unquoted roff-style request or macro argument. 485 * Return a pointer to the parsed argument, which is either the original 486 * pointer or advanced by one byte in case the argument is quoted. 487 * Null-terminate the argument in place. 488 * Collapse pairs of quotes inside quoted arguments. 489 * Advance the argument pointer to the next argument, 490 * or to the null byte terminating the argument line. 491 */ 492 char * 493 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) 494 { 495 char *start, *cp; 496 int quoted, pairs, white; 497 498 /* Quoting can only start with a new word. */ 499 start = *cpp; 500 quoted = 0; 501 if ('"' == *start) { 502 quoted = 1; 503 start++; 504 } 505 506 pairs = 0; 507 white = 0; 508 for (cp = start; '\0' != *cp; cp++) { 509 /* Move left after quoted quotes and escaped backslashes. */ 510 if (pairs) 511 cp[-pairs] = cp[0]; 512 if ('\\' == cp[0]) { 513 if ('\\' == cp[1]) { 514 /* Poor man's copy mode. */ 515 pairs++; 516 cp++; 517 } else if (0 == quoted && ' ' == cp[1]) 518 /* Skip escaped blanks. */ 519 cp++; 520 } else if (0 == quoted) { 521 if (' ' == cp[0]) { 522 /* Unescaped blanks end unquoted args. */ 523 white = 1; 524 break; 525 } 526 } else if ('"' == cp[0]) { 527 if ('"' == cp[1]) { 528 /* Quoted quotes collapse. */ 529 pairs++; 530 cp++; 531 } else { 532 /* Unquoted quotes end quoted args. */ 533 quoted = 2; 534 break; 535 } 536 } 537 } 538 539 /* Quoted argument without a closing quote. */ 540 if (1 == quoted) 541 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); 542 543 /* Null-terminate this argument and move to the next one. */ 544 if (pairs) 545 cp[-pairs] = '\0'; 546 if ('\0' != *cp) { 547 *cp++ = '\0'; 548 while (' ' == *cp) 549 cp++; 550 } 551 *pos += (int)(cp - start) + (quoted ? 1 : 0); 552 *cpp = cp; 553 554 if ('\0' == *cp && (white || ' ' == cp[-1])) 555 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL); 556 557 return(start); 558 } 559 560 static int 561 a2time(time_t *t, const char *fmt, const char *p) 562 { 563 struct tm tm; 564 char *pp; 565 566 memset(&tm, 0, sizeof(struct tm)); 567 568 pp = NULL; 569 #ifdef HAVE_STRPTIME 570 pp = strptime(p, fmt, &tm); 571 #endif 572 if (NULL != pp && '\0' == *pp) { 573 *t = mktime(&tm); 574 return(1); 575 } 576 577 return(0); 578 } 579 580 static char * 581 time2a(time_t t) 582 { 583 struct tm *tm; 584 char *buf, *p; 585 size_t ssz; 586 int isz; 587 588 tm = localtime(&t); 589 590 /* 591 * Reserve space: 592 * up to 9 characters for the month (September) + blank 593 * up to 2 characters for the day + comma + blank 594 * 4 characters for the year and a terminating '\0' 595 */ 596 p = buf = mandoc_malloc(10 + 4 + 4 + 1); 597 598 if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) 599 goto fail; 600 p += (int)ssz; 601 602 if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) 603 goto fail; 604 p += isz; 605 606 if (0 == strftime(p, 4 + 1, "%Y", tm)) 607 goto fail; 608 return(buf); 609 610 fail: 611 free(buf); 612 return(NULL); 613 } 614 615 char * 616 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos) 617 { 618 char *out; 619 time_t t; 620 621 if (NULL == in || '\0' == *in || 622 0 == strcmp(in, "$" "Mdocdate$")) { 623 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL); 624 time(&t); 625 } 626 else if (a2time(&t, "%Y-%m-%d", in)) 627 t = 0; 628 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) && 629 !a2time(&t, "%b %d, %Y", in)) { 630 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL); 631 t = 0; 632 } 633 out = t ? time2a(t) : NULL; 634 return(out ? out : mandoc_strdup(in)); 635 } 636 637 int 638 mandoc_eos(const char *p, size_t sz, int enclosed) 639 { 640 const char *q; 641 int found; 642 643 if (0 == sz) 644 return(0); 645 646 /* 647 * End-of-sentence recognition must include situations where 648 * some symbols, such as `)', allow prior EOS punctuation to 649 * propagate outward. 650 */ 651 652 found = 0; 653 for (q = p + (int)sz - 1; q >= p; q--) { 654 switch (*q) { 655 case ('\"'): 656 /* FALLTHROUGH */ 657 case ('\''): 658 /* FALLTHROUGH */ 659 case (']'): 660 /* FALLTHROUGH */ 661 case (')'): 662 if (0 == found) 663 enclosed = 1; 664 break; 665 case ('.'): 666 /* FALLTHROUGH */ 667 case ('!'): 668 /* FALLTHROUGH */ 669 case ('?'): 670 found = 1; 671 break; 672 default: 673 return(found && (!enclosed || isalnum((unsigned char)*q))); 674 } 675 } 676 677 return(found && !enclosed); 678 } 679 680 /* 681 * Find out whether a line is a macro line or not. If it is, adjust the 682 * current position and return one; if it isn't, return zero and don't 683 * change the current position. 684 */ 685 int 686 mandoc_getcontrol(const char *cp, int *ppos) 687 { 688 int pos; 689 690 pos = *ppos; 691 692 if ('\\' == cp[pos] && '.' == cp[pos + 1]) 693 pos += 2; 694 else if ('.' == cp[pos] || '\'' == cp[pos]) 695 pos++; 696 else 697 return(0); 698 699 while (' ' == cp[pos] || '\t' == cp[pos]) 700 pos++; 701 702 *ppos = pos; 703 return(1); 704 } 705 706 /* 707 * Convert a string to a long that may not be <0. 708 * If the string is invalid, or is less than 0, return -1. 709 */ 710 int 711 mandoc_strntoi(const char *p, size_t sz, int base) 712 { 713 char buf[32]; 714 char *ep; 715 long v; 716 717 if (sz > 31) 718 return(-1); 719 720 memcpy(buf, p, sz); 721 buf[(int)sz] = '\0'; 722 723 errno = 0; 724 v = strtol(buf, &ep, base); 725 726 if (buf[0] == '\0' || *ep != '\0') 727 return(-1); 728 729 if (v > INT_MAX) 730 v = INT_MAX; 731 if (v < INT_MIN) 732 v = INT_MIN; 733 734 return((int)v); 735 }