1 /*      $Id: mandoc.c,v 1.62 2011/12/03 16:08:51 schwarze Exp $ */
   2 /*
   3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   4  * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #ifdef HAVE_CONFIG_H
  19 #include "config.h"
  20 #endif
  21 
  22 #include <sys/types.h>
  23 
  24 #include <assert.h>
  25 #include <ctype.h>
  26 #include <errno.h>
  27 #include <limits.h>
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <time.h>
  32 
  33 #include "mandoc.h"
  34 #include "libmandoc.h"
  35 
  36 #define DATESIZE 32
  37 
  38 static  int      a2time(time_t *, const char *, const char *);
  39 static  char    *time2a(time_t);
  40 static  int      numescape(const char *);
  41 
  42 /*
  43  * Pass over recursive numerical expressions.  This context of this
  44  * function is important: it's only called within character-terminating
  45  * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
  46  * recursion: we don't care about what's in these blocks. 
  47  * This returns the number of characters skipped or -1 if an error
  48  * occurs (the caller should bail).
  49  */
  50 static int
  51 numescape(const char *start)
  52 {
  53         int              i;
  54         size_t           sz;
  55         const char      *cp;
  56 
  57         i = 0;
  58 
  59         /* The expression consists of a subexpression. */
  60 
  61         if ('\\' == start[i]) {
  62                 cp = &start[++i];
  63                 /*
  64                  * Read past the end of the subexpression.
  65                  * Bail immediately on errors.
  66                  */
  67                 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
  68                         return(-1);
  69                 return(i + cp - &start[i]);
  70         } 
  71 
  72         if ('(' != start[i++])
  73                 return(0);
  74 
  75         /*
  76          * A parenthesised subexpression.  Read until the closing
  77          * parenthesis, making sure to handle any nested subexpressions
  78          * that might ruin our parse.
  79          */
  80 
  81         while (')' != start[i]) {
  82                 sz = strcspn(&start[i], ")\\");
  83                 i += (int)sz;
  84 
  85                 if ('\0' == start[i])
  86                         return(-1);
  87                 else if ('\\' != start[i])
  88                         continue;
  89 
  90                 cp = &start[++i];
  91                 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
  92                         return(-1);
  93                 i += cp - &start[i];
  94         }
  95 
  96         /* Read past the terminating ')'. */
  97         return(++i);
  98 }
  99 
 100 enum mandoc_esc
 101 mandoc_escape(const char **end, const char **start, int *sz)
 102 {
 103         char             c, term, numeric;
 104         int              i, lim, ssz, rlim;
 105         const char      *cp, *rstart;
 106         enum mandoc_esc  gly; 
 107 
 108         cp = *end;
 109         rstart = cp;
 110         if (start)
 111                 *start = rstart;
 112         i = lim = 0;
 113         gly = ESCAPE_ERROR;
 114         term = numeric = '\0';
 115 
 116         switch ((c = cp[i++])) {
 117         /*
 118          * First the glyphs.  There are several different forms of
 119          * these, but each eventually returns a substring of the glyph
 120          * name.
 121          */
 122         case ('('):
 123                 gly = ESCAPE_SPECIAL;
 124                 lim = 2;
 125                 break;
 126         case ('['):
 127                 gly = ESCAPE_SPECIAL;
 128                 /*
 129                  * Unicode escapes are defined in groff as \[uXXXX] to
 130                  * \[u10FFFF], where the contained value must be a valid
 131                  * Unicode codepoint.  Here, however, only check whether
 132                  * it's not a zero-width escape.
 133                  */
 134                 if ('u' == cp[i] && ']' != cp[i + 1])
 135                         gly = ESCAPE_UNICODE;
 136                 term = ']';
 137                 break;
 138         case ('C'):
 139                 if ('\'' != cp[i])
 140                         return(ESCAPE_ERROR);
 141                 gly = ESCAPE_SPECIAL;
 142                 term = '\'';
 143                 break;
 144 
 145         /*
 146          * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
 147          * 'X' is the trigger.  These have opaque sub-strings.
 148          */
 149         case ('F'):
 150                 /* FALLTHROUGH */
 151         case ('g'):
 152                 /* FALLTHROUGH */
 153         case ('k'):
 154                 /* FALLTHROUGH */
 155         case ('M'):
 156                 /* FALLTHROUGH */
 157         case ('m'):
 158                 /* FALLTHROUGH */
 159         case ('n'):
 160                 /* FALLTHROUGH */
 161         case ('V'):
 162                 /* FALLTHROUGH */
 163         case ('Y'):
 164                 gly = ESCAPE_IGNORE;
 165                 /* FALLTHROUGH */
 166         case ('f'):
 167                 if (ESCAPE_ERROR == gly)
 168                         gly = ESCAPE_FONT;
 169 
 170                 rstart= &cp[i];
 171                 if (start) 
 172                         *start = rstart;
 173 
 174                 switch (cp[i++]) {
 175                 case ('('):
 176                         lim = 2;
 177                         break;
 178                 case ('['):
 179                         term = ']';
 180                         break;
 181                 default:
 182                         lim = 1;
 183                         i--;
 184                         break;
 185                 }
 186                 break;
 187 
 188         /*
 189          * These escapes are of the form \X'Y', where 'X' is the trigger
 190          * and 'Y' is any string.  These have opaque sub-strings.
 191          */
 192         case ('A'):
 193                 /* FALLTHROUGH */
 194         case ('b'):
 195                 /* FALLTHROUGH */
 196         case ('D'):
 197                 /* FALLTHROUGH */
 198         case ('o'):
 199                 /* FALLTHROUGH */
 200         case ('R'):
 201                 /* FALLTHROUGH */
 202         case ('X'):
 203                 /* FALLTHROUGH */
 204         case ('Z'):
 205                 if ('\'' != cp[i++])
 206                         return(ESCAPE_ERROR);
 207                 gly = ESCAPE_IGNORE;
 208                 term = '\'';
 209                 break;
 210 
 211         /*
 212          * These escapes are of the form \X'N', where 'X' is the trigger
 213          * and 'N' resolves to a numerical expression.
 214          */
 215         case ('B'):
 216                 /* FALLTHROUGH */
 217         case ('h'):
 218                 /* FALLTHROUGH */
 219         case ('H'):
 220                 /* FALLTHROUGH */
 221         case ('L'):
 222                 /* FALLTHROUGH */
 223         case ('l'):
 224                 gly = ESCAPE_NUMBERED;
 225                 /* FALLTHROUGH */
 226         case ('S'):
 227                 /* FALLTHROUGH */
 228         case ('v'):
 229                 /* FALLTHROUGH */
 230         case ('w'):
 231                 /* FALLTHROUGH */
 232         case ('x'):
 233                 if (ESCAPE_ERROR == gly)
 234                         gly = ESCAPE_IGNORE;
 235                 if ('\'' != cp[i++])
 236                         return(ESCAPE_ERROR);
 237                 term = numeric = '\'';
 238                 break;
 239 
 240         /*
 241          * Special handling for the numbered character escape.
 242          * XXX Do any other escapes need similar handling?
 243          */
 244         case ('N'):
 245                 if ('\0' == cp[i])
 246                         return(ESCAPE_ERROR);
 247                 *end = &cp[++i];
 248                 if (isdigit((unsigned char)cp[i-1]))
 249                         return(ESCAPE_IGNORE);
 250                 while (isdigit((unsigned char)**end))
 251                         (*end)++;
 252                 if (start)
 253                         *start = &cp[i];
 254                 if (sz)
 255                         *sz = *end - &cp[i];
 256                 if ('\0' != **end)
 257                         (*end)++;
 258                 return(ESCAPE_NUMBERED);
 259 
 260         /* 
 261          * Sizes get a special category of their own.
 262          */
 263         case ('s'):
 264                 gly = ESCAPE_IGNORE;
 265 
 266                 rstart = &cp[i];
 267                 if (start) 
 268                         *start = rstart;
 269 
 270                 /* See +/- counts as a sign. */
 271                 c = cp[i];
 272                 if ('+' == c || '-' == c || ASCII_HYPH == c)
 273                         ++i;
 274 
 275                 switch (cp[i++]) {
 276                 case ('('):
 277                         lim = 2;
 278                         break;
 279                 case ('['):
 280                         term = numeric = ']';
 281                         break;
 282                 case ('\''):
 283                         term = numeric = '\'';
 284                         break;
 285                 default:
 286                         lim = 1;
 287                         i--;
 288                         break;
 289                 }
 290 
 291                 /* See +/- counts as a sign. */
 292                 c = cp[i];
 293                 if ('+' == c || '-' == c || ASCII_HYPH == c)
 294                         ++i;
 295 
 296                 break;
 297 
 298         /*
 299          * Anything else is assumed to be a glyph.
 300          */
 301         default:
 302                 gly = ESCAPE_SPECIAL;
 303                 lim = 1;
 304                 i--;
 305                 break;
 306         }
 307 
 308         assert(ESCAPE_ERROR != gly);
 309 
 310         rstart = &cp[i];
 311         if (start)
 312                 *start = rstart;
 313 
 314         /*
 315          * If a terminating block has been specified, we need to
 316          * handle the case of recursion, which could have their
 317          * own terminating blocks that mess up our parse.  This, by the
 318          * way, means that the "start" and "size" values will be
 319          * effectively meaningless.
 320          */
 321 
 322         ssz = 0;
 323         if (numeric && -1 == (ssz = numescape(&cp[i])))
 324                 return(ESCAPE_ERROR);
 325 
 326         i += ssz;
 327         rlim = -1;
 328 
 329         /*
 330          * We have a character terminator.  Try to read up to that
 331          * character.  If we can't (i.e., we hit the nil), then return
 332          * an error; if we can, calculate our length, read past the
 333          * terminating character, and exit.
 334          */
 335 
 336         if ('\0' != term) {
 337                 *end = strchr(&cp[i], term);
 338                 if ('\0' == *end)
 339                         return(ESCAPE_ERROR);
 340 
 341                 rlim = *end - &cp[i];
 342                 if (sz)
 343                         *sz = rlim;
 344                 (*end)++;
 345                 goto out;
 346         }
 347 
 348         assert(lim > 0);
 349 
 350         /*
 351          * We have a numeric limit.  If the string is shorter than that,
 352          * stop and return an error.  Else adjust our endpoint, length,
 353          * and return the current glyph.
 354          */
 355 
 356         if ((size_t)lim > strlen(&cp[i]))
 357                 return(ESCAPE_ERROR);
 358 
 359         rlim = lim;
 360         if (sz)
 361                 *sz = rlim;
 362 
 363         *end = &cp[i] + lim;
 364 
 365 out:
 366         assert(rlim >= 0 && rstart);
 367 
 368         /* Run post-processors. */
 369 
 370         switch (gly) {
 371         case (ESCAPE_FONT):
 372                 /*
 373                  * Pretend that the constant-width font modes are the
 374                  * same as the regular font modes.
 375                  */
 376                 if (2 == rlim && 'C' == *rstart)
 377                         rstart++;
 378                 else if (1 != rlim)
 379                         break;
 380 
 381                 switch (*rstart) {
 382                 case ('3'):
 383                         /* FALLTHROUGH */
 384                 case ('B'):
 385                         gly = ESCAPE_FONTBOLD;
 386                         break;
 387                 case ('2'):
 388                         /* FALLTHROUGH */
 389                 case ('I'):
 390                         gly = ESCAPE_FONTITALIC;
 391                         break;
 392                 case ('P'):
 393                         gly = ESCAPE_FONTPREV;
 394                         break;
 395                 case ('1'):
 396                         /* FALLTHROUGH */
 397                 case ('R'):
 398                         gly = ESCAPE_FONTROMAN;
 399                         break;
 400                 }
 401                 break;
 402         case (ESCAPE_SPECIAL):
 403                 if (1 != rlim)
 404                         break;
 405                 if ('c' == *rstart)
 406                         gly = ESCAPE_NOSPACE;
 407                 break;
 408         default:
 409                 break;
 410         }
 411 
 412         return(gly);
 413 }
 414 
 415 void *
 416 mandoc_calloc(size_t num, size_t size)
 417 {
 418         void            *ptr;
 419 
 420         ptr = calloc(num, size);
 421         if (NULL == ptr) {
 422                 perror(NULL);
 423                 exit((int)MANDOCLEVEL_SYSERR);
 424         }
 425 
 426         return(ptr);
 427 }
 428 
 429 
 430 void *
 431 mandoc_malloc(size_t size)
 432 {
 433         void            *ptr;
 434 
 435         ptr = malloc(size);
 436         if (NULL == ptr) {
 437                 perror(NULL);
 438                 exit((int)MANDOCLEVEL_SYSERR);
 439         }
 440 
 441         return(ptr);
 442 }
 443 
 444 
 445 void *
 446 mandoc_realloc(void *ptr, size_t size)
 447 {
 448 
 449         ptr = realloc(ptr, size);
 450         if (NULL == ptr) {
 451                 perror(NULL);
 452                 exit((int)MANDOCLEVEL_SYSERR);
 453         }
 454 
 455         return(ptr);
 456 }
 457 
 458 char *
 459 mandoc_strndup(const char *ptr, size_t sz)
 460 {
 461         char            *p;
 462 
 463         p = mandoc_malloc(sz + 1);
 464         memcpy(p, ptr, sz);
 465         p[(int)sz] = '\0';
 466         return(p);
 467 }
 468 
 469 char *
 470 mandoc_strdup(const char *ptr)
 471 {
 472         char            *p;
 473 
 474         p = strdup(ptr);
 475         if (NULL == p) {
 476                 perror(NULL);
 477                 exit((int)MANDOCLEVEL_SYSERR);
 478         }
 479 
 480         return(p);
 481 }
 482 
/*
 * Parse a quoted or unquoted roff-style request or macro argument.
 * Return a pointer to the parsed argument, which is either the original
 * pointer or advanced by one byte in case the argument is quoted.
 * Null-terminate the argument in place.
 * Collapse pairs of quotes inside quoted arguments.
 * Advance the argument pointer to the next argument,
 * or to the null byte terminating the argument line.
 *
 * parse: handed through to mandoc_msg() for diagnostics.
 * cpp:   in/out; on entry the start of the argument, on return the
 *        start of the next argument or the terminating null byte.
 * ln:    line number handed through to mandoc_msg().
 * pos:   in/out; column used for diagnostics, advanced by the number
 *        of bytes consumed.
 */
char *
mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
{
        char     *start, *cp;
        /*
         * quoted: 0 = unquoted argument, 1 = inside a quoted argument,
         *         2 = quoted argument whose closing quote was seen.
         * pairs:  number of bytes dropped so far by collapsing "\\\\"
         *         and "\"\"" pairs; later bytes are shifted left by it.
         * white:  set when an unescaped blank ended an unquoted argument.
         */
        int       quoted, pairs, white;

        /* Quoting can only start with a new word. */
        start = *cpp;
        quoted = 0;
        if ('"' == *start) {
                quoted = 1;
                start++;
        } 

        pairs = 0;
        white = 0;
        for (cp = start; '\0' != *cp; cp++) {
                /* Move left after quoted quotes and escaped backslashes. */
                if (pairs)
                        cp[-pairs] = cp[0];
                if ('\\' == cp[0]) {
                        if ('\\' == cp[1]) {
                                /* Poor man's copy mode. */
                                pairs++;
                                cp++;
                        } else if (0 == quoted && ' ' == cp[1])
                                /* Skip escaped blanks. */
                                cp++;
                } else if (0 == quoted) {
                        if (' ' == cp[0]) {
                                /* Unescaped blanks end unquoted args. */
                                white = 1;
                                break;
                        }
                } else if ('"' == cp[0]) {
                        if ('"' == cp[1]) {
                                /* Quoted quotes collapse. */
                                pairs++;
                                cp++;
                        } else {
                                /* Unquoted quotes end quoted args. */
                                quoted = 2;
                                break;
                        }
                }
        }

        /* Quoted argument without a closing quote. */
        if (1 == quoted)
                mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);

        /* Null-terminate this argument and move to the next one. */
        if (pairs)
                /* Terminate the compacted text, pairs bytes to the left. */
                cp[-pairs] = '\0';
        if ('\0' != *cp) {
                /* Overwrite the delimiter, then skip following blanks. */
                *cp++ = '\0';
                while (' ' == *cp)
                        cp++;
        }
        /* Count the consumed bytes, plus one for an opening quote. */
        *pos += (int)(cp - start) + (quoted ? 1 : 0);
        *cpp = cp;

        /* The line ends right after a blank: report trailing space. */
        if ('\0' == *cp && (white || ' ' == cp[-1]))
                mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);

        return(start);
}
 559 
 560 static int
 561 a2time(time_t *t, const char *fmt, const char *p)
 562 {
 563         struct tm        tm;
 564         char            *pp;
 565 
 566         memset(&tm, 0, sizeof(struct tm));
 567 
 568         pp = NULL;
 569 #ifdef  HAVE_STRPTIME
 570         pp = strptime(p, fmt, &tm);
 571 #endif
 572         if (NULL != pp && '\0' == *pp) {
 573                 *t = mktime(&tm);
 574                 return(1);
 575         }
 576 
 577         return(0);
 578 }
 579 
 580 static char *
 581 time2a(time_t t)
 582 {
 583         struct tm       *tm;
 584         char            *buf, *p;
 585         size_t           ssz;
 586         int              isz;
 587 
 588         tm = localtime(&t);
 589 
 590         /*
 591          * Reserve space:
 592          * up to 9 characters for the month (September) + blank
 593          * up to 2 characters for the day + comma + blank
 594          * 4 characters for the year and a terminating '\0'
 595          */
 596         p = buf = mandoc_malloc(10 + 4 + 4 + 1);
 597 
 598         if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
 599                 goto fail;
 600         p += (int)ssz;
 601 
 602         if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)))
 603                 goto fail;
 604         p += isz;
 605 
 606         if (0 == strftime(p, 4 + 1, "%Y", tm))
 607                 goto fail;
 608         return(buf);
 609 
 610 fail:
 611         free(buf);
 612         return(NULL);
 613 }
 614 
 615 char *
 616 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
 617 {
 618         char            *out;
 619         time_t           t;
 620 
 621         if (NULL == in || '\0' == *in ||
 622             0 == strcmp(in, "$" "Mdocdate$")) {
 623                 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
 624                 time(&t);
 625         }
 626         else if (a2time(&t, "%Y-%m-%d", in))
 627                 t = 0;
 628         else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
 629             !a2time(&t, "%b %d, %Y", in)) {
 630                 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
 631                 t = 0;
 632         }
 633         out = t ? time2a(t) : NULL;
 634         return(out ? out : mandoc_strdup(in));
 635 }
 636 
 637 int
 638 mandoc_eos(const char *p, size_t sz, int enclosed)
 639 {
 640         const char *q;
 641         int found;
 642 
 643         if (0 == sz)
 644                 return(0);
 645 
 646         /*
 647          * End-of-sentence recognition must include situations where
 648          * some symbols, such as `)', allow prior EOS punctuation to
 649          * propagate outward.
 650          */
 651 
 652         found = 0;
 653         for (q = p + (int)sz - 1; q >= p; q--) {
 654                 switch (*q) {
 655                 case ('\"'):
 656                         /* FALLTHROUGH */
 657                 case ('\''):
 658                         /* FALLTHROUGH */
 659                 case (']'):
 660                         /* FALLTHROUGH */
 661                 case (')'):
 662                         if (0 == found)
 663                                 enclosed = 1;
 664                         break;
 665                 case ('.'):
 666                         /* FALLTHROUGH */
 667                 case ('!'):
 668                         /* FALLTHROUGH */
 669                 case ('?'):
 670                         found = 1;
 671                         break;
 672                 default:
 673                         return(found && (!enclosed || isalnum((unsigned char)*q)));
 674                 }
 675         }
 676 
 677         return(found && !enclosed);
 678 }
 679 
 680 /*
 681  * Find out whether a line is a macro line or not.  If it is, adjust the
 682  * current position and return one; if it isn't, return zero and don't
 683  * change the current position.
 684  */
 685 int
 686 mandoc_getcontrol(const char *cp, int *ppos)
 687 {
 688         int             pos;
 689 
 690         pos = *ppos;
 691 
 692         if ('\\' == cp[pos] && '.' == cp[pos + 1])
 693                 pos += 2;
 694         else if ('.' == cp[pos] || '\'' == cp[pos])
 695                 pos++;
 696         else
 697                 return(0);
 698 
 699         while (' ' == cp[pos] || '\t' == cp[pos])
 700                 pos++;
 701 
 702         *ppos = pos;
 703         return(1);
 704 }
 705 
 706 /*
 707  * Convert a string to a long that may not be <0.
 708  * If the string is invalid, or is less than 0, return -1.
 709  */
 710 int
 711 mandoc_strntoi(const char *p, size_t sz, int base)
 712 {
 713         char             buf[32];
 714         char            *ep;
 715         long             v;
 716 
 717         if (sz > 31)
 718                 return(-1);
 719 
 720         memcpy(buf, p, sz);
 721         buf[(int)sz] = '\0';
 722 
 723         errno = 0;
 724         v = strtol(buf, &ep, base);
 725 
 726         if (buf[0] == '\0' || *ep != '\0')
 727                 return(-1);
 728 
 729         if (v > INT_MAX)
 730                 v = INT_MAX;
 731         if (v < INT_MIN)
 732                 v = INT_MIN;
 733 
 734         return((int)v);
 735 }