1 /*      $Id: mandoc.c,v 1.74 2013/12/30 18:30:32 schwarze Exp $ */
   2 /*
   3  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   4  * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #ifdef HAVE_CONFIG_H
  19 #include "config.h"
  20 #endif
  21 
  22 #include <sys/types.h>
  23 
  24 #include <assert.h>
  25 #include <ctype.h>
  26 #include <errno.h>
  27 #include <limits.h>
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <time.h>
  32 
  33 #include "mandoc.h"
  34 #include "libmandoc.h"
  35 
  36 #define DATESIZE 32
  37 
  38 static  int      a2time(time_t *, const char *, const char *);
  39 static  char    *time2a(time_t);
  40 
  41 
  42 enum mandoc_esc
  43 mandoc_escape(const char **end, const char **start, int *sz)
  44 {
  45         const char      *local_start;
  46         int              local_sz;
  47         char             term;
  48         enum mandoc_esc  gly; 
  49 
  50         /*
  51          * When the caller doesn't provide return storage,
  52          * use local storage.
  53          */
  54 
  55         if (NULL == start)
  56                 start = &local_start;
  57         if (NULL == sz)
  58                 sz = &local_sz;
  59 
  60         /*
  61          * Beyond the backslash, at least one input character
  62          * is part of the escape sequence.  With one exception
  63          * (see below), that character won't be returned.
  64          */
  65 
  66         gly = ESCAPE_ERROR;
  67         *start = ++*end;
  68         *sz = 0;
  69         term = '\0';
  70 
  71         switch ((*start)[-1]) {
  72         /*
  73          * First the glyphs.  There are several different forms of
  74          * these, but each eventually returns a substring of the glyph
  75          * name.
  76          */
  77         case ('('):
  78                 gly = ESCAPE_SPECIAL;
  79                 *sz = 2;
  80                 break;
  81         case ('['):
  82                 gly = ESCAPE_SPECIAL;
  83                 /*
  84                  * Unicode escapes are defined in groff as \[uXXXX] to
  85                  * \[u10FFFF], where the contained value must be a valid
  86                  * Unicode codepoint.  Here, however, only check whether
  87                  * it's not a zero-width escape.
  88                  */
  89                 if ('u' == (*start)[0] && ']' != (*start)[1])
  90                         gly = ESCAPE_UNICODE;
  91                 term = ']';
  92                 break;
  93         case ('C'):
  94                 if ('\'' != **start)
  95                         return(ESCAPE_ERROR);
  96                 *start = ++*end;
  97                 if ('u' == (*start)[0] && '\'' != (*start)[1])
  98                         gly = ESCAPE_UNICODE;
  99                 else
 100                         gly = ESCAPE_SPECIAL;
 101                 term = '\'';
 102                 break;
 103 
 104         /*
 105          * Escapes taking no arguments at all.
 106          */
 107         case ('d'):
 108                 /* FALLTHROUGH */
 109         case ('u'):
 110                 return(ESCAPE_IGNORE);
 111 
 112         /*
 113          * The \z escape is supposed to output the following
 114          * character without advancing the cursor position.  
 115          * Since we are mostly dealing with terminal mode,
 116          * let us just skip the next character.
 117          */
 118         case ('z'):
 119                 return(ESCAPE_SKIPCHAR);
 120 
 121         /*
 122          * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
 123          * 'X' is the trigger.  These have opaque sub-strings.
 124          */
 125         case ('F'):
 126                 /* FALLTHROUGH */
 127         case ('g'):
 128                 /* FALLTHROUGH */
 129         case ('k'):
 130                 /* FALLTHROUGH */
 131         case ('M'):
 132                 /* FALLTHROUGH */
 133         case ('m'):
 134                 /* FALLTHROUGH */
 135         case ('n'):
 136                 /* FALLTHROUGH */
 137         case ('V'):
 138                 /* FALLTHROUGH */
 139         case ('Y'):
 140                 gly = ESCAPE_IGNORE;
 141                 /* FALLTHROUGH */
 142         case ('f'):
 143                 if (ESCAPE_ERROR == gly)
 144                         gly = ESCAPE_FONT;
 145                 switch (**start) {
 146                 case ('('):
 147                         *start = ++*end;
 148                         *sz = 2;
 149                         break;
 150                 case ('['):
 151                         *start = ++*end;
 152                         term = ']';
 153                         break;
 154                 default:
 155                         *sz = 1;
 156                         break;
 157                 }
 158                 break;
 159 
 160         /*
 161          * These escapes are of the form \X'Y', where 'X' is the trigger
 162          * and 'Y' is any string.  These have opaque sub-strings.
 163          */
 164         case ('A'):
 165                 /* FALLTHROUGH */
 166         case ('b'):
 167                 /* FALLTHROUGH */
 168         case ('B'):
 169                 /* FALLTHROUGH */
 170         case ('D'):
 171                 /* FALLTHROUGH */
 172         case ('o'):
 173                 /* FALLTHROUGH */
 174         case ('R'):
 175                 /* FALLTHROUGH */
 176         case ('w'):
 177                 /* FALLTHROUGH */
 178         case ('X'):
 179                 /* FALLTHROUGH */
 180         case ('Z'):
 181                 if ('\'' != **start)
 182                         return(ESCAPE_ERROR);
 183                 gly = ESCAPE_IGNORE;
 184                 *start = ++*end;
 185                 term = '\'';
 186                 break;
 187 
 188         /*
 189          * These escapes are of the form \X'N', where 'X' is the trigger
 190          * and 'N' resolves to a numerical expression.
 191          */
 192         case ('h'):
 193                 /* FALLTHROUGH */
 194         case ('H'):
 195                 /* FALLTHROUGH */
 196         case ('L'):
 197                 /* FALLTHROUGH */
 198         case ('l'):
 199                 /* FALLTHROUGH */
 200         case ('S'):
 201                 /* FALLTHROUGH */
 202         case ('v'):
 203                 /* FALLTHROUGH */
 204         case ('x'):
 205                 if ('\'' != **start)
 206                         return(ESCAPE_ERROR);
 207                 gly = ESCAPE_IGNORE;
 208                 *start = ++*end;
 209                 term = '\'';
 210                 break;
 211 
 212         /*
 213          * Special handling for the numbered character escape.
 214          * XXX Do any other escapes need similar handling?
 215          */
 216         case ('N'):
 217                 if ('\0' == **start)
 218                         return(ESCAPE_ERROR);
 219                 (*end)++;
 220                 if (isdigit((unsigned char)**start)) {
 221                         *sz = 1;
 222                         return(ESCAPE_IGNORE);
 223                 }
 224                 (*start)++;
 225                 while (isdigit((unsigned char)**end))
 226                         (*end)++;
 227                 *sz = *end - *start;
 228                 if ('\0' != **end)
 229                         (*end)++;
 230                 return(ESCAPE_NUMBERED);
 231 
 232         /* 
 233          * Sizes get a special category of their own.
 234          */
 235         case ('s'):
 236                 gly = ESCAPE_IGNORE;
 237 
 238                 /* See +/- counts as a sign. */
 239                 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
 240                         (*end)++;
 241 
 242                 switch (**end) {
 243                 case ('('):
 244                         *start = ++*end;
 245                         *sz = 2;
 246                         break;
 247                 case ('['):
 248                         *start = ++*end;
 249                         term = ']';
 250                         break;
 251                 case ('\''):
 252                         *start = ++*end;
 253                         term = '\'';
 254                         break;
 255                 default:
 256                         *sz = 1;
 257                         break;
 258                 }
 259 
 260                 break;
 261 
 262         /*
 263          * Anything else is assumed to be a glyph.
 264          * In this case, pass back the character after the backslash.
 265          */
 266         default:
 267                 gly = ESCAPE_SPECIAL;
 268                 *start = --*end;
 269                 *sz = 1;
 270                 break;
 271         }
 272 
 273         assert(ESCAPE_ERROR != gly);
 274 
 275         /*
 276          * Read up to the terminating character,
 277          * paying attention to nested escapes.
 278          */
 279 
 280         if ('\0' != term) {
 281                 while (**end != term) {
 282                         switch (**end) {
 283                         case ('\0'):
 284                                 return(ESCAPE_ERROR);
 285                         case ('\\'):
 286                                 (*end)++;
 287                                 if (ESCAPE_ERROR ==
 288                                     mandoc_escape(end, NULL, NULL))
 289                                         return(ESCAPE_ERROR);
 290                                 break;
 291                         default:
 292                                 (*end)++;
 293                                 break;
 294                         }
 295                 }
 296                 *sz = (*end)++ - *start;
 297         } else {
 298                 assert(*sz > 0);
 299                 if ((size_t)*sz > strlen(*start))
 300                         return(ESCAPE_ERROR);
 301                 *end += *sz;
 302         }
 303 
 304         /* Run post-processors. */
 305 
 306         switch (gly) {
 307         case (ESCAPE_FONT):
 308                 if (2 == *sz) {
 309                         if ('C' == **start) {
 310                                 /*
 311                                  * Treat constant-width font modes
 312                                  * just like regular font modes.
 313                                  */
 314                                 (*start)++;
 315                                 (*sz)--;
 316                         } else {
 317                                 if ('B' == (*start)[0] && 'I' == (*start)[1])
 318                                         gly = ESCAPE_FONTBI;
 319                                 break;
 320                         }
 321                 } else if (1 != *sz)
 322                         break;
 323 
 324                 switch (**start) {
 325                 case ('3'):
 326                         /* FALLTHROUGH */
 327                 case ('B'):
 328                         gly = ESCAPE_FONTBOLD;
 329                         break;
 330                 case ('2'):
 331                         /* FALLTHROUGH */
 332                 case ('I'):
 333                         gly = ESCAPE_FONTITALIC;
 334                         break;
 335                 case ('P'):
 336                         gly = ESCAPE_FONTPREV;
 337                         break;
 338                 case ('1'):
 339                         /* FALLTHROUGH */
 340                 case ('R'):
 341                         gly = ESCAPE_FONTROMAN;
 342                         break;
 343                 }
 344                 break;
 345         case (ESCAPE_SPECIAL):
 346                 if (1 == *sz && 'c' == **start)
 347                         gly = ESCAPE_NOSPACE;
 348                 break;
 349         default:
 350                 break;
 351         }
 352 
 353         return(gly);
 354 }
 355 
 356 void *
 357 mandoc_calloc(size_t num, size_t size)
 358 {
 359         void            *ptr;
 360 
 361         ptr = calloc(num, size);
 362         if (NULL == ptr) {
 363                 perror(NULL);
 364                 exit((int)MANDOCLEVEL_SYSERR);
 365         }
 366 
 367         return(ptr);
 368 }
 369 
 370 
 371 void *
 372 mandoc_malloc(size_t size)
 373 {
 374         void            *ptr;
 375 
 376         ptr = malloc(size);
 377         if (NULL == ptr) {
 378                 perror(NULL);
 379                 exit((int)MANDOCLEVEL_SYSERR);
 380         }
 381 
 382         return(ptr);
 383 }
 384 
 385 
 386 void *
 387 mandoc_realloc(void *ptr, size_t size)
 388 {
 389 
 390         ptr = realloc(ptr, size);
 391         if (NULL == ptr) {
 392                 perror(NULL);
 393                 exit((int)MANDOCLEVEL_SYSERR);
 394         }
 395 
 396         return(ptr);
 397 }
 398 
 399 char *
 400 mandoc_strndup(const char *ptr, size_t sz)
 401 {
 402         char            *p;
 403 
 404         p = mandoc_malloc(sz + 1);
 405         memcpy(p, ptr, sz);
 406         p[(int)sz] = '\0';
 407         return(p);
 408 }
 409 
 410 char *
 411 mandoc_strdup(const char *ptr)
 412 {
 413         char            *p;
 414 
 415         p = strdup(ptr);
 416         if (NULL == p) {
 417                 perror(NULL);
 418                 exit((int)MANDOCLEVEL_SYSERR);
 419         }
 420 
 421         return(p);
 422 }
 423 
 424 /*
 425  * Parse a quoted or unquoted roff-style request or macro argument.
 426  * Return a pointer to the parsed argument, which is either the original
 427  * pointer or advanced by one byte in case the argument is quoted.
 428  * NUL-terminate the argument in place.
 429  * Collapse pairs of quotes inside quoted arguments.
 430  * Advance the argument pointer to the next argument,
 431  * or to the NUL byte terminating the argument line.
 432  */
 433 char *
 434 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
 435 {
 436         char     *start, *cp;
 437         int       quoted, pairs, white;
 438 
 439         /* Quoting can only start with a new word. */
 440         start = *cpp;
 441         quoted = 0;
 442         if ('"' == *start) {
 443                 quoted = 1;
 444                 start++;
 445         } 
 446 
 447         pairs = 0;
 448         white = 0;
 449         for (cp = start; '\0' != *cp; cp++) {
 450 
 451                 /*
 452                  * Move the following text left
 453                  * after quoted quotes and after "\\" and "\t".
 454                  */
 455                 if (pairs)
 456                         cp[-pairs] = cp[0];
 457 
 458                 if ('\\' == cp[0]) {
 459                         /*
 460                          * In copy mode, translate double to single
 461                          * backslashes and backslash-t to literal tabs.
 462                          */
 463                         switch (cp[1]) {
 464                         case ('t'):
 465                                 cp[0] = '\t';
 466                                 /* FALLTHROUGH */
 467                         case ('\\'):
 468                                 pairs++;
 469                                 cp++;
 470                                 break;
 471                         case (' '):
 472                                 /* Skip escaped blanks. */
 473                                 if (0 == quoted)
 474                                         cp++;
 475                                 break;
 476                         default:
 477                                 break;
 478                         }
 479                 } else if (0 == quoted) {
 480                         if (' ' == cp[0]) {
 481                                 /* Unescaped blanks end unquoted args. */
 482                                 white = 1;
 483                                 break;
 484                         }
 485                 } else if ('"' == cp[0]) {
 486                         if ('"' == cp[1]) {
 487                                 /* Quoted quotes collapse. */
 488                                 pairs++;
 489                                 cp++;
 490                         } else {
 491                                 /* Unquoted quotes end quoted args. */
 492                                 quoted = 2;
 493                                 break;
 494                         }
 495                 }
 496         }
 497 
 498         /* Quoted argument without a closing quote. */
 499         if (1 == quoted)
 500                 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
 501 
 502         /* NUL-terminate this argument and move to the next one. */
 503         if (pairs)
 504                 cp[-pairs] = '\0';
 505         if ('\0' != *cp) {
 506                 *cp++ = '\0';
 507                 while (' ' == *cp)
 508                         cp++;
 509         }
 510         *pos += (int)(cp - start) + (quoted ? 1 : 0);
 511         *cpp = cp;
 512 
 513         if ('\0' == *cp && (white || ' ' == cp[-1]))
 514                 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
 515 
 516         return(start);
 517 }
 518 
 519 static int
 520 a2time(time_t *t, const char *fmt, const char *p)
 521 {
 522         struct tm        tm;
 523         char            *pp;
 524 
 525         memset(&tm, 0, sizeof(struct tm));
 526 
 527         pp = NULL;
 528 #ifdef  HAVE_STRPTIME
 529         pp = strptime(p, fmt, &tm);
 530 #endif
 531         if (NULL != pp && '\0' == *pp) {
 532                 *t = mktime(&tm);
 533                 return(1);
 534         }
 535 
 536         return(0);
 537 }
 538 
 539 static char *
 540 time2a(time_t t)
 541 {
 542         struct tm       *tm;
 543         char            *buf, *p;
 544         size_t           ssz;
 545         int              isz;
 546 
 547         tm = localtime(&t);
 548 
 549         /*
 550          * Reserve space:
 551          * up to 9 characters for the month (September) + blank
 552          * up to 2 characters for the day + comma + blank
 553          * 4 characters for the year and a terminating '\0'
 554          */
 555         p = buf = mandoc_malloc(10 + 4 + 4 + 1);
 556 
 557         if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
 558                 goto fail;
 559         p += (int)ssz;
 560 
 561         if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)))
 562                 goto fail;
 563         p += isz;
 564 
 565         if (0 == strftime(p, 4 + 1, "%Y", tm))
 566                 goto fail;
 567         return(buf);
 568 
 569 fail:
 570         free(buf);
 571         return(NULL);
 572 }
 573 
 574 char *
 575 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
 576 {
 577         char            *out;
 578         time_t           t;
 579 
 580         if (NULL == in || '\0' == *in ||
 581             0 == strcmp(in, "$" "Mdocdate$")) {
 582                 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
 583                 time(&t);
 584         }
 585         else if (a2time(&t, "%Y-%m-%d", in))
 586                 t = 0;
 587         else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
 588             !a2time(&t, "%b %d, %Y", in)) {
 589                 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
 590                 t = 0;
 591         }
 592         out = t ? time2a(t) : NULL;
 593         return(out ? out : mandoc_strdup(in));
 594 }
 595 
 596 int
 597 mandoc_eos(const char *p, size_t sz, int enclosed)
 598 {
 599         const char *q;
 600         int found;
 601 
 602         if (0 == sz)
 603                 return(0);
 604 
 605         /*
 606          * End-of-sentence recognition must include situations where
 607          * some symbols, such as `)', allow prior EOS punctuation to
 608          * propagate outward.
 609          */
 610 
 611         found = 0;
 612         for (q = p + (int)sz - 1; q >= p; q--) {
 613                 switch (*q) {
 614                 case ('\"'):
 615                         /* FALLTHROUGH */
 616                 case ('\''):
 617                         /* FALLTHROUGH */
 618                 case (']'):
 619                         /* FALLTHROUGH */
 620                 case (')'):
 621                         if (0 == found)
 622                                 enclosed = 1;
 623                         break;
 624                 case ('.'):
 625                         /* FALLTHROUGH */
 626                 case ('!'):
 627                         /* FALLTHROUGH */
 628                 case ('?'):
 629                         found = 1;
 630                         break;
 631                 default:
 632                         return(found && (!enclosed || isalnum((unsigned char)*q)));
 633                 }
 634         }
 635 
 636         return(found && !enclosed);
 637 }
 638 
 639 /*
 640  * Convert a string to a long that may not be <0.
 641  * If the string is invalid, or is less than 0, return -1.
 642  */
 643 int
 644 mandoc_strntoi(const char *p, size_t sz, int base)
 645 {
 646         char             buf[32];
 647         char            *ep;
 648         long             v;
 649 
 650         if (sz > 31)
 651                 return(-1);
 652 
 653         memcpy(buf, p, sz);
 654         buf[(int)sz] = '\0';
 655 
 656         errno = 0;
 657         v = strtol(buf, &ep, base);
 658 
 659         if (buf[0] == '\0' || *ep != '\0')
 660                 return(-1);
 661 
 662         if (v > INT_MAX)
 663                 v = INT_MAX;
 664         if (v < INT_MIN)
 665                 v = INT_MIN;
 666 
 667         return((int)v);
 668 }