illumos-gate Cdiff usr/src/cmd/mandoc/mandoc.c

Print this page

Update to 1.12.3.


*** 1,9 ****
! /*      $Id: mandoc.c,v 1.62 2011/12/03 16:08:51 schwarze Exp $ */
  /*
   * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
!  * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
   *
   * Permission to use, copy, modify, and distribute this software for any
   * purpose with or without fee is hereby granted, provided that the above
   * copyright notice and this permission notice appear in all copies.
   *
--- 1,9 ----
! /*      $Id: mandoc.c,v 1.74 2013/12/30 18:30:32 schwarze Exp $ */
  /*
   * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
!  * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
   *
   * Permission to use, copy, modify, and distribute this software for any
   * purpose with or without fee is hereby granted, provided that the above
   * copyright notice and this permission notice appear in all copies.
   *
*** 35,150 ****
  
  #define DATESIZE 32
  
  static  int      a2time(time_t *, const char *, const char *);
  static  char    *time2a(time_t);
- static  int      numescape(const char *);
  
! /*
!  * Pass over recursive numerical expressions.  This context of this
!  * function is important: it's only called within character-terminating
!  * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
!  * recursion: we don't care about what's in these blocks. 
!  * This returns the number of characters skipped or -1 if an error
!  * occurs (the caller should bail).
!  */
! static int
! numescape(const char *start)
  {
!         int              i;
!         size_t           sz;
!         const char      *cp;
  
-         i = 0;
- 
-         /* The expression consists of a subexpression. */
- 
-         if ('\\' == start[i]) {
-                 cp = &start[++i];
                  /*
!                  * Read past the end of the subexpression.
!                  * Bail immediately on errors.
                   */
-                 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
-                         return(-1);
-                 return(i + cp - &start[i]);
-         } 
  
!         if ('(' != start[i++])
!                 return(0);
  
          /*
!          * A parenthesised subexpression.  Read until the closing
!          * parenthesis, making sure to handle any nested subexpressions
!          * that might ruin our parse.
           */
  
-         while (')' != start[i]) {
-                 sz = strcspn(&start[i], ")\\");
-                 i += (int)sz;
- 
-                 if ('\0' == start[i])
-                         return(-1);
-                 else if ('\\' != start[i])
-                         continue;
- 
-                 cp = &start[++i];
-                 if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
-                         return(-1);
-                 i += cp - &start[i];
-         }
- 
-         /* Read past the terminating ')'. */
-         return(++i);
- }
- 
- enum mandoc_esc
- mandoc_escape(const char **end, const char **start, int *sz)
- {
-         char             c, term, numeric;
-         int              i, lim, ssz, rlim;
-         const char      *cp, *rstart;
-         enum mandoc_esc  gly; 
- 
-         cp = *end;
-         rstart = cp;
-         if (start)
-                 *start = rstart;
-         i = lim = 0;
          gly = ESCAPE_ERROR;
!         term = numeric = '\0';
  
!         switch ((c = cp[i++])) {
          /*
           * First the glyphs.  There are several different forms of
           * these, but each eventually returns a substring of the glyph
           * name.
           */
          case ('('):
                  gly = ESCAPE_SPECIAL;
!                 lim = 2;
                  break;
          case ('['):
                  gly = ESCAPE_SPECIAL;
                  /*
                   * Unicode escapes are defined in groff as \[uXXXX] to
                   * \[u10FFFF], where the contained value must be a valid
                   * Unicode codepoint.  Here, however, only check whether
                   * it's not a zero-width escape.
                   */
!                 if ('u' == cp[i] && ']' != cp[i + 1])
                          gly = ESCAPE_UNICODE;
                  term = ']';
                  break;
          case ('C'):
!                 if ('\'' != cp[i])
                          return(ESCAPE_ERROR);
                  gly = ESCAPE_SPECIAL;
                  term = '\'';
                  break;
  
          /*
           * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
           * 'X' is the trigger.  These have opaque sub-strings.
           */
          case ('F'):
                  /* FALLTHROUGH */
--- 35,126 ----
  
  #define DATESIZE 32
  
  static  int      a2time(time_t *, const char *, const char *);
  static  char    *time2a(time_t);
  
! 
! enum mandoc_esc
! mandoc_escape(const char **end, const char **start, int *sz)
  {
!         const char      *local_start;
!         int              local_sz;
!         char             term;
!         enum mandoc_esc  gly; 
  
          /*
!          * When the caller doesn't provide return storage,
!          * use local storage.
           */
  
!         if (NULL == start)
!                 start = &local_start;
!         if (NULL == sz)
!                 sz = &local_sz;
  
          /*
!          * Beyond the backslash, at least one input character
!          * is part of the escape sequence.  With one exception
!          * (see below), that character won't be returned.
           */
  
          gly = ESCAPE_ERROR;
!         *start = ++*end;
!         *sz = 0;
!         term = '\0';
  
!         switch ((*start)[-1]) {
          /*
           * First the glyphs.  There are several different forms of
           * these, but each eventually returns a substring of the glyph
           * name.
           */
          case ('('):
                  gly = ESCAPE_SPECIAL;
!                 *sz = 2;
                  break;
          case ('['):
                  gly = ESCAPE_SPECIAL;
                  /*
                   * Unicode escapes are defined in groff as \[uXXXX] to
                   * \[u10FFFF], where the contained value must be a valid
                   * Unicode codepoint.  Here, however, only check whether
                   * it's not a zero-width escape.
                   */
!                 if ('u' == (*start)[0] && ']' != (*start)[1])
                          gly = ESCAPE_UNICODE;
                  term = ']';
                  break;
          case ('C'):
!                 if ('\'' != **start)
                          return(ESCAPE_ERROR);
+                 *start = ++*end;
+                 if ('u' == (*start)[0] && '\'' != (*start)[1])
+                         gly = ESCAPE_UNICODE;
+                 else
                          gly = ESCAPE_SPECIAL;
                  term = '\'';
                  break;
  
          /*
+          * Escapes taking no arguments at all.
+          */
+         case ('d'):
+                 /* FALLTHROUGH */
+         case ('u'):
+                 return(ESCAPE_IGNORE);
+ 
+         /*
+          * The \z escape is supposed to output the following
+          * character without advancing the cursor position.  
+          * Since we are mostly dealing with terminal mode,
+          * let us just skip the next character.
+          */
+         case ('z'):
+                 return(ESCAPE_SKIPCHAR);
+ 
+         /*
           * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
           * 'X' is the trigger.  These have opaque sub-strings.
           */
          case ('F'):
                  /* FALLTHROUGH */
*** 164,188 ****
                  gly = ESCAPE_IGNORE;
                  /* FALLTHROUGH */
          case ('f'):
                  if (ESCAPE_ERROR == gly)
                          gly = ESCAPE_FONT;
! 
!                 rstart= &cp[i];
!                 if (start) 
!                         *start = rstart;
! 
!                 switch (cp[i++]) {
                  case ('('):
!                         lim = 2;
                          break;
                  case ('['):
                          term = ']';
                          break;
                  default:
!                         lim = 1;
!                         i--;
                          break;
                  }
                  break;
  
          /*
--- 140,160 ----
                  gly = ESCAPE_IGNORE;
                  /* FALLTHROUGH */
          case ('f'):
                  if (ESCAPE_ERROR == gly)
                          gly = ESCAPE_FONT;
!                 switch (**start) {
                  case ('('):
!                         *start = ++*end;
!                         *sz = 2;
                          break;
                  case ('['):
+                         *start = ++*end;
                          term = ']';
                          break;
                  default:
!                         *sz = 1;
                          break;
                  }
                  break;
  
          /*
*** 191,260 ****
           */
          case ('A'):
                  /* FALLTHROUGH */
          case ('b'):
                  /* FALLTHROUGH */
          case ('D'):
                  /* FALLTHROUGH */
          case ('o'):
                  /* FALLTHROUGH */
          case ('R'):
                  /* FALLTHROUGH */
          case ('X'):
                  /* FALLTHROUGH */
          case ('Z'):
!                 if ('\'' != cp[i++])
                          return(ESCAPE_ERROR);
                  gly = ESCAPE_IGNORE;
                  term = '\'';
                  break;
  
          /*
           * These escapes are of the form \X'N', where 'X' is the trigger
           * and 'N' resolves to a numerical expression.
           */
-         case ('B'):
-                 /* FALLTHROUGH */
          case ('h'):
                  /* FALLTHROUGH */
          case ('H'):
                  /* FALLTHROUGH */
          case ('L'):
                  /* FALLTHROUGH */
          case ('l'):
-                 gly = ESCAPE_NUMBERED;
                  /* FALLTHROUGH */
          case ('S'):
                  /* FALLTHROUGH */
          case ('v'):
                  /* FALLTHROUGH */
-         case ('w'):
-                 /* FALLTHROUGH */
          case ('x'):
!                 if (ESCAPE_ERROR == gly)
!                         gly = ESCAPE_IGNORE;
!                 if ('\'' != cp[i++])
                          return(ESCAPE_ERROR);
!                 term = numeric = '\'';
                  break;
  
          /*
           * Special handling for the numbered character escape.
           * XXX Do any other escapes need similar handling?
           */
          case ('N'):
!                 if ('\0' == cp[i])
                          return(ESCAPE_ERROR);
!                 *end = &cp[++i];
!                 if (isdigit((unsigned char)cp[i-1]))
                          return(ESCAPE_IGNORE);
                  while (isdigit((unsigned char)**end))
                          (*end)++;
!                 if (start)
!                         *start = &cp[i];
!                 if (sz)
!                         *sz = *end - &cp[i];
                  if ('\0' != **end)
                          (*end)++;
                  return(ESCAPE_NUMBERED);
  
          /* 
--- 163,232 ----
           */
          case ('A'):
                  /* FALLTHROUGH */
          case ('b'):
                  /* FALLTHROUGH */
+         case ('B'):
+                 /* FALLTHROUGH */
          case ('D'):
                  /* FALLTHROUGH */
          case ('o'):
                  /* FALLTHROUGH */
          case ('R'):
                  /* FALLTHROUGH */
+         case ('w'):
+                 /* FALLTHROUGH */
          case ('X'):
                  /* FALLTHROUGH */
          case ('Z'):
!                 if ('\'' != **start)
                          return(ESCAPE_ERROR);
                  gly = ESCAPE_IGNORE;
+                 *start = ++*end;
                  term = '\'';
                  break;
  
          /*
           * These escapes are of the form \X'N', where 'X' is the trigger
           * and 'N' resolves to a numerical expression.
           */
          case ('h'):
                  /* FALLTHROUGH */
          case ('H'):
                  /* FALLTHROUGH */
          case ('L'):
                  /* FALLTHROUGH */
          case ('l'):
                  /* FALLTHROUGH */
          case ('S'):
                  /* FALLTHROUGH */
          case ('v'):
                  /* FALLTHROUGH */
          case ('x'):
!                 if ('\'' != **start)
                          return(ESCAPE_ERROR);
!                 gly = ESCAPE_IGNORE;
!                 *start = ++*end;
!                 term = '\'';
                  break;
  
          /*
           * Special handling for the numbered character escape.
           * XXX Do any other escapes need similar handling?
           */
          case ('N'):
!                 if ('\0' == **start)
                          return(ESCAPE_ERROR);
!                 (*end)++;
!                 if (isdigit((unsigned char)**start)) {
!                         *sz = 1;
                          return(ESCAPE_IGNORE);
+                 }
+                 (*start)++;
                  while (isdigit((unsigned char)**end))
                          (*end)++;
!                 *sz = *end - *start;
                  if ('\0' != **end)
                          (*end)++;
                  return(ESCAPE_NUMBERED);
  
          /* 
*** 261,386 ****
           * Sizes get a special category of their own.
           */
          case ('s'):
                  gly = ESCAPE_IGNORE;
  
-                 rstart = &cp[i];
-                 if (start) 
-                         *start = rstart;
- 
                  /* See +/- counts as a sign. */
!                 c = cp[i];
!                 if ('+' == c || '-' == c || ASCII_HYPH == c)
!                         ++i;
  
!                 switch (cp[i++]) {
                  case ('('):
!                         lim = 2;
                          break;
                  case ('['):
!                         term = numeric = ']';
                          break;
                  case ('\''):
!                         term = numeric = '\'';
                          break;
                  default:
!                         lim = 1;
!                         i--;
                          break;
                  }
  
-                 /* See +/- counts as a sign. */
-                 c = cp[i];
-                 if ('+' == c || '-' == c || ASCII_HYPH == c)
-                         ++i;
- 
                  break;
  
          /*
           * Anything else is assumed to be a glyph.
           */
          default:
                  gly = ESCAPE_SPECIAL;
!                 lim = 1;
!                 i--;
                  break;
          }
  
          assert(ESCAPE_ERROR != gly);
  
-         rstart = &cp[i];
-         if (start)
-                 *start = rstart;
- 
          /*
!          * If a terminating block has been specified, we need to
!          * handle the case of recursion, which could have their
!          * own terminating blocks that mess up our parse.  This, by the
!          * way, means that the "start" and "size" values will be
!          * effectively meaningless.
           */
  
-         ssz = 0;
-         if (numeric && -1 == (ssz = numescape(&cp[i])))
-                 return(ESCAPE_ERROR);
- 
-         i += ssz;
-         rlim = -1;
- 
-         /*
-          * We have a character terminator.  Try to read up to that
-          * character.  If we can't (i.e., we hit the nil), then return
-          * an error; if we can, calculate our length, read past the
-          * terminating character, and exit.
-          */
- 
          if ('\0' != term) {
!                 *end = strchr(&cp[i], term);
!                 if ('\0' == *end)
                          return(ESCAPE_ERROR);
! 
!                 rlim = *end - &cp[i];
!                 if (sz)
!                         *sz = rlim;
                  (*end)++;
!                 goto out;
          }
! 
!         assert(lim > 0);
! 
!         /*
!          * We have a numeric limit.  If the string is shorter than that,
!          * stop and return an error.  Else adjust our endpoint, length,
!          * and return the current glyph.
!          */
! 
!         if ((size_t)lim > strlen(&cp[i]))
                  return(ESCAPE_ERROR);
  
-         rlim = lim;
-         if (sz)
-                 *sz = rlim;
- 
-         *end = &cp[i] + lim;
- 
- out:
-         assert(rlim >= 0 && rstart);
- 
          /* Run post-processors. */
  
          switch (gly) {
          case (ESCAPE_FONT):
                  /*
!                  * Pretend that the constant-width font modes are the
!                  * same as the regular font modes.
                   */
!                 if (2 == rlim && 'C' == *rstart)
!                         rstart++;
!                 else if (1 != rlim)
                          break;
  
!                 switch (*rstart) {
                  case ('3'):
                          /* FALLTHROUGH */
                  case ('B'):
                          gly = ESCAPE_FONTBOLD;
                          break;
--- 233,329 ----
           * Sizes get a special category of their own.
           */
          case ('s'):
                  gly = ESCAPE_IGNORE;
  
                  /* See +/- counts as a sign. */
!                 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
!                         (*end)++;
  
!                 switch (**end) {
                  case ('('):
!                         *start = ++*end;
!                         *sz = 2;
                          break;
                  case ('['):
!                         *start = ++*end;
!                         term = ']';
                          break;
                  case ('\''):
!                         *start = ++*end;
!                         term = '\'';
                          break;
                  default:
!                         *sz = 1;
                          break;
                  }
  
                  break;
  
          /*
           * Anything else is assumed to be a glyph.
+          * In this case, pass back the character after the backslash.
           */
          default:
                  gly = ESCAPE_SPECIAL;
!                 *start = --*end;
!                 *sz = 1;
                  break;
          }
  
          assert(ESCAPE_ERROR != gly);
  
          /*
!          * Read up to the terminating character,
!          * paying attention to nested escapes.
           */
  
          if ('\0' != term) {
!                 while (**end != term) {
!                         switch (**end) {
!                         case ('\0'):
                                  return(ESCAPE_ERROR);
!                         case ('\\'):
                                  (*end)++;
!                                 if (ESCAPE_ERROR ==
!                                     mandoc_escape(end, NULL, NULL))
!                                         return(ESCAPE_ERROR);
!                                 break;
!                         default:
!                                 (*end)++;
!                                 break;
                          }
!                 }
!                 *sz = (*end)++ - *start;
!         } else {
!                 assert(*sz > 0);
!                 if ((size_t)*sz > strlen(*start))
                          return(ESCAPE_ERROR);
+                 *end += *sz;
+         }
  
          /* Run post-processors. */
  
          switch (gly) {
          case (ESCAPE_FONT):
+                 if (2 == *sz) {
+                         if ('C' == **start) {
                                  /*
!                                  * Treat constant-width font modes
!                                  * just like regular font modes.
                                   */
!                                 (*start)++;
!                                 (*sz)--;
!                         } else {
!                                 if ('B' == (*start)[0] && 'I' == (*start)[1])
!                                         gly = ESCAPE_FONTBI;
                                  break;
+                         }
+                 } else if (1 != *sz)
+                         break;
  
!                 switch (**start) {
                  case ('3'):
                          /* FALLTHROUGH */
                  case ('B'):
                          gly = ESCAPE_FONTBOLD;
                          break;
*** 398,410 ****
                          gly = ESCAPE_FONTROMAN;
                          break;
                  }
                  break;
          case (ESCAPE_SPECIAL):
!                 if (1 != rlim)
!                         break;
!                 if ('c' == *rstart)
                          gly = ESCAPE_NOSPACE;
                  break;
          default:
                  break;
          }
--- 341,351 ----
                          gly = ESCAPE_FONTROMAN;
                          break;
                  }
                  break;
          case (ESCAPE_SPECIAL):
!                 if (1 == *sz && 'c' == **start)
                          gly = ESCAPE_NOSPACE;
                  break;
          default:
                  break;
          }
*** 482,495 ****
  
  /*
   * Parse a quoted or unquoted roff-style request or macro argument.
   * Return a pointer to the parsed argument, which is either the original
   * pointer or advanced by one byte in case the argument is quoted.
!  * Null-terminate the argument in place.
   * Collapse pairs of quotes inside quoted arguments.
   * Advance the argument pointer to the next argument,
!  * or to the null byte terminating the argument line.
   */
  char *
  mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
  {
          char     *start, *cp;
--- 423,436 ----
  
  /*
   * Parse a quoted or unquoted roff-style request or macro argument.
   * Return a pointer to the parsed argument, which is either the original
   * pointer or advanced by one byte in case the argument is quoted.
!  * NUL-terminate the argument in place.
   * Collapse pairs of quotes inside quoted arguments.
   * Advance the argument pointer to the next argument,
!  * or to the NUL byte terminating the argument line.
   */
  char *
  mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
  {
          char     *start, *cp;
*** 504,524 ****
          } 
  
          pairs = 0;
          white = 0;
          for (cp = start; '\0' != *cp; cp++) {
!                 /* Move left after quoted quotes and escaped backslashes. */
                  if (pairs)
                          cp[-pairs] = cp[0];
                  if ('\\' == cp[0]) {
!                         if ('\\' == cp[1]) {
!                                 /* Poor man's copy mode. */
                                  pairs++;
                                  cp++;
!                         } else if (0 == quoted && ' ' == cp[1])
                                  /* Skip escaped blanks. */
                                  cp++;
                  } else if (0 == quoted) {
                          if (' ' == cp[0]) {
                                  /* Unescaped blanks end unquoted args. */
                                  white = 1;
                                  break;
--- 445,483 ----
          } 
  
          pairs = 0;
          white = 0;
          for (cp = start; '\0' != *cp; cp++) {
! 
!                 /*
!                  * Move the following text left
!                  * after quoted quotes and after "\\" and "\t".
!                  */
                  if (pairs)
                          cp[-pairs] = cp[0];
+ 
                  if ('\\' == cp[0]) {
!                         /*
!                          * In copy mode, translate double to single
!                          * backslashes and backslash-t to literal tabs.
!                          */
!                         switch (cp[1]) {
!                         case ('t'):
!                                 cp[0] = '\t';
!                                 /* FALLTHROUGH */
!                         case ('\\'):
                                  pairs++;
                                  cp++;
!                                 break;
!                         case (' '):
                                  /* Skip escaped blanks. */
+                                 if (0 == quoted)
                                          cp++;
+                                 break;
+                         default:
+                                 break;
+                         }
                  } else if (0 == quoted) {
                          if (' ' == cp[0]) {
                                  /* Unescaped blanks end unquoted args. */
                                  white = 1;
                                  break;
*** 538,548 ****
  
          /* Quoted argument without a closing quote. */
          if (1 == quoted)
                  mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
  
!         /* Null-terminate this argument and move to the next one. */
          if (pairs)
                  cp[-pairs] = '\0';
          if ('\0' != *cp) {
                  *cp++ = '\0';
                  while (' ' == *cp)
--- 497,507 ----
  
          /* Quoted argument without a closing quote. */
          if (1 == quoted)
                  mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
  
!         /* NUL-terminate this argument and move to the next one. */
          if (pairs)
                  cp[-pairs] = '\0';
          if ('\0' != *cp) {
                  *cp++ = '\0';
                  while (' ' == *cp)
*** 675,710 ****
          }
  
          return(found && !enclosed);
  }
  
- /*
-  * Find out whether a line is a macro line or not.  If it is, adjust the
-  * current position and return one; if it isn't, return zero and don't
-  * change the current position.
-  */
- int
- mandoc_getcontrol(const char *cp, int *ppos)
- {
-         int             pos;
- 
-         pos = *ppos;
- 
-         if ('\\' == cp[pos] && '.' == cp[pos + 1])
-                 pos += 2;
-         else if ('.' == cp[pos] || '\'' == cp[pos])
-                 pos++;
-         else
-                 return(0);
- 
-         while (' ' == cp[pos] || '\t' == cp[pos])
-                 pos++;
- 
-         *ppos = pos;
-         return(1);
- }
- 
  /*
   * Convert a string to a long that may not be <0.
   * If the string is invalid, or is less than 0, return -1.
   */
  int
--- 634,643 ----