Print this page
5051 import mdocml-1.12.3
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Approved by: TBD

@@ -1,9 +1,9 @@
-/*      $Id: mandoc.c,v 1.62 2011/12/03 16:08:51 schwarze Exp $ */
+/*      $Id: mandoc.c,v 1.74 2013/12/30 18:30:32 schwarze Exp $ */
 /*
  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
- * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *

@@ -35,116 +35,92 @@
 
 #define DATESIZE 32
 
 static  int      a2time(time_t *, const char *, const char *);
 static  char    *time2a(time_t);
-static  int      numescape(const char *);
 
-/*
- * Pass over recursive numerical expressions.  This context of this
- * function is important: it's only called within character-terminating
- * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
- * recursion: we don't care about what's in these blocks. 
- * This returns the number of characters skipped or -1 if an error
- * occurs (the caller should bail).
- */
-static int
-numescape(const char *start)
+
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
 {
-        int              i;
-        size_t           sz;
-        const char      *cp;
+        const char      *local_start;
+        int              local_sz;
+        char             term;
+        enum mandoc_esc  gly; 
 
-        i = 0;
-
-        /* The expression consists of a subexpression. */
-
-        if ('\\' == start[i]) {
-                cp = &start[++i];
                 /*
-                 * Read past the end of the subexpression.
-                 * Bail immediately on errors.
+         * When the caller doesn't provide return storage,
+         * use local storage.
                  */
-                if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
-                        return(-1);
-                return(i + cp - &start[i]);
-        } 
 
-        if ('(' != start[i++])
-                return(0);
+        if (NULL == start)
+                start = &local_start;
+        if (NULL == sz)
+                sz = &local_sz;
 
         /*
-         * A parenthesised subexpression.  Read until the closing
-         * parenthesis, making sure to handle any nested subexpressions
-         * that might ruin our parse.
+         * Beyond the backslash, at least one input character
+         * is part of the escape sequence.  With one exception
+         * (see below), that character won't be returned.
          */
 
-        while (')' != start[i]) {
-                sz = strcspn(&start[i], ")\\");
-                i += (int)sz;
-
-                if ('\0' == start[i])
-                        return(-1);
-                else if ('\\' != start[i])
-                        continue;
-
-                cp = &start[++i];
-                if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
-                        return(-1);
-                i += cp - &start[i];
-        }
-
-        /* Read past the terminating ')'. */
-        return(++i);
-}
-
-enum mandoc_esc
-mandoc_escape(const char **end, const char **start, int *sz)
-{
-        char             c, term, numeric;
-        int              i, lim, ssz, rlim;
-        const char      *cp, *rstart;
-        enum mandoc_esc  gly; 
-
-        cp = *end;
-        rstart = cp;
-        if (start)
-                *start = rstart;
-        i = lim = 0;
         gly = ESCAPE_ERROR;
-        term = numeric = '\0';
+        *start = ++*end;
+        *sz = 0;
+        term = '\0';
 
-        switch ((c = cp[i++])) {
+        switch ((*start)[-1]) {
         /*
          * First the glyphs.  There are several different forms of
          * these, but each eventually returns a substring of the glyph
          * name.
          */
         case ('('):
                 gly = ESCAPE_SPECIAL;
-                lim = 2;
+                *sz = 2;
                 break;
         case ('['):
                 gly = ESCAPE_SPECIAL;
                 /*
                  * Unicode escapes are defined in groff as \[uXXXX] to
                  * \[u10FFFF], where the contained value must be a valid
                  * Unicode codepoint.  Here, however, only check whether
                  * it's not a zero-width escape.
                  */
-                if ('u' == cp[i] && ']' != cp[i + 1])
+                if ('u' == (*start)[0] && ']' != (*start)[1])
                         gly = ESCAPE_UNICODE;
                 term = ']';
                 break;
         case ('C'):
-                if ('\'' != cp[i])
+                if ('\'' != **start)
                         return(ESCAPE_ERROR);
+                *start = ++*end;
+                if ('u' == (*start)[0] && '\'' != (*start)[1])
+                        gly = ESCAPE_UNICODE;
+                else
                 gly = ESCAPE_SPECIAL;
                 term = '\'';
                 break;
 
         /*
+         * Escapes taking no arguments at all.
+         */
+        case ('d'):
+                /* FALLTHROUGH */
+        case ('u'):
+                return(ESCAPE_IGNORE);
+
+        /*
+         * The \z escape is supposed to output the following
+         * character without advancing the cursor position.  
+         * Since we are mostly dealing with terminal mode,
+         * let us just skip the next character.
+         */
+        case ('z'):
+                return(ESCAPE_SKIPCHAR);
+
+        /*
          * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
          * 'X' is the trigger.  These have opaque sub-strings.
          */
         case ('F'):
                 /* FALLTHROUGH */

@@ -164,25 +140,21 @@
                 gly = ESCAPE_IGNORE;
                 /* FALLTHROUGH */
         case ('f'):
                 if (ESCAPE_ERROR == gly)
                         gly = ESCAPE_FONT;
-
-                rstart= &cp[i];
-                if (start) 
-                        *start = rstart;
-
-                switch (cp[i++]) {
+                switch (**start) {
                 case ('('):
-                        lim = 2;
+                        *start = ++*end;
+                        *sz = 2;
                         break;
                 case ('['):
+                        *start = ++*end;
                         term = ']';
                         break;
                 default:
-                        lim = 1;
-                        i--;
+                        *sz = 1;
                         break;
                 }
                 break;
 
         /*

@@ -191,70 +163,70 @@
          */
         case ('A'):
                 /* FALLTHROUGH */
         case ('b'):
                 /* FALLTHROUGH */
+        case ('B'):
+                /* FALLTHROUGH */
         case ('D'):
                 /* FALLTHROUGH */
         case ('o'):
                 /* FALLTHROUGH */
         case ('R'):
                 /* FALLTHROUGH */
+        case ('w'):
+                /* FALLTHROUGH */
         case ('X'):
                 /* FALLTHROUGH */
         case ('Z'):
-                if ('\'' != cp[i++])
+                if ('\'' != **start)
                         return(ESCAPE_ERROR);
                 gly = ESCAPE_IGNORE;
+                *start = ++*end;
                 term = '\'';
                 break;
 
         /*
          * These escapes are of the form \X'N', where 'X' is the trigger
          * and 'N' resolves to a numerical expression.
          */
-        case ('B'):
-                /* FALLTHROUGH */
         case ('h'):
                 /* FALLTHROUGH */
         case ('H'):
                 /* FALLTHROUGH */
         case ('L'):
                 /* FALLTHROUGH */
         case ('l'):
-                gly = ESCAPE_NUMBERED;
                 /* FALLTHROUGH */
         case ('S'):
                 /* FALLTHROUGH */
         case ('v'):
                 /* FALLTHROUGH */
-        case ('w'):
-                /* FALLTHROUGH */
         case ('x'):
-                if (ESCAPE_ERROR == gly)
-                        gly = ESCAPE_IGNORE;
-                if ('\'' != cp[i++])
+                if ('\'' != **start)
                         return(ESCAPE_ERROR);
-                term = numeric = '\'';
+                gly = ESCAPE_IGNORE;
+                *start = ++*end;
+                term = '\'';
                 break;
 
         /*
          * Special handling for the numbered character escape.
          * XXX Do any other escapes need similar handling?
          */
         case ('N'):
-                if ('\0' == cp[i])
+                if ('\0' == **start)
                         return(ESCAPE_ERROR);
-                *end = &cp[++i];
-                if (isdigit((unsigned char)cp[i-1]))
+                (*end)++;
+                if (isdigit((unsigned char)**start)) {
+                        *sz = 1;
                         return(ESCAPE_IGNORE);
+                }
+                (*start)++;
                 while (isdigit((unsigned char)**end))
                         (*end)++;
-                if (start)
-                        *start = &cp[i];
-                if (sz)
-                        *sz = *end - &cp[i];
+                *sz = *end - *start;
                 if ('\0' != **end)
                         (*end)++;
                 return(ESCAPE_NUMBERED);
 
         /* 

@@ -261,126 +233,97 @@
          * Sizes get a special category of their own.
          */
         case ('s'):
                 gly = ESCAPE_IGNORE;
 
-                rstart = &cp[i];
-                if (start) 
-                        *start = rstart;
-
                 /* See +/- counts as a sign. */
-                c = cp[i];
-                if ('+' == c || '-' == c || ASCII_HYPH == c)
-                        ++i;
+                if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
+                        (*end)++;
 
-                switch (cp[i++]) {
+                switch (**end) {
                 case ('('):
-                        lim = 2;
+                        *start = ++*end;
+                        *sz = 2;
                         break;
                 case ('['):
-                        term = numeric = ']';
+                        *start = ++*end;
+                        term = ']';
                         break;
                 case ('\''):
-                        term = numeric = '\'';
+                        *start = ++*end;
+                        term = '\'';
                         break;
                 default:
-                        lim = 1;
-                        i--;
+                        *sz = 1;
                         break;
                 }
 
-                /* See +/- counts as a sign. */
-                c = cp[i];
-                if ('+' == c || '-' == c || ASCII_HYPH == c)
-                        ++i;
-
                 break;
 
         /*
          * Anything else is assumed to be a glyph.
+         * In this case, pass back the character after the backslash.
          */
         default:
                 gly = ESCAPE_SPECIAL;
-                lim = 1;
-                i--;
+                *start = --*end;
+                *sz = 1;
                 break;
         }
 
         assert(ESCAPE_ERROR != gly);
 
-        rstart = &cp[i];
-        if (start)
-                *start = rstart;
-
         /*
-         * If a terminating block has been specified, we need to
-         * handle the case of recursion, which could have their
-         * own terminating blocks that mess up our parse.  This, by the
-         * way, means that the "start" and "size" values will be
-         * effectively meaningless.
+         * Read up to the terminating character,
+         * paying attention to nested escapes.
          */
 
-        ssz = 0;
-        if (numeric && -1 == (ssz = numescape(&cp[i])))
-                return(ESCAPE_ERROR);
-
-        i += ssz;
-        rlim = -1;
-
-        /*
-         * We have a character terminator.  Try to read up to that
-         * character.  If we can't (i.e., we hit the nil), then return
-         * an error; if we can, calculate our length, read past the
-         * terminating character, and exit.
-         */
-
         if ('\0' != term) {
-                *end = strchr(&cp[i], term);
-                if ('\0' == *end)
+                while (**end != term) {
+                        switch (**end) {
+                        case ('\0'):
                         return(ESCAPE_ERROR);
-
-                rlim = *end - &cp[i];
-                if (sz)
-                        *sz = rlim;
+                        case ('\\'):
                 (*end)++;
-                goto out;
+                                if (ESCAPE_ERROR ==
+                                    mandoc_escape(end, NULL, NULL))
+                                        return(ESCAPE_ERROR);
+                                break;
+                        default:
+                                (*end)++;
+                                break;
         }
-
-        assert(lim > 0);
-
-        /*
-         * We have a numeric limit.  If the string is shorter than that,
-         * stop and return an error.  Else adjust our endpoint, length,
-         * and return the current glyph.
-         */
-
-        if ((size_t)lim > strlen(&cp[i]))
+                }
+                *sz = (*end)++ - *start;
+        } else {
+                assert(*sz > 0);
+                if ((size_t)*sz > strlen(*start))
                 return(ESCAPE_ERROR);
+                *end += *sz;
+        }
 
-        rlim = lim;
-        if (sz)
-                *sz = rlim;
-
-        *end = &cp[i] + lim;
-
-out:
-        assert(rlim >= 0 && rstart);
-
         /* Run post-processors. */
 
         switch (gly) {
         case (ESCAPE_FONT):
+                if (2 == *sz) {
+                        if ('C' == **start) {
                 /*
-                 * Pretend that the constant-width font modes are the
-                 * same as the regular font modes.
+                                 * Treat constant-width font modes
+                                 * just like regular font modes.
                  */
-                if (2 == rlim && 'C' == *rstart)
-                        rstart++;
-                else if (1 != rlim)
+                                (*start)++;
+                                (*sz)--;
+                        } else {
+                                if ('B' == (*start)[0] && 'I' == (*start)[1])
+                                        gly = ESCAPE_FONTBI;
                         break;
+                        }
+                } else if (1 != *sz)
+                        break;
 
-                switch (*rstart) {
+                switch (**start) {
                 case ('3'):
                         /* FALLTHROUGH */
                 case ('B'):
                         gly = ESCAPE_FONTBOLD;
                         break;

@@ -398,13 +341,11 @@
                         gly = ESCAPE_FONTROMAN;
                         break;
                 }
                 break;
         case (ESCAPE_SPECIAL):
-                if (1 != rlim)
-                        break;
-                if ('c' == *rstart)
+                if (1 == *sz && 'c' == **start)
                         gly = ESCAPE_NOSPACE;
                 break;
         default:
                 break;
         }

@@ -482,14 +423,14 @@
 
 /*
  * Parse a quoted or unquoted roff-style request or macro argument.
  * Return a pointer to the parsed argument, which is either the original
  * pointer or advanced by one byte in case the argument is quoted.
- * Null-terminate the argument in place.
+ * NUL-terminate the argument in place.
  * Collapse pairs of quotes inside quoted arguments.
  * Advance the argument pointer to the next argument,
- * or to the null byte terminating the argument line.
+ * or to the NUL byte terminating the argument line.
  */
 char *
 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
 {
         char     *start, *cp;

@@ -504,21 +445,39 @@
         } 
 
         pairs = 0;
         white = 0;
         for (cp = start; '\0' != *cp; cp++) {
-                /* Move left after quoted quotes and escaped backslashes. */
+
+                /*
+                 * Move the following text left
+                 * after quoted quotes and after "\\" and "\t".
+                 */
                 if (pairs)
                         cp[-pairs] = cp[0];
+
                 if ('\\' == cp[0]) {
-                        if ('\\' == cp[1]) {
-                                /* Poor man's copy mode. */
+                        /*
+                         * In copy mode, translate double to single
+                         * backslashes and backslash-t to literal tabs.
+                         */
+                        switch (cp[1]) {
+                        case ('t'):
+                                cp[0] = '\t';
+                                /* FALLTHROUGH */
+                        case ('\\'):
                                 pairs++;
                                 cp++;
-                        } else if (0 == quoted && ' ' == cp[1])
+                                break;
+                        case (' '):
                                 /* Skip escaped blanks. */
+                                if (0 == quoted)
                                 cp++;
+                                break;
+                        default:
+                                break;
+                        }
                 } else if (0 == quoted) {
                         if (' ' == cp[0]) {
                                 /* Unescaped blanks end unquoted args. */
                                 white = 1;
                                 break;

@@ -538,11 +497,11 @@
 
         /* Quoted argument without a closing quote. */
         if (1 == quoted)
                 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
 
-        /* Null-terminate this argument and move to the next one. */
+        /* NUL-terminate this argument and move to the next one. */
         if (pairs)
                 cp[-pairs] = '\0';
         if ('\0' != *cp) {
                 *cp++ = '\0';
                 while (' ' == *cp)

@@ -675,36 +634,10 @@
         }
 
         return(found && !enclosed);
 }
 
-/*
- * Find out whether a line is a macro line or not.  If it is, adjust the
- * current position and return one; if it isn't, return zero and don't
- * change the current position.
- */
-int
-mandoc_getcontrol(const char *cp, int *ppos)
-{
-        int             pos;
-
-        pos = *ppos;
-
-        if ('\\' == cp[pos] && '.' == cp[pos + 1])
-                pos += 2;
-        else if ('.' == cp[pos] || '\'' == cp[pos])
-                pos++;
-        else
-                return(0);
-
-        while (' ' == cp[pos] || '\t' == cp[pos])
-                pos++;
-
-        *ppos = pos;
-        return(1);
-}
-
 /*
  * Convert a string to a long that may not be <0.
  * If the string is invalid, or is less than 0, return -1.
  */
 int