Print this page
5051 import mdocml-1.12.3
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Approved by: TBD

*** 1,9 **** ! /* $Id: mandoc.c,v 1.62 2011/12/03 16:08:51 schwarze Exp $ */ /* * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> ! * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * --- 1,9 ---- ! /* $Id: mandoc.c,v 1.74 2013/12/30 18:30:32 schwarze Exp $ */ /* * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> ! * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. *
*** 35,150 **** #define DATESIZE 32 static int a2time(time_t *, const char *, const char *); static char *time2a(time_t); - static int numescape(const char *); ! /* ! * Pass over recursive numerical expressions. This context of this ! * function is important: it's only called within character-terminating ! * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial ! * recursion: we don't care about what's in these blocks. ! * This returns the number of characters skipped or -1 if an error ! * occurs (the caller should bail). ! */ ! static int ! numescape(const char *start) { ! int i; ! size_t sz; ! const char *cp; - i = 0; - - /* The expression consists of a subexpression. */ - - if ('\\' == start[i]) { - cp = &start[++i]; /* ! * Read past the end of the subexpression. ! * Bail immediately on errors. */ - if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) - return(-1); - return(i + cp - &start[i]); - } ! if ('(' != start[i++]) ! return(0); /* ! * A parenthesised subexpression. Read until the closing ! * parenthesis, making sure to handle any nested subexpressions ! * that might ruin our parse. */ - while (')' != start[i]) { - sz = strcspn(&start[i], ")\\"); - i += (int)sz; - - if ('\0' == start[i]) - return(-1); - else if ('\\' != start[i]) - continue; - - cp = &start[++i]; - if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) - return(-1); - i += cp - &start[i]; - } - - /* Read past the terminating ')'. */ - return(++i); - } - - enum mandoc_esc - mandoc_escape(const char **end, const char **start, int *sz) - { - char c, term, numeric; - int i, lim, ssz, rlim; - const char *cp, *rstart; - enum mandoc_esc gly; - - cp = *end; - rstart = cp; - if (start) - *start = rstart; - i = lim = 0; gly = ESCAPE_ERROR; ! term = numeric = '\0'; ! switch ((c = cp[i++])) { /* * First the glyphs. There are several different forms of * these, but each eventually returns a substring of the glyph * name. */ case ('('): gly = ESCAPE_SPECIAL; ! lim = 2; break; case ('['): gly = ESCAPE_SPECIAL; /* * Unicode escapes are defined in groff as \[uXXXX] to * \[u10FFFF], where the contained value must be a valid * Unicode codepoint. Here, however, only check whether * it's not a zero-width escape. */ ! if ('u' == cp[i] && ']' != cp[i + 1]) gly = ESCAPE_UNICODE; term = ']'; break; case ('C'): ! if ('\'' != cp[i]) return(ESCAPE_ERROR); gly = ESCAPE_SPECIAL; term = '\''; break; /* * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where * 'X' is the trigger. These have opaque sub-strings. */ case ('F'): /* FALLTHROUGH */ --- 35,126 ---- #define DATESIZE 32 static int a2time(time_t *, const char *, const char *); static char *time2a(time_t); ! ! enum mandoc_esc ! mandoc_escape(const char **end, const char **start, int *sz) { ! const char *local_start; ! int local_sz; ! char term; ! enum mandoc_esc gly; /* ! * When the caller doesn't provide return storage, ! * use local storage. */ ! if (NULL == start) ! start = &local_start; ! if (NULL == sz) ! sz = &local_sz; /* ! * Beyond the backslash, at least one input character ! * is part of the escape sequence. With one exception ! * (see below), that character won't be returned. */ gly = ESCAPE_ERROR; ! *start = ++*end; ! *sz = 0; ! term = '\0'; ! switch ((*start)[-1]) { /* * First the glyphs. There are several different forms of * these, but each eventually returns a substring of the glyph * name. */ case ('('): gly = ESCAPE_SPECIAL; ! *sz = 2; break; case ('['): gly = ESCAPE_SPECIAL; /* * Unicode escapes are defined in groff as \[uXXXX] to * \[u10FFFF], where the contained value must be a valid * Unicode codepoint. Here, however, only check whether * it's not a zero-width escape. */ ! if ('u' == (*start)[0] && ']' != (*start)[1]) gly = ESCAPE_UNICODE; term = ']'; break; case ('C'): ! if ('\'' != **start) return(ESCAPE_ERROR); + *start = ++*end; + if ('u' == (*start)[0] && '\'' != (*start)[1]) + gly = ESCAPE_UNICODE; + else gly = ESCAPE_SPECIAL; term = '\''; break; /* + * Escapes taking no arguments at all. + */ + case ('d'): + /* FALLTHROUGH */ + case ('u'): + return(ESCAPE_IGNORE); + + /* + * The \z escape is supposed to output the following + * character without advancing the cursor position. + * Since we are mostly dealing with terminal mode, + * let us just skip the next character. + */ + case ('z'): + return(ESCAPE_SKIPCHAR); + + /* * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where * 'X' is the trigger. These have opaque sub-strings. */ case ('F'): /* FALLTHROUGH */
*** 164,188 **** gly = ESCAPE_IGNORE; /* FALLTHROUGH */ case ('f'): if (ESCAPE_ERROR == gly) gly = ESCAPE_FONT; ! ! rstart= &cp[i]; ! if (start) ! *start = rstart; ! ! switch (cp[i++]) { case ('('): ! lim = 2; break; case ('['): term = ']'; break; default: ! lim = 1; ! i--; break; } break; /* --- 140,160 ---- gly = ESCAPE_IGNORE; /* FALLTHROUGH */ case ('f'): if (ESCAPE_ERROR == gly) gly = ESCAPE_FONT; ! switch (**start) { case ('('): ! *start = ++*end; ! *sz = 2; break; case ('['): + *start = ++*end; term = ']'; break; default: ! *sz = 1; break; } break; /*
*** 191,260 **** */ case ('A'): /* FALLTHROUGH */ case ('b'): /* FALLTHROUGH */ case ('D'): /* FALLTHROUGH */ case ('o'): /* FALLTHROUGH */ case ('R'): /* FALLTHROUGH */ case ('X'): /* FALLTHROUGH */ case ('Z'): ! if ('\'' != cp[i++]) return(ESCAPE_ERROR); gly = ESCAPE_IGNORE; term = '\''; break; /* * These escapes are of the form \X'N', where 'X' is the trigger * and 'N' resolves to a numerical expression. */ - case ('B'): - /* FALLTHROUGH */ case ('h'): /* FALLTHROUGH */ case ('H'): /* FALLTHROUGH */ case ('L'): /* FALLTHROUGH */ case ('l'): - gly = ESCAPE_NUMBERED; /* FALLTHROUGH */ case ('S'): /* FALLTHROUGH */ case ('v'): /* FALLTHROUGH */ - case ('w'): - /* FALLTHROUGH */ case ('x'): ! if (ESCAPE_ERROR == gly) ! gly = ESCAPE_IGNORE; ! if ('\'' != cp[i++]) return(ESCAPE_ERROR); ! term = numeric = '\''; break; /* * Special handling for the numbered character escape. * XXX Do any other escapes need similar handling? */ case ('N'): ! if ('\0' == cp[i]) return(ESCAPE_ERROR); ! *end = &cp[++i]; ! if (isdigit((unsigned char)cp[i-1])) return(ESCAPE_IGNORE); while (isdigit((unsigned char)**end)) (*end)++; ! if (start) ! *start = &cp[i]; ! if (sz) ! *sz = *end - &cp[i]; if ('\0' != **end) (*end)++; return(ESCAPE_NUMBERED); /* --- 163,232 ---- */ case ('A'): /* FALLTHROUGH */ case ('b'): /* FALLTHROUGH */ + case ('B'): + /* FALLTHROUGH */ case ('D'): /* FALLTHROUGH */ case ('o'): /* FALLTHROUGH */ case ('R'): /* FALLTHROUGH */ + case ('w'): + /* FALLTHROUGH */ case ('X'): /* FALLTHROUGH */ case ('Z'): ! if ('\'' != **start) return(ESCAPE_ERROR); gly = ESCAPE_IGNORE; + *start = ++*end; term = '\''; break; /* * These escapes are of the form \X'N', where 'X' is the trigger * and 'N' resolves to a numerical expression. */ case ('h'): /* FALLTHROUGH */ case ('H'): /* FALLTHROUGH */ case ('L'): /* FALLTHROUGH */ case ('l'): /* FALLTHROUGH */ case ('S'): /* FALLTHROUGH */ case ('v'): /* FALLTHROUGH */ case ('x'): ! if ('\'' != **start) return(ESCAPE_ERROR); ! gly = ESCAPE_IGNORE; ! *start = ++*end; ! term = '\''; break; /* * Special handling for the numbered character escape. * XXX Do any other escapes need similar handling? */ case ('N'): ! if ('\0' == **start) return(ESCAPE_ERROR); ! (*end)++; ! if (isdigit((unsigned char)**start)) { ! *sz = 1; return(ESCAPE_IGNORE); + } + (*start)++; while (isdigit((unsigned char)**end)) (*end)++; ! *sz = *end - *start; if ('\0' != **end) (*end)++; return(ESCAPE_NUMBERED); /*
*** 261,386 **** * Sizes get a special category of their own. */ case ('s'): gly = ESCAPE_IGNORE; - rstart = &cp[i]; - if (start) - *start = rstart; - /* See +/- counts as a sign. */ ! c = cp[i]; ! if ('+' == c || '-' == c || ASCII_HYPH == c) ! ++i; ! switch (cp[i++]) { case ('('): ! lim = 2; break; case ('['): ! term = numeric = ']'; break; case ('\''): ! term = numeric = '\''; break; default: ! lim = 1; ! i--; break; } - /* See +/- counts as a sign. */ - c = cp[i]; - if ('+' == c || '-' == c || ASCII_HYPH == c) - ++i; - break; /* * Anything else is assumed to be a glyph. */ default: gly = ESCAPE_SPECIAL; ! lim = 1; ! i--; break; } assert(ESCAPE_ERROR != gly); - rstart = &cp[i]; - if (start) - *start = rstart; - /* ! * If a terminating block has been specified, we need to ! * handle the case of recursion, which could have their ! * own terminating blocks that mess up our parse. This, by the ! * way, means that the "start" and "size" values will be ! * effectively meaningless. */ - ssz = 0; - if (numeric && -1 == (ssz = numescape(&cp[i]))) - return(ESCAPE_ERROR); - - i += ssz; - rlim = -1; - - /* - * We have a character terminator. Try to read up to that - * character. If we can't (i.e., we hit the nil), then return - * an error; if we can, calculate our length, read past the - * terminating character, and exit. - */ - if ('\0' != term) { ! *end = strchr(&cp[i], term); ! if ('\0' == *end) return(ESCAPE_ERROR); ! ! rlim = *end - &cp[i]; ! if (sz) ! *sz = rlim; (*end)++; ! goto out; } ! ! assert(lim > 0); ! ! /* ! * We have a numeric limit. If the string is shorter than that, ! * stop and return an error. Else adjust our endpoint, length, ! * and return the current glyph. ! */ ! ! if ((size_t)lim > strlen(&cp[i])) return(ESCAPE_ERROR); - rlim = lim; - if (sz) - *sz = rlim; - - *end = &cp[i] + lim; - - out: - assert(rlim >= 0 && rstart); - /* Run post-processors. */ switch (gly) { case (ESCAPE_FONT): /* ! * Pretend that the constant-width font modes are the ! * same as the regular font modes. */ ! if (2 == rlim && 'C' == *rstart) ! rstart++; ! else if (1 != rlim) break; ! switch (*rstart) { case ('3'): /* FALLTHROUGH */ case ('B'): gly = ESCAPE_FONTBOLD; break; --- 233,329 ---- * Sizes get a special category of their own. */ case ('s'): gly = ESCAPE_IGNORE; /* See +/- counts as a sign. */ ! if ('+' == **end || '-' == **end || ASCII_HYPH == **end) ! (*end)++; ! switch (**end) { case ('('): ! *start = ++*end; ! *sz = 2; break; case ('['): ! *start = ++*end; ! term = ']'; break; case ('\''): ! *start = ++*end; ! term = '\''; break; default: ! *sz = 1; break; } break; /* * Anything else is assumed to be a glyph. + * In this case, pass back the character after the backslash. */ default: gly = ESCAPE_SPECIAL; ! *start = --*end; ! *sz = 1; break; } assert(ESCAPE_ERROR != gly); /* ! * Read up to the terminating character, ! * paying attention to nested escapes. */ if ('\0' != term) { ! while (**end != term) { ! switch (**end) { ! case ('\0'): return(ESCAPE_ERROR); ! case ('\\'): (*end)++; ! if (ESCAPE_ERROR == ! mandoc_escape(end, NULL, NULL)) ! return(ESCAPE_ERROR); ! break; ! default: ! (*end)++; ! break; } ! } ! *sz = (*end)++ - *start; ! } else { ! assert(*sz > 0); ! if ((size_t)*sz > strlen(*start)) return(ESCAPE_ERROR); + *end += *sz; + } /* Run post-processors. */ switch (gly) { case (ESCAPE_FONT): + if (2 == *sz) { + if ('C' == **start) { /* ! * Treat constant-width font modes ! * just like regular font modes. */ ! (*start)++; ! (*sz)--; ! } else { ! if ('B' == (*start)[0] && 'I' == (*start)[1]) ! gly = ESCAPE_FONTBI; break; + } + } else if (1 != *sz) + break; ! switch (**start) { case ('3'): /* FALLTHROUGH */ case ('B'): gly = ESCAPE_FONTBOLD; break;
*** 398,410 **** gly = ESCAPE_FONTROMAN; break; } break; case (ESCAPE_SPECIAL): ! if (1 != rlim) ! break; ! if ('c' == *rstart) gly = ESCAPE_NOSPACE; break; default: break; } --- 341,351 ---- gly = ESCAPE_FONTROMAN; break; } break; case (ESCAPE_SPECIAL): ! if (1 == *sz && 'c' == **start) gly = ESCAPE_NOSPACE; break; default: break; }
*** 482,495 **** /* * Parse a quoted or unquoted roff-style request or macro argument. * Return a pointer to the parsed argument, which is either the original * pointer or advanced by one byte in case the argument is quoted. ! * Null-terminate the argument in place. * Collapse pairs of quotes inside quoted arguments. * Advance the argument pointer to the next argument, ! * or to the null byte terminating the argument line. */ char * mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) { char *start, *cp; --- 423,436 ---- /* * Parse a quoted or unquoted roff-style request or macro argument. * Return a pointer to the parsed argument, which is either the original * pointer or advanced by one byte in case the argument is quoted. ! * NUL-terminate the argument in place. * Collapse pairs of quotes inside quoted arguments. * Advance the argument pointer to the next argument, ! * or to the NUL byte terminating the argument line. */ char * mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos) { char *start, *cp;
*** 504,524 **** } pairs = 0; white = 0; for (cp = start; '\0' != *cp; cp++) { ! /* Move left after quoted quotes and escaped backslashes. */ if (pairs) cp[-pairs] = cp[0]; if ('\\' == cp[0]) { ! if ('\\' == cp[1]) { ! /* Poor man's copy mode. */ pairs++; cp++; ! } else if (0 == quoted && ' ' == cp[1]) /* Skip escaped blanks. */ cp++; } else if (0 == quoted) { if (' ' == cp[0]) { /* Unescaped blanks end unquoted args. */ white = 1; break; --- 445,483 ---- } pairs = 0; white = 0; for (cp = start; '\0' != *cp; cp++) { ! ! /* ! * Move the following text left ! * after quoted quotes and after "\\" and "\t". ! */ if (pairs) cp[-pairs] = cp[0]; + if ('\\' == cp[0]) { ! /* ! * In copy mode, translate double to single ! * backslashes and backslash-t to literal tabs. ! */ ! switch (cp[1]) { ! case ('t'): ! cp[0] = '\t'; ! /* FALLTHROUGH */ ! case ('\\'): pairs++; cp++; ! break; ! case (' '): /* Skip escaped blanks. */ + if (0 == quoted) cp++; + break; + default: + break; + } } else if (0 == quoted) { if (' ' == cp[0]) { /* Unescaped blanks end unquoted args. */ white = 1; break;
*** 538,548 **** /* Quoted argument without a closing quote. */ if (1 == quoted) mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); ! /* Null-terminate this argument and move to the next one. */ if (pairs) cp[-pairs] = '\0'; if ('\0' != *cp) { *cp++ = '\0'; while (' ' == *cp) --- 497,507 ---- /* Quoted argument without a closing quote. */ if (1 == quoted) mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL); ! /* NUL-terminate this argument and move to the next one. */ if (pairs) cp[-pairs] = '\0'; if ('\0' != *cp) { *cp++ = '\0'; while (' ' == *cp)
*** 675,710 **** } return(found && !enclosed); } - /* - * Find out whether a line is a macro line or not. If it is, adjust the - * current position and return one; if it isn't, return zero and don't - * change the current position. - */ - int - mandoc_getcontrol(const char *cp, int *ppos) - { - int pos; - - pos = *ppos; - - if ('\\' == cp[pos] && '.' == cp[pos + 1]) - pos += 2; - else if ('.' == cp[pos] || '\'' == cp[pos]) - pos++; - else - return(0); - - while (' ' == cp[pos] || '\t' == cp[pos]) - pos++; - - *ppos = pos; - return(1); - } - /* * Convert a string to a long that may not be <0. * If the string is invalid, or is less than 0, return -1. */ int --- 634,643 ----