1 /* quotearg.c - quote arguments for output
   2 
   3    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2004, 2005, 2006, 2007 Free
   4    Software Foundation, Inc.
   5 
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10 
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15 
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  19 
  20 /* Written by Paul Eggert <eggert@twinsun.com> */
  21 
  22 #include <config.h>
  23 
  24 #include "quotearg.h"
  25 
  26 #include "xalloc.h"
  27 
  28 #include <ctype.h>
  29 #include <errno.h>
  30 #include <limits.h>
  31 #include <stdbool.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #include <wchar.h>
  35 #include <wctype.h>
  36 
  37 #include "gettext.h"
  38 #define _(msgid) gettext (msgid)
  39 #define N_(msgid) msgid
  40 
#if !HAVE_MBRTOWC
/* Disable multibyte processing entirely.  Since MB_CUR_MAX is 1, the
   other macros are defined only for documentation and to satisfy C
   syntax.

   The substitute mbrtowc below copies the single byte *S into *PWC
   and evaluates to 1 if that byte is nonzero and to 0 otherwise; it
   ignores N and PS.  The substitute iswprint defers to isprint on the
   value converted to unsigned char.  HAVE_MBSINIT is undefined so
   that the trivial mbsinit fallback further down is selected.  */
# undef MB_CUR_MAX
# define MB_CUR_MAX 1
# undef mbstate_t
# define mbstate_t int
# define mbrtowc(pwc, s, n, ps) ((*(pwc) = *(s)) != 0)
# define iswprint(wc) isprint ((unsigned char) (wc))
# undef HAVE_MBSINIT
#endif

/* When no mbsinit is available (neither as a macro nor per
   HAVE_MBSINIT), treat every conversion state as the initial state.  */
#if !defined mbsinit && !HAVE_MBSINIT
# define mbsinit(ps) 1
#endif

/* Fallback for systems whose headers do not define SIZE_MAX.  */
#ifndef SIZE_MAX
# define SIZE_MAX ((size_t) -1)
#endif

/* Number of bits in an int; used to index the per-character bit
   vector in struct quoting_options.  */
#define INT_BITS (sizeof (int) * CHAR_BIT)
  63 
/* A set of quoting options: a base style plus a per-byte override
   mask.  */
struct quoting_options
{
  /* Basic quoting style.  */
  enum quoting_style style;

  /* Quote the characters indicated by this bit vector even if the
     quoting style would not normally require them to be quoted.
     The bit for unsigned char value UC lives in word UC / INT_BITS at
     bit position UC % INT_BITS; the array holds one bit for each of
     the UCHAR_MAX + 1 possible byte values.  */
  unsigned int quote_these_too[(UCHAR_MAX / INT_BITS) + 1];
};
  73 
/* Names of quoting styles.  The list is terminated by a null pointer,
   and its entries appear in the same order as the values in
   quoting_style_vals.  */
char const *const quoting_style_args[] =
{
  "literal",
  "shell",
  "shell-always",
  "c",
  "escape",
  "locale",
  "clocale",
  0
};
  86 
/* Correspondences to quoting style names: element I is the style
   named by quoting_style_args[I].  Unlike the name table, this array
   has no terminating entry.  */
enum quoting_style const quoting_style_vals[] =
{
  literal_quoting_style,
  shell_quoting_style,
  shell_always_quoting_style,
  c_quoting_style,
  escape_quoting_style,
  locale_quoting_style,
  clocale_quoting_style
};
  98 
/* The default quoting options, used whenever a caller passes a null
   options pointer.  Having static storage duration, it starts out
   zero-initialized; it is modified by set_quoting_style and
   set_char_quoting when they are called with a null O.  */
static struct quoting_options default_quoting_options;
 101 
 102 /* Allocate a new set of quoting options, with contents initially identical
 103    to O if O is not null, or to the default if O is null.
 104    It is the caller's responsibility to free the result.  */
 105 struct quoting_options *
 106 clone_quoting_options (struct quoting_options *o)
 107 {
 108   int e = errno;
 109   struct quoting_options *p = xmemdup (o ? o : &default_quoting_options,
 110                                        sizeof *o);
 111   errno = e;
 112   return p;
 113 }
 114 
 115 /* Get the value of O's quoting style.  If O is null, use the default.  */
 116 enum quoting_style
 117 get_quoting_style (struct quoting_options *o)
 118 {
 119   return (o ? o : &default_quoting_options)->style;
 120 }
 121 
 122 /* In O (or in the default if O is null),
 123    set the value of the quoting style to S.  */
 124 void
 125 set_quoting_style (struct quoting_options *o, enum quoting_style s)
 126 {
 127   (o ? o : &default_quoting_options)->style = s;
 128 }
 129 
 130 /* In O (or in the default if O is null),
 131    set the value of the quoting options for character C to I.
 132    Return the old value.  Currently, the only values defined for I are
 133    0 (the default) and 1 (which means to quote the character even if
 134    it would not otherwise be quoted).  */
 135 int
 136 set_char_quoting (struct quoting_options *o, char c, int i)
 137 {
 138   unsigned char uc = c;
 139   unsigned int *p =
 140     (o ? o : &default_quoting_options)->quote_these_too + uc / INT_BITS;
 141   int shift = uc % INT_BITS;
 142   int r = (*p >> shift) & 1;
 143   *p ^= ((i & 1) ^ r) << shift;
 144   return r;
 145 }
 146 
 147 /* MSGID approximates a quotation mark.  Return its translation if it
 148    has one; otherwise, return either it or "\"", depending on S.  */
 149 static char const *
 150 gettext_quote (char const *msgid, enum quoting_style s)
 151 {
 152   char const *translation = _(msgid);
 153   if (translation == msgid && s == clocale_quoting_style)
 154     translation = "\"";
 155   return translation;
 156 }
 157 
 158 /* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
 159    argument ARG (of size ARGSIZE), using QUOTING_STYLE and the
 160    non-quoting-style part of O to control quoting.
 161    Terminate the output with a null character, and return the written
 162    size of the output, not counting the terminating null.
 163    If BUFFERSIZE is too small to store the output string, return the
 164    value that would have been returned had BUFFERSIZE been large enough.
 165    If ARGSIZE is SIZE_MAX, use the string length of the argument for ARGSIZE.
 166 
 167    This function acts like quotearg_buffer (BUFFER, BUFFERSIZE, ARG,
 168    ARGSIZE, O), except it uses QUOTING_STYLE instead of the quoting
 169    style specified by O, and O may not be null.  */
 170 
 171 static size_t
 172 quotearg_buffer_restyled (char *buffer, size_t buffersize,
 173                           char const *arg, size_t argsize,
 174                           enum quoting_style quoting_style,
 175                           struct quoting_options const *o)
 176 {
 177   size_t i;
 178   size_t len = 0;
 179   char const *quote_string = 0;
 180   size_t quote_string_len = 0;
 181   bool backslash_escapes = false;
 182   bool unibyte_locale = MB_CUR_MAX == 1;
 183 
 184 #define STORE(c) \
 185     do \
 186       { \
 187         if (len < buffersize) \
 188           buffer[len] = (c); \
 189         len++; \
 190       } \
 191     while (0)
 192 
 193   switch (quoting_style)
 194     {
 195     case c_quoting_style:
 196       STORE ('"');
 197       backslash_escapes = true;
 198       quote_string = "\"";
 199       quote_string_len = 1;
 200       break;
 201 
 202     case escape_quoting_style:
 203       backslash_escapes = true;
 204       break;
 205 
 206     case locale_quoting_style:
 207     case clocale_quoting_style:
 208       {
 209         /* TRANSLATORS:
 210            Get translations for open and closing quotation marks.
 211 
 212            The message catalog should translate "`" to a left
 213            quotation mark suitable for the locale, and similarly for
 214            "'".  If the catalog has no translation,
 215            locale_quoting_style quotes `like this', and
 216            clocale_quoting_style quotes "like this".
 217 
 218            For example, an American English Unicode locale should
 219            translate "`" to U+201C (LEFT DOUBLE QUOTATION MARK), and
 220            should translate "'" to U+201D (RIGHT DOUBLE QUOTATION
 221            MARK).  A British English Unicode locale should instead
 222            translate these to U+2018 (LEFT SINGLE QUOTATION MARK) and
 223            U+2019 (RIGHT SINGLE QUOTATION MARK), respectively.
 224 
 225            If you don't know what to put here, please see
 226            <http://en.wikipedia.org/wiki/Quotation_mark#Glyphs>
 227            and use glyphs suitable for your language.  */
 228 
 229         char const *left = gettext_quote (N_("`"), quoting_style);
 230         char const *right = gettext_quote (N_("'"), quoting_style);
 231         for (quote_string = left; *quote_string; quote_string++)
 232           STORE (*quote_string);
 233         backslash_escapes = true;
 234         quote_string = right;
 235         quote_string_len = strlen (quote_string);
 236       }
 237       break;
 238 
 239     case shell_always_quoting_style:
 240       STORE ('\'');
 241       quote_string = "'";
 242       quote_string_len = 1;
 243       break;
 244 
 245     default:
 246       break;
 247     }
 248 
 249   for (i = 0;  ! (argsize == SIZE_MAX ? arg[i] == '\0' : i == argsize);  i++)
 250     {
 251       unsigned char c;
 252       unsigned char esc;
 253 
 254       if (backslash_escapes
 255           && quote_string_len
 256           && i + quote_string_len <= argsize
 257           && memcmp (arg + i, quote_string, quote_string_len) == 0)
 258         STORE ('\\');
 259 
 260       c = arg[i];
 261       switch (c)
 262         {
 263         case '\0':
 264           if (backslash_escapes)
 265             {
 266               STORE ('\\');
 267               STORE ('0');
 268               STORE ('0');
 269               c = '0';
 270             }
 271           break;
 272 
 273         case '?':
 274           switch (quoting_style)
 275             {
 276             case shell_quoting_style:
 277               goto use_shell_always_quoting_style;
 278 
 279             case c_quoting_style:
 280               if (i + 2 < argsize && arg[i + 1] == '?')
 281                 switch (arg[i + 2])
 282                   {
 283                   case '!': case '\'':
 284                   case '(': case ')': case '-': case '/':
 285                   case '<': case '=': case '>':
 286                     /* Escape the second '?' in what would otherwise be
 287                        a trigraph.  */
 288                     c = arg[i + 2];
 289                     i += 2;
 290                     STORE ('?');
 291                     STORE ('\\');
 292                     STORE ('?');
 293                     break;
 294 
 295                   default:
 296                     break;
 297                   }
 298               break;
 299 
 300             default:
 301               break;
 302             }
 303           break;
 304 
 305         case '\a': esc = 'a'; goto c_escape;
 306         case '\b': esc = 'b'; goto c_escape;
 307         case '\f': esc = 'f'; goto c_escape;
 308         case '\n': esc = 'n'; goto c_and_shell_escape;
 309         case '\r': esc = 'r'; goto c_and_shell_escape;
 310         case '\t': esc = 't'; goto c_and_shell_escape;
 311         case '\v': esc = 'v'; goto c_escape;
 312         case '\\': esc = c; goto c_and_shell_escape;
 313 
 314         c_and_shell_escape:
 315           if (quoting_style == shell_quoting_style)
 316             goto use_shell_always_quoting_style;
 317         c_escape:
 318           if (backslash_escapes)
 319             {
 320               c = esc;
 321               goto store_escape;
 322             }
 323           break;
 324 
 325         case '{': case '}': /* sometimes special if isolated */
 326           if (! (argsize == SIZE_MAX ? arg[1] == '\0' : argsize == 1))
 327             break;
 328           /* Fall through.  */
 329         case '#': case '~':
 330           if (i != 0)
 331             break;
 332           /* Fall through.  */
 333         case ' ':
 334         case '!': /* special in bash */
 335         case '"': case '$': case '&':
 336         case '(': case ')': case '*': case ';':
 337         case '<':
 338         case '=': /* sometimes special in 0th or (with "set -k") later args */
 339         case '>': case '[':
 340         case '^': /* special in old /bin/sh, e.g. SunOS 4.1.4 */
 341         case '`': case '|':
 342           /* A shell special character.  In theory, '$' and '`' could
 343              be the first bytes of multibyte characters, which means
 344              we should check them with mbrtowc, but in practice this
 345              doesn't happen so it's not worth worrying about.  */
 346           if (quoting_style == shell_quoting_style)
 347             goto use_shell_always_quoting_style;
 348           break;
 349 
 350         case '\'':
 351           switch (quoting_style)
 352             {
 353             case shell_quoting_style:
 354               goto use_shell_always_quoting_style;
 355 
 356             case shell_always_quoting_style:
 357               STORE ('\'');
 358               STORE ('\\');
 359               STORE ('\'');
 360               break;
 361 
 362             default:
 363               break;
 364             }
 365           break;
 366 
 367         case '%': case '+': case ',': case '-': case '.': case '/':
 368         case '0': case '1': case '2': case '3': case '4': case '5':
 369         case '6': case '7': case '8': case '9': case ':':
 370         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 371         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 372         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 373         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 374         case 'Y': case 'Z': case ']': case '_': case 'a': case 'b':
 375         case 'c': case 'd': case 'e': case 'f': case 'g': case 'h':
 376         case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
 377         case 'o': case 'p': case 'q': case 'r': case 's': case 't':
 378         case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
 379           /* These characters don't cause problems, no matter what the
 380              quoting style is.  They cannot start multibyte sequences.  */
 381           break;
 382 
 383         default:
 384           /* If we have a multibyte sequence, copy it until we reach
 385              its end, find an error, or come back to the initial shift
 386              state.  For C-like styles, if the sequence has
 387              unprintable characters, escape the whole sequence, since
 388              we can't easily escape single characters within it.  */
 389           {
 390             /* Length of multibyte sequence found so far.  */
 391             size_t m;
 392 
 393             bool printable;
 394 
 395             if (unibyte_locale)
 396               {
 397                 m = 1;
 398                 printable = isprint (c) != 0;
 399               }
 400             else
 401               {
 402                 mbstate_t mbstate;
 403                 memset (&mbstate, 0, sizeof mbstate);
 404 
 405                 m = 0;
 406                 printable = true;
 407                 if (argsize == SIZE_MAX)
 408                   argsize = strlen (arg);
 409 
 410                 do
 411                   {
 412                     wchar_t w;
 413                     size_t bytes = mbrtowc (&w, &arg[i + m],
 414                                             argsize - (i + m), &mbstate);
 415                     if (bytes == 0)
 416                       break;
 417                     else if (bytes == (size_t) -1)
 418                       {
 419                         printable = false;
 420                         break;
 421                       }
 422                     else if (bytes == (size_t) -2)
 423                       {
 424                         printable = false;
 425                         while (i + m < argsize && arg[i + m])
 426                           m++;
 427                         break;
 428                       }
 429                     else
 430                       {
 431                         /* Work around a bug with older shells that "see" a '\'
 432                            that is really the 2nd byte of a multibyte character.
 433                            In practice the problem is limited to ASCII
 434                            chars >= '@' that are shell special chars.  */
 435                         if ('[' == 0x5b && quoting_style == shell_quoting_style)
 436                           {
 437                             size_t j;
 438                             for (j = 1; j < bytes; j++)
 439                               switch (arg[i + m + j])
 440                                 {
 441                                 case '[': case '\\': case '^':
 442                                 case '`': case '|':
 443                                   goto use_shell_always_quoting_style;
 444 
 445                                 default:
 446                                   break;
 447                                 }
 448                           }
 449 
 450                         if (! iswprint (w))
 451                           printable = false;
 452                         m += bytes;
 453                       }
 454                   }
 455                 while (! mbsinit (&mbstate));
 456               }
 457 
 458             if (1 < m || (backslash_escapes && ! printable))
 459               {
 460                 /* Output a multibyte sequence, or an escaped
 461                    unprintable unibyte character.  */
 462                 size_t ilim = i + m;
 463 
 464                 for (;;)
 465                   {
 466                     if (backslash_escapes && ! printable)
 467                       {
 468                         STORE ('\\');
 469                         STORE ('0' + (c >> 6));
 470                         STORE ('0' + ((c >> 3) & 7));
 471                         c = '0' + (c & 7);
 472                       }
 473                     if (ilim <= i + 1)
 474                       break;
 475                     STORE (c);
 476                     c = arg[++i];
 477                   }
 478 
 479                 goto store_c;
 480               }
 481           }
 482         }
 483 
 484       if (! (backslash_escapes
 485              && o->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS))))
 486         goto store_c;
 487 
 488     store_escape:
 489       STORE ('\\');
 490 
 491     store_c:
 492       STORE (c);
 493     }
 494 
 495   if (i == 0 && quoting_style == shell_quoting_style)
 496     goto use_shell_always_quoting_style;
 497 
 498   if (quote_string)
 499     for (; *quote_string; quote_string++)
 500       STORE (*quote_string);
 501 
 502   if (len < buffersize)
 503     buffer[len] = '\0';
 504   return len;
 505 
 506  use_shell_always_quoting_style:
 507   return quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
 508                                    shell_always_quoting_style, o);
 509 }
 510 
 511 /* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
 512    argument ARG (of size ARGSIZE), using O to control quoting.
 513    If O is null, use the default.
 514    Terminate the output with a null character, and return the written
 515    size of the output, not counting the terminating null.
 516    If BUFFERSIZE is too small to store the output string, return the
 517    value that would have been returned had BUFFERSIZE been large enough.
 518    If ARGSIZE is SIZE_MAX, use the string length of the argument for
 519    ARGSIZE.  */
 520 size_t
 521 quotearg_buffer (char *buffer, size_t buffersize,
 522                  char const *arg, size_t argsize,
 523                  struct quoting_options const *o)
 524 {
 525   struct quoting_options const *p = o ? o : &default_quoting_options;
 526   int e = errno;
 527   size_t r = quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
 528                                        p->style, p);
 529   errno = e;
 530   return r;
 531 }
 532 
 533 /* Like quotearg_buffer (..., ARG, ARGSIZE, O), except return newly
 534    allocated storage containing the quoted string.  */
 535 char *
 536 quotearg_alloc (char const *arg, size_t argsize,
 537                 struct quoting_options const *o)
 538 {
 539   int e = errno;
 540   size_t bufsize = quotearg_buffer (0, 0, arg, argsize, o) + 1;
 541   char *buf = xcharalloc (bufsize);
 542   quotearg_buffer (buf, bufsize, arg, argsize, o);
 543   errno = e;
 544   return buf;
 545 }
 546 
/* A storage slot with size and pointer to a value.  SIZE is the
   number of bytes available at VAL.  */
struct slotvec
{
  size_t size;
  char *val;
};

/* Preallocate a slot 0 buffer, so that the caller can always quote
   one small component of a "memory exhausted" message in slot 0.  */
static char slot0[256];
/* Number of entries in the vector that SLOTVEC points to.  */
static unsigned int nslots = 1;
/* The initial one-element slot vector, whose only slot is SLOT0.  */
static struct slotvec slotvec0 = {sizeof slot0, slot0};
/* The current slot vector: either &slotvec0 or a heap-allocated
   array installed by quotearg_n_options.  */
static struct slotvec *slotvec = &slotvec0;
 560 
 561 void
 562 quotearg_free (void)
 563 {
 564   struct slotvec *sv = slotvec;
 565   unsigned int i;
 566   for (i = 1; i < nslots; i++)
 567     free (sv[i].val);
 568   if (sv[0].val != slot0)
 569     {
 570       free (sv[0].val);
 571       slotvec0.size = sizeof slot0;
 572       slotvec0.val = slot0;
 573     }
 574   if (sv != &slotvec0)
 575     {
 576       free (sv);
 577       slotvec = &slotvec0;
 578     }
 579   nslots = 1;
 580 }
 581 
 582 /* Use storage slot N to return a quoted version of argument ARG.
 583    ARG is of size ARGSIZE, but if that is SIZE_MAX, ARG is a
 584    null-terminated string.
 585    OPTIONS specifies the quoting options.
 586    The returned value points to static storage that can be
 587    reused by the next call to this function with the same value of N.
 588    N must be nonnegative.  N is deliberately declared with type "int"
 589    to allow for future extensions (using negative values).  */
 590 static char *
 591 quotearg_n_options (int n, char const *arg, size_t argsize,
 592                     struct quoting_options const *options)
 593 {
 594   int e = errno;
 595 
 596   unsigned int n0 = n;
 597   struct slotvec *sv = slotvec;
 598 
 599   if (n < 0)
 600     abort ();
 601 
 602   if (nslots <= n0)
 603     {
 604       /* FIXME: technically, the type of n1 should be `unsigned int',
 605          but that evokes an unsuppressible warning from gcc-4.0.1 and
 606          older.  If gcc ever provides an option to suppress that warning,
 607          revert to the original type, so that the test in xalloc_oversized
 608          is once again performed only at compile time.  */
 609       size_t n1 = n0 + 1;
 610       bool preallocated = (sv == &slotvec0);
 611 
 612       if (xalloc_oversized (n1, sizeof *sv))
 613         xalloc_die ();
 614 
 615       slotvec = sv = xrealloc (preallocated ? NULL : sv, n1 * sizeof *sv);
 616       if (preallocated)
 617         *sv = slotvec0;
 618       memset (sv + nslots, 0, (n1 - nslots) * sizeof *sv);
 619       nslots = n1;
 620     }
 621 
 622   {
 623     size_t size = sv[n].size;
 624     char *val = sv[n].val;
 625     size_t qsize = quotearg_buffer (val, size, arg, argsize, options);
 626 
 627     if (size <= qsize)
 628       {
 629         sv[n].size = size = qsize + 1;
 630         if (val != slot0)
 631           free (val);
 632         sv[n].val = val = xcharalloc (size);
 633         quotearg_buffer (val, size, arg, argsize, options);
 634       }
 635 
 636     errno = e;
 637     return val;
 638   }
 639 }
 640 
 641 char *
 642 quotearg_n (int n, char const *arg)
 643 {
 644   return quotearg_n_options (n, arg, SIZE_MAX, &default_quoting_options);
 645 }
 646 
 647 char *
 648 quotearg (char const *arg)
 649 {
 650   return quotearg_n (0, arg);
 651 }
 652 
 653 /* Return quoting options for STYLE, with no extra quoting.  */
 654 static struct quoting_options
 655 quoting_options_from_style (enum quoting_style style)
 656 {
 657   struct quoting_options o;
 658   o.style = style;
 659   memset (o.quote_these_too, 0, sizeof o.quote_these_too);
 660   return o;
 661 }
 662 
 663 char *
 664 quotearg_n_style (int n, enum quoting_style s, char const *arg)
 665 {
 666   struct quoting_options const o = quoting_options_from_style (s);
 667   return quotearg_n_options (n, arg, SIZE_MAX, &o);
 668 }
 669 
 670 char *
 671 quotearg_n_style_mem (int n, enum quoting_style s,
 672                       char const *arg, size_t argsize)
 673 {
 674   struct quoting_options const o = quoting_options_from_style (s);
 675   return quotearg_n_options (n, arg, argsize, &o);
 676 }
 677 
 678 char *
 679 quotearg_style (enum quoting_style s, char const *arg)
 680 {
 681   return quotearg_n_style (0, s, arg);
 682 }
 683 
 684 char *
 685 quotearg_char (char const *arg, char ch)
 686 {
 687   struct quoting_options options;
 688   options = default_quoting_options;
 689   set_char_quoting (&options, ch, 1);
 690   return quotearg_n_options (0, arg, SIZE_MAX, &options);
 691 }
 692 
 693 char *
 694 quotearg_colon (char const *arg)
 695 {
 696   return quotearg_char (arg, ':');
 697 }