illumos-gate Wdiff usr/src/cmd/mandoc/preconv.c

Print this page

5051 import mdocml-1.12.3
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Approved by: TBD

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/mandoc/preconv.c
          +++ new/usr/src/cmd/mandoc/preconv.c
   1      -/*      $Id: preconv.c,v 1.5 2011/07/24 18:15:14 kristaps Exp $ */
        1 +/*      $Id: preconv.c,v 1.6 2013/06/02 03:52:21 schwarze Exp $ */
   2    2  /*
   3    3   * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   4    4   *
   5    5   * Permission to use, copy, modify, and distribute this software for any
   6    6   * purpose with or without fee is hereby granted, provided that the above
   7    7   * copyright notice and this permission notice appear in all copies.
   8    8   *
   9    9   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10   10   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11   11   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR

  12   12   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13   13   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  14   14   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  15   15   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16   16   */
  17   17  #ifdef HAVE_CONFIG_H
  18   18  #include "config.h"
  19   19  #endif
  20   20  
  21   21  #ifdef HAVE_MMAP
  22   22  #include <sys/stat.h>
  23   23  #include <sys/mman.h>
  24   24  #endif

↓ open down ↓

13 lines elided

↑ open up ↑

  25   25  
  26   26  #include <assert.h>
  27   27  #include <fcntl.h>
  28   28  #include <stdio.h>
  29   29  #include <stdlib.h>
  30   30  #include <string.h>
  31   31  #include <unistd.h>
  32   32  
  33   33  /* 
  34   34   * The read_whole_file() and resize_buf() functions are copied from
  35      - * read.c, including all dependency code (MAP_FILE, etc.).
       35 + * read.c, including all dependency code.
  36   36   */
  37   37  
  38      -#ifndef MAP_FILE
  39      -#define MAP_FILE        0
  40      -#endif
  41      -
  42   38  enum    enc {
                   /*
                    * Input encodings understood by this utility.  The values
                    * are used as indices into the encs[] table.  ENC__MAX is
                    * the number of real encodings; main() and cue_enc() also
                    * use it to mean "no (known) encoding selected".
                    */
  43   39          ENC_UTF_8, /* UTF-8 */
  44   40          ENC_US_ASCII, /* US-ASCII */
  45   41          ENC_LATIN_1, /* Latin-1 */
  46   42          ENC__MAX
  47   43  };
  48   44  
  49   45  struct  buf {
                   /*
                    * An input file held entirely in memory.  read_whole_file()
                    * fills in buf and sz; main() sets offs to 3 when the input
                    * starts with a UTF-8 byte-order mark.
                    */
  50   46          char             *buf; /* binary input buffer */
  51   47          size_t            sz; /* size of binary buffer */

  52   48          size_t            offs; /* starting buffer offset */
  53   49  };
  54   50  
  55   51  struct  encode {
                   /* Encoding name; the lookups in this file compare it case-insensitively. */
  56   52          const char       *name;
                   /* Converter for a whole buffer; main() treats a zero return as a bad encoding. */
  57   53          int             (*conv)(const struct buf *);
  58   54  };
  59   55  
  60   56  static  int      cue_enc(const struct buf *, size_t *, enum enc *);
  61   57  static  int      conv_latin_1(const struct buf *);
  62   58  static  int      conv_us_ascii(const struct buf *);
  63   59  static  int      conv_utf_8(const struct buf *);
  64   60  static  int      read_whole_file(const char *, int, 
  65   61                          struct buf *, int *);
  66   62  static  void     resize_buf(struct buf *, size_t);
  67   63  static  void     usage(void);
  68   64  
  69   65  static  const struct encode encs[ENC__MAX] = {
                   /*
                    * Entries must stay in enum enc order: the table is indexed
                    * directly with enum enc values.
                    */
  70   66          { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
  71   67          { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
  72   68          { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
  73   69  };
  74   70  
  75   71  static  const char       *progname;
  76   72  
  77   73  static void
  78   74  usage(void)
  79   75  {
  80   76  
  81   77          fprintf(stderr, "usage: %s "
  82   78                          "[-D enc] "
  83   79                          "[-e ENC] "
  84   80                          "[file]\n", progname);
  85   81  }
  86   82  
  87   83  static int
  88   84  conv_latin_1(const struct buf *b)
  89   85  {
  90   86          size_t           i;
  91   87          unsigned char    cu;
  92   88          const char      *cp;
  93   89  
  94   90          cp = b->buf + (int)b->offs;
  95   91  
  96   92          /*
  97   93           * Latin-1 falls into the first 256 code-points of Unicode, so
  98   94           * there's no need for any sort of translation.  Just make the
  99   95           * 8-bit characters use the Unicode escape.
 100   96           * Note that binary values 128 < v < 160 are passed through
 101   97           * unmodified to mandoc.
 102   98           */
 103   99  
 104  100          for (i = b->offs; i < b->sz; i++) {
 105  101                  cu = (unsigned char)*cp++;
 106  102                  cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
 107  103          }
 108  104  
 109  105          return(1);
 110  106  }
 111  107  
 112  108  static int
 113  109  conv_us_ascii(const struct buf *b)
 114  110  {
 115  111  
 116  112          /*
 117  113           * US-ASCII has no conversion since it falls into the first 128
 118  114           * bytes of Unicode.
 119  115           */
 120  116  
 121  117          fwrite(b->buf, 1, b->sz, stdout);
 122  118          return(1);
 123  119  }
 124  120  
 125  121  static int
 126  122  conv_utf_8(const struct buf *b)
 127  123  {
 128  124          int              state, be;
 129  125          unsigned int     accum;
 130  126          size_t           i;
 131  127          unsigned char    cu;
 132  128          const char      *cp;
 133  129          const long       one = 1L;
 134  130  
 135  131          cp = b->buf + (int)b->offs;
 136  132          state = 0;
 137  133          accum = 0U;
 138  134          be = 0;
 139  135  
 140  136          /* Quick test for big-endian value. */
 141  137  
 142  138          if ( ! (*((const char *)(&one))))
 143  139                  be = 1;
 144  140  
 145  141          for (i = b->offs; i < b->sz; i++) {
 146  142                  cu = (unsigned char)*cp++;
 147  143                  if (state) {
 148  144                          if ( ! (cu & 128) || (cu & 64)) {
 149  145                                  /* Bad sequence header. */
 150  146                                  return(0);
 151  147                          }
 152  148  
 153  149                          /* Accept only legitimate bit patterns. */
 154  150  
 155  151                          if (cu > 191 || cu < 128) {
 156  152                                  /* Bad in-sequence bits. */
 157  153                                  return(0);
 158  154                          }
 159  155  
 160  156                          accum |= (cu & 63) << --state * 6;
 161  157  
 162  158                          /*
 163  159                           * Accum is held in little-endian order as
 164  160                           * stipulated by the UTF-8 sequence coding.  We
 165  161                           * need to convert to a native big-endian if our
 166  162                           * architecture requires it.
 167  163                           */
 168  164  
 169  165                          if (0 == state && be) 
 170  166                                  accum = (accum >> 24) | 
 171  167                                          ((accum << 8) & 0x00FF0000) |
 172  168                                          ((accum >> 8) & 0x0000FF00) |
 173  169                                          (accum << 24);
 174  170  
 175  171                          if (0 == state) {
 176  172                                  accum < 128U ? putchar(accum) : 
 177  173                                          printf("\\[u%.4X]", accum);
 178  174                                  accum = 0U;
 179  175                          }
 180  176                  } else if (cu & (1 << 7)) {
 181  177                          /*
 182  178                           * Entering a UTF-8 state:  if we encounter a
 183  179                           * UTF-8 bitmask, calculate the expected UTF-8
 184  180                           * state from it.
 185  181                           */
 186  182                          for (state = 0; state < 7; state++) 
 187  183                                  if ( ! (cu & (1 << (7 - state))))
 188  184                                          break;
 189  185  
 190  186                          /* Accept only legitimate bit patterns. */
 191  187  
 192  188                          switch (state) {
 193  189                          case (4):
 194  190                                  if (cu <= 244 && cu >= 240) {
 195  191                                          accum = (cu & 7) << 18;
 196  192                                          break;
 197  193                                  }
 198  194                                  /* Bad 4-sequence start bits. */
 199  195                                  return(0);
 200  196                          case (3):
 201  197                                  if (cu <= 239 && cu >= 224) {
 202  198                                          accum = (cu & 15) << 12;
 203  199                                          break;
 204  200                                  }
 205  201                                  /* Bad 3-sequence start bits. */
 206  202                                  return(0);
 207  203                          case (2):
 208  204                                  if (cu <= 223 && cu >= 194) {
 209  205                                          accum = (cu & 31) << 6;
 210  206                                          break;
 211  207                                  }
 212  208                                  /* Bad 2-sequence start bits. */
 213  209                                  return(0);
 214  210                          default:
 215  211                                  /* Bad sequence bit mask. */
 216  212                                  return(0);
 217  213                          }
 218  214                          state--;
 219  215                  } else
 220  216                          putchar(cu);
 221  217          }
 222  218  
 223  219          if (0 != state) {
 224  220                  /* Bad trailing bits. */
 225  221                  return(0);
 226  222          }
 227  223  
 228  224          return(1);
 229  225  }
 230  226  
 231  227  static void
 232  228  resize_buf(struct buf *buf, size_t initial)
 233  229  {
 234  230  
 235  231          buf->sz = buf->sz > initial / 2 ? 
 236  232                  2 * buf->sz : initial;
 237  233  
 238  234          buf->buf = realloc(buf->buf, buf->sz);
 239  235          if (NULL == buf->buf) {
 240  236                  perror(NULL);
 241  237                  exit(EXIT_FAILURE);
 242  238          }
 243  239  }
 244  240  
 245  241  static int
 246  242  read_whole_file(const char *f, int fd, 
 247  243                  struct buf *fb, int *with_mmap)
 248  244  {
 249  245          size_t           off;
 250  246          ssize_t          ssz;
 251  247  
 252  248  #ifdef  HAVE_MMAP
 253  249          struct stat      st;
 254  250          if (-1 == fstat(fd, &st)) {
 255  251                  perror(f);
 256  252                  return(0);
 257  253          }
 258  254  
 259  255          /*
 260  256           * If we're a regular file, try just reading in the whole entry
 261  257           * via mmap().  This is faster than reading it into blocks, and
 262  258           * since each file is only a few bytes to begin with, I'm not
 263  259           * concerned that this is going to tank any machines.

↓ open down ↓

212 lines elided

↑ open up ↑

 264  260           */
 265  261  
 266  262          if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
 267  263                  fprintf(stderr, "%s: input too large\n", f);
 268  264                  return(0);
 269  265          } 
 270  266          
 271  267          if (S_ISREG(st.st_mode)) {
 272  268                  *with_mmap = 1;
 273  269                  fb->sz = (size_t)st.st_size;
 274      -                fb->buf = mmap(NULL, fb->sz, PROT_READ, 
 275      -                                MAP_FILE|MAP_SHARED, fd, 0);
      270 +                fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
 276  271                  if (fb->buf != MAP_FAILED)
 277  272                          return(1);
 278  273          }
 279  274  #endif
 280  275  
 281  276          /*
 282  277           * If this isn't a regular file (like, say, stdin), then we must
 283  278           * go the old way and just read things in bit by bit.
 284  279           */
 285  280

 286  281          *with_mmap = 0;
 287  282          off = 0;
 288  283          fb->sz = 0;
 289  284          fb->buf = NULL;
 290  285          for (;;) {
 291  286                  if (off == fb->sz && fb->sz == (1U << 31)) {
 292  287                          fprintf(stderr, "%s: input too large\n", f);
 293  288                          break;
 294  289                  } 
 295  290                  
 296  291                  if (off == fb->sz)
 297  292                          resize_buf(fb, 65536);
 298  293  
 299  294                  ssz = read(fd, fb->buf + (int)off, fb->sz - off);
 300  295                  if (ssz == 0) {
 301  296                          fb->sz = off;
 302  297                          return(1);
 303  298                  }
 304  299                  if (ssz == -1) {
 305  300                          perror(f);
 306  301                          break;
 307  302                  }
 308  303                  off += (size_t)ssz;
 309  304          }
 310  305  
 311  306          free(fb->buf);
 312  307          fb->buf = NULL;
 313  308          return(0);
 314  309  }
 315  310  
 316  311  static int
 317  312  cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
 318  313  {
 319  314          const char      *ln, *eoln, *eoph;
 320  315          size_t           sz, phsz, nsz;
 321  316          int              i;
 322  317  
 323  318          ln = b->buf + (int)*offs;
 324  319          sz = b->sz - *offs;
 325  320  
 326  321          /* Look for the end-of-line. */
 327  322  
 328  323          if (NULL == (eoln = memchr(ln, '\n', sz)))
 329  324                  return(-1);
 330  325  
 331  326          /* Set next-line marker. */
 332  327  
 333  328          *offs = (size_t)((eoln + 1) - b->buf);
 334  329  
 335  330          /* Check if we have the correct header/trailer. */
 336  331  
 337  332          if ((sz = (size_t)(eoln - ln)) < 10 || 
 338  333                          memcmp(ln, ".\\\" -*-", 7) ||
 339  334                          memcmp(eoln - 3, "-*-", 3))
 340  335                  return(0);
 341  336  
 342  337          /* Move after the header and adjust for the trailer. */
 343  338  
 344  339          ln += 7;
 345  340          sz -= 10;
 346  341  
 347  342          while (sz > 0) {
 348  343                  while (sz > 0 && ' ' == *ln) {
 349  344                          ln++;
 350  345                          sz--;
 351  346                  }
 352  347                  if (0 == sz)
 353  348                          break;
 354  349  
 355  350                  /* Find the end-of-phrase marker (or eoln). */
 356  351  
 357  352                  if (NULL == (eoph = memchr(ln, ';', sz)))
 358  353                          eoph = eoln - 3;
 359  354                  else
 360  355                          eoph++;
 361  356  
 362  357                  /* Only account for the "coding" phrase. */
 363  358  
 364  359                  if ((phsz = (size_t)(eoph - ln)) < 7 ||
 365  360                                  strncasecmp(ln, "coding:", 7)) {
 366  361                          sz -= phsz;
 367  362                          ln += phsz;
 368  363                          continue;
 369  364                  } 
 370  365  
 371  366                  sz -= 7;
 372  367                  ln += 7;
 373  368  
 374  369                  while (sz > 0 && ' ' == *ln) {
 375  370                          ln++;
 376  371                          sz--;
 377  372                  }
 378  373                  if (0 == sz)
 379  374                          break;
 380  375  
 381  376                  /* Check us against known encodings. */
 382  377  
 383  378                  for (i = 0; i < (int)ENC__MAX; i++) {
 384  379                          nsz = strlen(encs[i].name);
 385  380                          if (phsz < nsz)
 386  381                                  continue;
 387  382                          if (strncasecmp(ln, encs[i].name, nsz))
 388  383                                  continue;
 389  384  
 390  385                          *enc = (enum enc)i;
 391  386                          return(1);
 392  387                  }
 393  388  
 394  389                  /* Unknown encoding. */
 395  390  
 396  391                  *enc = ENC__MAX;
 397  392                  return(1);
 398  393          }
 399  394  
 400  395          return(0);
 401  396  }
 402  397  
 403  398  int
 404  399  main(int argc, char *argv[])
 405  400  {
 406  401          int              i, ch, map, fd, rc;
 407  402          struct buf       b;
 408  403          const char      *fn;
 409  404          enum enc         enc, def;
 410  405          unsigned char    bom[3] = { 0xEF, 0xBB, 0xBF };
 411  406          size_t           offs;
 412  407          extern int       optind;
 413  408          extern char     *optarg;
 414  409  
 415  410          progname = strrchr(argv[0], '/');
 416  411          if (progname == NULL)
 417  412                  progname = argv[0];
 418  413          else
 419  414                  ++progname;
 420  415  
 421  416          fn = "<stdin>";
 422  417          fd = STDIN_FILENO;
 423  418          rc = EXIT_FAILURE;
 424  419          enc = def = ENC__MAX;
 425  420          map = 0;
 426  421  
 427  422          memset(&b, 0, sizeof(struct buf));
 428  423  
 429  424          while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
 430  425                  switch (ch) {
 431  426                  case ('D'):
 432  427                          /* FALLTHROUGH */
 433  428                  case ('e'):
 434  429                          for (i = 0; i < (int)ENC__MAX; i++) {
 435  430                                  if (strcasecmp(optarg, encs[i].name))
 436  431                                          continue;
 437  432                                  break;
 438  433                          }
 439  434                          if (i < (int)ENC__MAX) {
 440  435                                  if ('D' == ch)
 441  436                                          def = (enum enc)i;
 442  437                                  else
 443  438                                          enc = (enum enc)i;
 444  439                                  break;
 445  440                          }
 446  441  
 447  442                          fprintf(stderr, "%s: Bad encoding\n", optarg);
 448  443                          return(EXIT_FAILURE);
 449  444                  case ('r'):
 450  445                          /* FALLTHROUGH */
 451  446                  case ('d'):
 452  447                          /* FALLTHROUGH */
 453  448                  case ('v'):
 454  449                          /* Compatibility with GNU preconv. */
 455  450                          break;
 456  451                  case ('h'):
 457  452                          /* Compatibility with GNU preconv. */
 458  453                          /* FALLTHROUGH */
 459  454                  default:
 460  455                          usage();
 461  456                          return(EXIT_FAILURE);
 462  457                  }
 463  458  
 464  459          argc -= optind;
 465  460          argv += optind;
 466  461          
 467  462          /* 
 468  463           * Open and read the first argument on the command-line.
 469  464           * If we don't have one, we default to stdin.
 470  465           */
 471  466  
 472  467          if (argc > 0) {
 473  468                  fn = *argv;
 474  469                  fd = open(fn, O_RDONLY, 0);
 475  470                  if (-1 == fd) {
 476  471                          perror(fn);
 477  472                          return(EXIT_FAILURE);
 478  473                  }
 479  474          }
 480  475  
 481  476          if ( ! read_whole_file(fn, fd, &b, &map))
 482  477                  goto out;
 483  478  
 484  479          /* Try to read the UTF-8 BOM. */
 485  480  
 486  481          if (ENC__MAX == enc)
 487  482                  if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
 488  483                          b.offs = 3;
 489  484                          enc = ENC_UTF_8;
 490  485                  }
 491  486  
 492  487          /* Try reading from the "-*-" cue. */
 493  488  
 494  489          if (ENC__MAX == enc) {
 495  490                  offs = b.offs;
 496  491                  ch = cue_enc(&b, &offs, &enc);
 497  492                  if (0 == ch)
 498  493                          ch = cue_enc(&b, &offs, &enc);
 499  494          }
 500  495  
 501  496          /*
 502  497           * No encoding has been detected.
 503  498           * Thus, we either fall into our default encoder, if specified,
 504  499           * or use Latin-1 if all else fails.
 505  500           */
 506  501  
 507  502          if (ENC__MAX == enc) 
 508  503                  enc = ENC__MAX == def ? ENC_LATIN_1 : def;
 509  504  
 510  505          if ( ! (*encs[(int)enc].conv)(&b)) {
 511  506                  fprintf(stderr, "%s: Bad encoding\n", fn);
 512  507                  goto out;
 513  508          }
 514  509  
 515  510          rc = EXIT_SUCCESS;
 516  511  out:
 517  512  #ifdef  HAVE_MMAP
 518  513          if (map)
 519  514                  munmap(b.buf, b.sz);
 520  515          else 
 521  516  #endif
 522  517                  free(b.buf);
 523  518  
 524  519          if (fd > STDIN_FILENO)
 525  520                  close(fd);
 526  521  
 527  522          return(rc);
 528  523  }

↓ open down ↓

243 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX