/*	$Id: preconv.c,v 1.5 2011/07/24 18:15:14 kristaps Exp $ */
/*
 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_MMAP
#include <sys/stat.h>
#include <sys/mman.h>
#endif

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>	/* strcasecmp(), strncasecmp() */
#include <unistd.h>

/*
 * The read_whole_file() and resize_buf() functions are copied from
 * read.c, including all dependency code (MAP_FILE, etc.).
 */

#ifndef MAP_FILE
#define	MAP_FILE	0
#endif

/* Supported input encodings; values index the encs[] table. */
enum	enc {
	ENC_UTF_8, /* UTF-8 */
	ENC_US_ASCII, /* US-ASCII */
	ENC_LATIN_1, /* Latin-1 */
	ENC__MAX
};

struct	buf {
	char		 *buf; /* binary input buffer */
	size_t		  sz; /* size of binary buffer */
	size_t		  offs; /* starting buffer offset */
};

/* One encoding: its user-visible name and its converter. */
struct	encode {
	const char	 *name;
	int		(*conv)(const struct buf *);
};

static	int	 cue_enc(const struct buf *, size_t *, enum enc *);
static	int	 conv_latin_1(const struct buf *);
static	int	 conv_us_ascii(const struct buf *);
static	int	 conv_utf_8(const struct buf *);
static	int	 read_whole_file(const char *, int,
			struct buf *, int *);
static	void	 resize_buf(struct buf *, size_t);
static	void	 usage(void);

static	const struct encode encs[ENC__MAX] = {
	{ "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
	{ "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
	{ "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
};

/* Basename of argv[0]; set in main() and used by usage(). */
static	const char	 *progname;

/*
 * Print the command-line synopsis to stderr.
 */
static void
usage(void)
{

	fprintf(stderr, "usage: %s "
			"[-D enc] "
			"[-e ENC] "
			"[file]\n", progname);
}

/*
 * Write the Latin-1 input in b, from b->offs up to b->sz, to stdout.
 * Always returns 1.
 */
static int
conv_latin_1(const struct buf *b)
{
	size_t		 i;
	unsigned char	 cu;

	/*
	 * Latin-1 falls into the first 256 code-points of Unicode, so
	 * there's no need for any sort of translation.  Just make the
	 * 8-bit characters use the Unicode escape.
	 * Note that binary values 128 <= v < 160 are passed through
	 * unmodified to mandoc.
	 */

	for (i = b->offs; i < b->sz; i++) {
		cu = (unsigned char)b->buf[i];
		if (cu < 160U)
			putchar(cu);
		else
			printf("\\[u%.4X]", (unsigned int)cu);
	}

	return(1);
}

/*
 * Write the US-ASCII input in b, from b->offs up to b->sz, to stdout
 * without modification.  Always returns 1.
 */
static int
conv_us_ascii(const struct buf *b)
{

	/*
	 * US-ASCII has no conversion since it falls into the first 128
	 * bytes of Unicode.
	 * Start at b->offs like the other converters do.
	 */

	fwrite(b->buf + b->offs, 1, b->sz - b->offs, stdout);
	return(1);
}

/*
 * Decode the UTF-8 input in b, from b->offs up to b->sz, and write it
 * to stdout: ASCII bytes are copied, every multi-byte sequence becomes
 * a "\[uXXXX]" escape.
 * Returns 1 on success or 0 as soon as a malformed sequence is found
 * (output written before that point is not withdrawn).
 */
static int
conv_utf_8(const struct buf *b)
{
	int		 state;	 /* continuation bytes still expected */
	unsigned int	 accum;	 /* code point being assembled */
	unsigned int	 minval; /* smallest value legal for this length */
	size_t		 i;
	unsigned char	 cu;

	state = 0;
	accum = 0U;
	minval = 0U;

	for (i = b->offs; i < b->sz; i++) {
		cu = (unsigned char)b->buf[i];

		if (state) {
			/* Continuation bytes must look like 10xxxxxx. */

			if (0x80 != (cu & 0xC0))
				return(0);

			/*
			 * The code point is assembled arithmetically,
			 * so the result is already in host byte order
			 * and must not be byte-swapped.
			 */

			accum |= (unsigned int)(cu & 63) << (--state * 6);
			if (state)
				continue;

			/* Reject overlong forms. */

			if (accum < minval)
				return(0);

			/* Reject UTF-16 surrogates and out-of-range. */

			if (accum >= 0xD800U && accum <= 0xDFFFU)
				return(0);
			if (accum > 0x10FFFFU)
				return(0);

			printf("\\[u%.4X]", accum);
			accum = 0U;
		} else if (cu & 0x80) {
			/*
			 * Entering a UTF-8 sequence: derive the number
			 * of continuation bytes from the lead byte and
			 * accept only legitimate lead bytes.
			 */

			if (cu >= 194 && cu <= 223) {
				state = 1;
				accum = (unsigned int)(cu & 31) << 6;
				minval = 0x80U;
			} else if (cu >= 224 && cu <= 239) {
				state = 2;
				accum = (unsigned int)(cu & 15) << 12;
				minval = 0x800U;
			} else if (cu >= 240 && cu <= 244) {
				state = 3;
				accum = (unsigned int)(cu & 7) << 18;
				minval = 0x10000U;
			} else {
				/* Stray continuation or bad lead byte. */
				return(0);
			}
		} else
			putchar(cu);
	}

	/* A sequence truncated by the end of input is an error. */

	return(0 == state);
}

/*
 * Grow buf: set the size to "initial" if the current size is at most
 * initial / 2, otherwise double it, then reallocate.
 * Prints a message and exits the program if reallocation fails.
 */
static void
resize_buf(struct buf *buf, size_t initial)
{

	if (buf->sz > initial / 2)
		buf->sz *= 2;
	else
		buf->sz = initial;

	buf->buf = realloc(buf->buf, buf->sz);
	if (NULL == buf->buf) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}
}

/*
 * Read all of descriptor fd into fb; f is the name used in messages.
 * On success, returns 1 with fb->buf and fb->sz set and *with_mmap
 * telling whether fb->buf is a mapping (1, release with munmap()) or
 * heap memory (0, release with free()).
 * On failure, prints a message and returns 0.
 */
static int
read_whole_file(const char *f, int fd,
		struct buf *fb, int *with_mmap)
{
	size_t		 off;
	ssize_t		 ssz;

#ifdef	HAVE_MMAP
	struct stat	 st;

	if (-1 == fstat(fd, &st)) {
		perror(f);
		return(0);
	}

	/*
	 * If we're a regular file, try just reading in the whole entry
	 * via mmap().  This is faster than reading it into blocks, and
	 * since each file is only a few bytes to begin with, I'm not
	 * concerned that this is going to tank any machines.
	 */

	if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
		fprintf(stderr, "%s: input too large\n", f);
		return(0);
	}

	if (S_ISREG(st.st_mode)) {
		*with_mmap = 1;
		fb->sz = (size_t)st.st_size;
		fb->buf = mmap(NULL, fb->sz, PROT_READ,
				MAP_FILE|MAP_SHARED, fd, 0);
		if (fb->buf != MAP_FAILED)
			return(1);
		/* Mapping failed: fall back to read(2) below. */
	}
#endif

	/*
	 * If this isn't a regular file (like, say, stdin), then we must
	 * go the old way and just read things in bit by bit.
	 */

	*with_mmap = 0;
	off = 0;
	fb->sz = 0;
	fb->buf = NULL;

	for (;;) {
		if (off == fb->sz) {
			if (fb->sz == (1U << 31)) {
				fprintf(stderr, "%s: input too large\n", f);
				break;
			}
			resize_buf(fb, 65536);
		}

		ssz = read(fd, fb->buf + off, fb->sz - off);
		if (0 == ssz) {
			/* End of file: record the real length. */
			fb->sz = off;
			return(1);
		}
		if (-1 == ssz) {
			perror(f);
			break;
		}
		off += (size_t)ssz;
	}

	free(fb->buf);
	fb->buf = NULL;
	return(0);
}

/*
 * Examine the line of b starting at *offs for an Emacs-style cue of
 * the form `.\" -*- ...; coding: NAME; ... -*-'.
 * Returns -1 if no newline follows *offs (and leaves *offs alone).
 * Otherwise advances *offs past the newline and returns 1 if a
 * "coding:" phrase with a value was found, in which case *enc is the
 * matching encoding or ENC__MAX if the name is unknown; returns 0 if
 * the line holds no such phrase.
 */
static int
cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
{
	const char	*ln, *eoln, *eoph;
	size_t		 sz, phsz, nsz;
	int		 i;

	ln = b->buf + *offs;
	sz = b->sz - *offs;

	/* Look for the end-of-line. */

	if (NULL == (eoln = memchr(ln, '\n', sz)))
		return(-1);

	/* Set next-line marker. */

	*offs = (size_t)((eoln + 1) - b->buf);

	/* Check if we have the correct header/trailer. */

	if ((sz = (size_t)(eoln - ln)) < 10 ||
			memcmp(ln, ".\\\" -*-", 7) ||
			memcmp(eoln - 3, "-*-", 3))
		return(0);

	/* Move after the header and adjust for the trailer. */

	ln += 7;
	sz -= 10;

	while (sz > 0) {
		while (sz > 0 && ' ' == *ln) {
			ln++;
			sz--;
		}
		if (0 == sz)
			break;

		/* Find the end-of-phrase marker (or eoln). */

		if (NULL == (eoph = memchr(ln, ';', sz)))
			eoph = eoln - 3;
		else
			eoph++;

		/* Only account for the "coding" phrase. */

		if ((phsz = (size_t)(eoph - ln)) < 7 ||
				strncasecmp(ln, "coding:", 7)) {
			sz -= phsz;
			ln += phsz;
			continue;
		}

		sz -= 7;
		ln += 7;

		while (sz > 0 && ' ' == *ln) {
			ln++;
			sz--;
		}
		if (0 == sz)
			break;

		/*
		 * Only what is left of the phrase may be compared
		 * against encoding names; otherwise the comparison
		 * could run into the trailer.
		 */

		phsz = (size_t)(eoph - ln);

		/* Check us against known encodings. */

		for (i = 0; i < (int)ENC__MAX; i++) {
			nsz = strlen(encs[i].name);
			if (phsz < nsz)
				continue;
			if (strncasecmp(ln, encs[i].name, nsz))
				continue;

			*enc = (enum enc)i;
			return(1);
		}

		/* Unknown encoding. */

		*enc = ENC__MAX;
		return(1);
	}

	return(0);
}

/*
 * Read the file named on the command line (or stdin), pick an input
 * encoding and write the converted text to stdout.
 * The encoding is taken, in order of preference, from -e, from a
 * UTF-8 byte-order mark, from a "coding:" cue on the first or second
 * line, from -D, and finally defaults to Latin-1.
 */
int
main(int argc, char *argv[])
{
	int		 i, ch, map, fd, rc;
	struct buf	 b;
	const char	*fn;
	enum enc	 enc, def;
	static const unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
	size_t		 offs;
	extern int	 optind;
	extern char	*optarg;

	progname = strrchr(argv[0], '/');
	if (progname == NULL)
		progname = argv[0];
	else
		++progname;

	fn = "<stdin>";
	fd = STDIN_FILENO;
	rc = EXIT_FAILURE;
	enc = def = ENC__MAX;
	map = 0;

	memset(&b, 0, sizeof(struct buf));

	while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
		switch (ch) {
		case ('D'):
			/* FALLTHROUGH */
		case ('e'):
			for (i = 0; i < (int)ENC__MAX; i++)
				if (0 == strcasecmp(optarg, encs[i].name))
					break;

			if (i < (int)ENC__MAX) {
				if ('D' == ch)
					def = (enum enc)i;
				else
					enc = (enum enc)i;
				break;
			}

			fprintf(stderr, "%s: Bad encoding\n", optarg);
			return(EXIT_FAILURE);
		case ('r'):
			/* FALLTHROUGH */
		case ('d'):
			/* FALLTHROUGH */
		case ('v'):
			/* Compatibility with GNU preconv. */
			break;
		case ('h'):
			/* Compatibility with GNU preconv. */
			/* FALLTHROUGH */
		default:
			usage();
			return(EXIT_FAILURE);
		}

	argc -= optind;
	argv += optind;

	/*
	 * Open and read the first argument on the command-line.
	 * If we don't have one, we default to stdin.
	 */

	if (argc > 0) {
		fn = *argv;
		fd = open(fn, O_RDONLY, 0);
		if (-1 == fd) {
			perror(fn);
			return(EXIT_FAILURE);
		}
	}

	if ( ! read_whole_file(fn, fd, &b, &map))
		goto out;

	/*
	 * Try to read the UTF-8 BOM.
	 * A file consisting of nothing but the BOM counts, too.
	 */

	if (ENC__MAX == enc && b.sz >= 3 &&
			0 == memcmp(b.buf, bom, 3)) {
		b.offs = 3;
		enc = ENC_UTF_8;
	}

	/* Try reading from the "-*-" cue on the first two lines. */

	if (ENC__MAX == enc) {
		offs = b.offs;
		if (0 == cue_enc(&b, &offs, &enc))
			(void)cue_enc(&b, &offs, &enc);
	}

	/*
	 * No encoding has been detected.
	 * Thus, we either fall into our default encoder, if specified,
	 * or use Latin-1 if all else fails.
	 */

	if (ENC__MAX == enc)
		enc = ENC__MAX == def ? ENC_LATIN_1 : def;

	if ( ! (*encs[(int)enc].conv)(&b)) {
		fprintf(stderr, "%s: Bad encoding\n", fn);
		goto out;
	}

	rc = EXIT_SUCCESS;
out:
#ifdef	HAVE_MMAP
	if (map)
		munmap(b.buf, b.sz);
	else
#endif
		free(b.buf);

	if (fd > STDIN_FILENO)
		close(fd);

	return(rc);
}