/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * lofi (loopback file) driver - allows you to attach a file to a device,
 * which can then be accessed through that device. The simple model is that
 * you tell lofi to open a file, and then use the block device you get as
 * you would any block device. lofi translates access to the block device
 * into I/O on the underlying file. This is mostly useful for
 * mounting images of filesystems.
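 *
 * For example, a typical session might look like this (illustrative
 * only; see lofiadm(1M) for the full set of options):
 *
 *	# lofiadm -a /export/images/os.iso
 *	/dev/lofi/1
 *	# mount -F hsfs -o ro /dev/lofi/1 /mnt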
 *
 * lofi is controlled through /dev/lofictl - this is the only device exported
 * during attach, and is minor number 0. lofiadm communicates with lofi through
 * ioctls on this device. When a file is attached to lofi, block and character
 * devices are exported in /dev/lofi and /dev/rlofi. Currently, these devices
 * are identified by their minor number, and the minor number is also used
 * as the name in /dev/lofi. If we ever decide to support virtual disks,
 * we'll have to divide the minor number space to identify fdisk partitions
 * and slices, and the name will then be the minor number shifted down a
 * few bits. Minor devices are tracked with state structures handled with
 * ddi_soft_state(9F) for simplicity.
 *
 * A file attached to lofi is opened when attached and not closed until
 * explicitly detached from lofi. This seems more sensible than deferring
 * the open until the /dev/lofi device is opened, for a number of reasons.
 * One is that any failure is likely to be noticed by the person (or script)
 * running lofiadm. Another is that it would be a security problem if the
 * file was replaced by another one after being added but before being opened.
 *
 * The only hard part about lofi is the ioctls. In order to support things
 * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
 * So it has to fake disk geometry and partition information. More may need
 * to be faked if your favorite utility doesn't work and you think it should
 * (fdformat doesn't work because it really wants to know the type of floppy
 * controller to talk to, and that didn't seem easy to fake. Or possibly even
 * necessary, since we have mkfs_pcfs now).
 *
 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To
 * support simulation of hotplug events, an optional force flag is provided.
 * If a lofi device is open when a force detach is requested, then the
 * underlying file is closed and any subsequent operations return EIO. When the
 * device is closed for the last time, it will be cleaned up at that time. In
 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
 * detached but not removed.
 *
 * Known problems:
 *
 *	UFS logging. Mounting a UFS filesystem image with "logging" enabled
 *	works for basic copy testing but wedges during a build of ON through
 *	that image. Some deadlock in lufs holding the log mutex and then
 *	getting stuck on a buf. So for now, don't do that.
 *
 *	Direct I/O. Since the filesystem data is being cached in the buffer
 *	cache, _and_ again in the underlying filesystem, it's tempting to
 *	enable direct I/O on the underlying file. Don't, because that deadlocks.
 *	I think to fix the cache-twice problem we might need filesystem support.
 *
 * Interesting things to do:
 *
 *	Allow multiple files for each device. A poor-man's metadisk, basically.
 *
 *	Pass-through ioctls on block devices. You can (though it's not
 *	documented) give lofi a block device as a file name. Then we shouldn't
 *	need to fake a geometry. However, it may be relevant if you're
 *	replacing metadisk, or using lofi to get crypto.
 *	It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
 *	and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
 *	In fact this even makes sense if you have lofi "above" metadisk.
 *
 * Encryption:
 *	Each lofi device can have its own symmetric key and cipher.
 *	They are passed to us by lofiadm(1M) in the correct format for use
 *	with the misc/kcf crypto_* routines.
 *
 *	Each block has its own IV, that is calculated in lofi_blk_mech(), based
 *	on the "master" key held in the lsp and the block number of the buffer.
 */

#include <sys/types.h>
#include <netinet/in.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/aio_req.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/vnode.h>
#include <sys/lofi.h>
#include <sys/fcntl.h>
#include <sys/pathname.h>
#include <sys/filio.h>
#include <sys/fdio.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <vm/seg_map.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/zmod.h>
#include <sys/id_space.h>
#include <sys/mkdev.h>
#include <sys/crypto/common.h>
#include <sys/crypto/api.h>
#include <sys/rctl.h>
#include <LzmaDec.h>

/*
 * The basis for CRYOFF is derived from usr/src/uts/common/sys/fs/ufs_fs.h.
 * Crypto metadata, if it exists, is located at the end of the boot block
 * (BBOFF + BBSIZE, which is SBOFF). The super block and everything after
 * is offset by the size of the crypto metadata which is handled by
 * lsp->ls_crypto_offset.
 */
#define	CRYOFF	((off_t)8192)

#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME		"Size"
#define	ZONE_PROP_NAME		"zone"

#define	SETUP_C_DATA(cd, buf, len)		\
	(cd).cd_format = CRYPTO_DATA_RAW;	\
	(cd).cd_offset = 0;			\
	(cd).cd_miscdata = NULL;		\
	(cd).cd_length = (len);			\
	(cd).cd_raw.iov_base = (buf);		\
	(cd).cd_raw.iov_len = (len);

#define	UIO_CHECK(uio)	\
	if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
	    ((uio)->uio_resid % DEV_BSIZE) != 0) { \
		return (EINVAL); \
	}

static dev_info_t *lofi_dip = NULL;
static void *lofi_statep = NULL;
static kmutex_t lofi_lock;		/* state lock */
static id_space_t *lofi_minor_id;
static list_t lofi_list;
static zone_key_t lofi_zone_key;

/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high. If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size. We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */
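
/*
 * For illustration: with the assumed 100 MB/s (104857600 bytes/s) of
 * throughput and the DEV_BSIZE (512 byte) minimum I/O size, maxalloc
 * works out to 204800 outstanding tasks.
 */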

const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;

/*
 * To avoid decompressing data in a compressed segment multiple times
 * when accessing small parts of a segment's data, we cache and reuse
 * the uncompressed segment's data.
 *
 * A single cached segment is sufficient to avoid lots of duplicate
 * segment decompress operations. A small cache size also reduces the
 * memory footprint.
 *
 * lofi_max_comp_cache is the maximum number of decompressed data segments
 * cached for each compressed lofi image. It can be set to 0 to disable
 * caching.
 */

uint32_t lofi_max_comp_cache = 1;

static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
	size_t *dstlen, int level);

lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"},
	{lzma_decompress,	NULL,	0,	"lzma"}
};

/*ARGSUSED*/
static void
*SzAlloc(void *p, size_t size)
{
	return (kmem_alloc(size, KM_SLEEP));
}

/*ARGSUSED*/
static void
SzFree(void *p, void *address, size_t size)
{
	kmem_free(address, size);
}

static ISzAlloc g_Alloc = { SzAlloc, SzFree };

/*
 * Free data referenced by the linked list of cached uncompressed
 * segments.
 */
static void
lofi_free_comp_cache(struct lofi_state *lsp)
{
	struct lofi_comp_cache *lc;

	while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}
	ASSERT(lsp->ls_comp_cache_count == 0);
}

static int
is_opened(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	return (lsp->ls_chr_open || lsp->ls_blk_open || lsp->ls_lyr_open_count);
}

static int
mark_opened(struct lofi_state *lsp, int otyp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	switch (otyp) {
	case OTYP_CHR:
		lsp->ls_chr_open = 1;
		break;
	case OTYP_BLK:
		lsp->ls_blk_open = 1;
		break;
	case OTYP_LYR:
		lsp->ls_lyr_open_count++;
		break;
	default:
		return (-1);
	}
	return (0);
}

static void
mark_closed(struct lofi_state *lsp, int otyp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	switch (otyp) {
	case OTYP_CHR:
		lsp->ls_chr_open = 0;
		break;
	case OTYP_BLK:
		lsp->ls_blk_open = 0;
		break;
	case OTYP_LYR:
		lsp->ls_lyr_open_count--;
		break;
	default:
		break;
	}
}

static void
lofi_free_crypto(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_crypto_enabled) {
		/*
		 * Clean up the crypto state so that it doesn't hang around
		 * in memory after we are done with it.
		 */
		if (lsp->ls_key.ck_data != NULL) {
			bzero(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			kmem_free(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			lsp->ls_key.ck_data = NULL;
			lsp->ls_key.ck_length = 0;
		}

		if (lsp->ls_mech.cm_param != NULL) {
			kmem_free(lsp->ls_mech.cm_param,
			    lsp->ls_mech.cm_param_len);
			lsp->ls_mech.cm_param = NULL;
			lsp->ls_mech.cm_param_len = 0;
		}

		if (lsp->ls_iv_mech.cm_param != NULL) {
			kmem_free(lsp->ls_iv_mech.cm_param,
			    lsp->ls_iv_mech.cm_param_len);
			lsp->ls_iv_mech.cm_param = NULL;
			lsp->ls_iv_mech.cm_param_len = 0;
		}

		mutex_destroy(&lsp->ls_crypto_lock);
	}
}

static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
	minor_t minor = getminor(lsp->ls_dev);
	int i;

	ASSERT(MUTEX_HELD(&lofi_lock));

	list_remove(&lofi_list, lsp);

	lofi_free_crypto(lsp);

	/*
	 * Free pre-allocated compressed buffers
	 */
	if (lsp->ls_comp_bufs != NULL) {
		for (i = 0; i < lofi_taskq_nthreads; i++) {
			if (lsp->ls_comp_bufs[i].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[i].buf,
				    lsp->ls_comp_bufs[i].bufsize);
		}
		kmem_free(lsp->ls_comp_bufs,
		    sizeof (struct compbuf) * lofi_taskq_nthreads);
	}

	(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
	    1, 0, credp, NULL);
	VN_RELE(lsp->ls_vp);
	if (lsp->ls_stacked_vp != lsp->ls_vp)
		VN_RELE(lsp->ls_stacked_vp);

	taskq_destroy(lsp->ls_taskq);

	if (lsp->ls_kstat != NULL)
		kstat_delete(lsp->ls_kstat);

	/*
	 * Free cached decompressed segment data
	 */
	lofi_free_comp_cache(lsp);
	list_destroy(&lsp->ls_comp_cache);

	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}

	rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);

	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);

	ASSERT(ddi_get_soft_state(lofi_statep, minor) == lsp);
	ddi_soft_state_free(lofi_statep, minor);
	id_free(lofi_minor_id, minor);
}

static void
lofi_free_dev(dev_t dev)
{
	minor_t minor = getminor(dev);
	char namebuf[50];

	ASSERT(MUTEX_HELD(&lofi_lock));

	(void) ddi_prop_remove(dev, lofi_dip, ZONE_PROP_NAME);
	(void) ddi_prop_remove(dev, lofi_dip, SIZE_PROP_NAME);
	(void) ddi_prop_remove(dev, lofi_dip, NBLOCKS_PROP_NAME);

	(void) snprintf(namebuf, sizeof (namebuf), "%d", minor);
	ddi_remove_minor_node(lofi_dip, namebuf);
	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor);
	ddi_remove_minor_node(lofi_dip, namebuf);
}

/*ARGSUSED*/
static void
lofi_zone_shutdown(zoneid_t zoneid, void *arg)
{
	struct lofi_state *lsp;
	struct lofi_state *next;

	mutex_enter(&lofi_lock);

	for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {

		/* lofi_destroy() frees lsp */
		next = list_next(&lofi_list, lsp);

		if (lsp->ls_zone.zref_zone->zone_id != zoneid)
			continue;

		/*
		 * No in-zone processes are running, but something has this
		 * open. It's either a global zone process, or a lofi
		 * mount. In either case we set ls_cleanup so the last
		 * user destroys the device.
		 */
		if (is_opened(lsp)) {
			lsp->ls_cleanup = 1;
		} else {
			lofi_free_dev(lsp->ls_dev);
			lofi_destroy(lsp, kcred);
		}
	}

	mutex_exit(&lofi_lock);
}

/*ARGSUSED*/
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
{
	minor_t	minor;
	struct lofi_state *lsp;

	/*
	 * lofiadm -a /dev/lofi/1 gets us here.
	 */
	if (mutex_owner(&lofi_lock) == curthread)
		return (EINVAL);

	mutex_enter(&lofi_lock);

	minor = getminor(*devp);

	/* master control device */
	if (minor == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, minor);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (lsp->ls_vp == NULL) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (mark_opened(lsp, otyp) == -1) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (lsp->ls_readonly && (flag & FWRITE)) {
		mutex_exit(&lofi_lock);
		return (EROFS);
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*ARGSUSED*/
static int
lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
{
	minor_t	minor;
	struct lofi_state *lsp;

	mutex_enter(&lofi_lock);
	minor = getminor(dev);
	lsp = ddi_get_soft_state(lofi_statep, minor);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (minor == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	mark_closed(lsp, otyp);

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
	 */
	if (!is_opened(lsp) && (lsp->ls_cleanup || lsp->ls_vp == NULL)) {
		lofi_free_dev(lsp->ls_dev);
		lofi_destroy(lsp, credp);
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * Sets the mechanism's initialization vector (IV) if one is needed.
 * The IV is computed from the data block number.  lsp->ls_mech is
 * altered so that:
 *	lsp->ls_mech.cm_param_len is set to the IV len.
 *	lsp->ls_mech.cm_param is set to the IV.
 */
static int
lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
{
	int	ret;
	crypto_data_t cdata;
	char	*iv;
	size_t	iv_len;
	size_t	min;
	void	*data;
	size_t	datasz;

	ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));

	if (lsp == NULL)
		return (CRYPTO_DEVICE_ERROR);

	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
	if (lsp->ls_iv_type == IVM_NONE) {
		return (CRYPTO_SUCCESS);
	}

	/*
	 * if kmem already alloced from previous call and it's the same size
	 * we need now, just recycle it; allocate new kmem only if we have to
	 */
	if (lsp->ls_mech.cm_param == NULL ||
	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
		iv_len = lsp->ls_iv_len;
		iv = kmem_zalloc(iv_len, KM_SLEEP);
	} else {
		iv_len = lsp->ls_mech.cm_param_len;
		iv = lsp->ls_mech.cm_param;
		bzero(iv, iv_len);
	}

	switch (lsp->ls_iv_type) {
	case IVM_ENC_BLKNO:
		/* iv is not static, lblkno changes each time */
		data = &lblkno;
		datasz = sizeof (lblkno);
		break;
	default:
		data = 0;
		datasz = 0;
		break;
	}

	/*
	 * write blkno into the iv buffer padded on the left in case
	 * blkno ever grows bigger than its current longlong_t size
	 * or a variation other than blkno is used for the iv data
	 */
	min = MIN(datasz, iv_len);
	bcopy(data, iv + (iv_len - min), min);

	/* encrypt the data in-place to get the IV */
	SETUP_C_DATA(cdata, iv, iv_len);

	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
	    NULL, NULL, NULL);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
		    lblkno, ret);
		if (lsp->ls_mech.cm_param != iv)
			kmem_free(iv, iv_len);

		return (ret);
	}

	/* clean up the iv from the last computation */
	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
		kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);

	lsp->ls_mech.cm_param_len = iv_len;
	lsp->ls_mech.cm_param = iv;

	return (CRYPTO_SUCCESS);
}
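
/*
 * Worked example (illustrative): with IVM_ENC_BLKNO, a 16-byte IV and
 * lblkno == 5, the eight bytes of the (native-endian) block number are
 * copied into the last eight bytes of a zeroed 16-byte buffer, and that
 * buffer is encrypted with the "master" key; the resulting ciphertext is
 * then used as the IV for encrypting/decrypting the data block itself.
 */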

/*
 * Performs encryption and decryption of a chunk of data of size "len",
 * one DEV_BSIZE block at a time. "len" is assumed to be a multiple of
 * DEV_BSIZE.
 */
static int
lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
    caddr_t ciphertext, size_t len, boolean_t op_encrypt)
{
	crypto_data_t cdata;
	crypto_data_t wdata;
	int ret;
	longlong_t lblkno = bp->b_lblkno;

	mutex_enter(&lsp->ls_crypto_lock);

	/*
	 * though we could encrypt/decrypt the entire "len" chunk of data, we
	 * need to break it into DEV_BSIZE pieces to capture blkno incrementing
	 */
	SETUP_C_DATA(cdata, plaintext, len);
	cdata.cd_length = DEV_BSIZE;
	if (ciphertext != NULL) {	/* not in-place crypto */
		SETUP_C_DATA(wdata, ciphertext, len);
		wdata.cd_length = DEV_BSIZE;
	}

	do {
		ret = lofi_blk_mech(lsp, lblkno);
		if (ret != CRYPTO_SUCCESS)
			continue;

		if (op_encrypt) {
			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		} else {
			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		}

		cdata.cd_offset += DEV_BSIZE;
		if (ciphertext != NULL)
			wdata.cd_offset += DEV_BSIZE;
		lblkno++;
	} while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);

	mutex_exit(&lsp->ls_crypto_lock);

	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)",
		    op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
		    lblkno, ret);
	}

	return (ret);
}
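
/*
 * For illustration: an 8K transfer starting at b_lblkno == 32 is handled
 * above as 16 independent DEV_BSIZE (512 byte) crypto operations, for
 * blocks 32 through 47, each with its own IV from lofi_blk_mech().
 */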

#define	RDWR_RAW	1
#define	RDWR_BCOPY	2

static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
{
	ssize_t resid;
	int isread;
	int error;

	/*
	 * Handles reads/writes for both plain and encrypted lofi
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */

	isread = bp->b_flags & B_READ;
	if (isread) {
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(bcopy_locn, bufaddr, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled && error == 0) {
			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
			    B_FALSE) != CRYPTO_SUCCESS) {
				/*
				 * XXX: original code didn't set residual
				 * back to len because no error was expected
				 * from bcopy() if encryption is not enabled
				 */
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				error = EIO;
			}
		}
		return (error);
	} else {
		void *iobuf = bufaddr;

		if (lsp->ls_crypto_enabled) {
			/* don't do in-place crypto to keep bufaddr intact */
			iobuf = kmem_alloc(len, KM_SLEEP);
			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
			    B_TRUE) != CRYPTO_SUCCESS) {
				kmem_free(iobuf, len);
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				return (EIO);
			}
		}
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(iobuf, bcopy_locn, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled) {
			kmem_free(iobuf, len);
		}
		return (error);
	}
}

static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;
	int	save_error;

	/*
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |	   len
	 *    v    v	    v
	 * ===|====X========|====...======|========X====|====
	 *	   /-------------...---------------/
	 *		^ bp->b_bcount/bp->b_resid at start
	 *	  /----/--------/----...------/--------/
	 *	  ^	^	^   ^		^
	 *	  |	|	|   |		nth xfersize (<= MAXBSIZE)
	 *	  |	|	2nd thru n-1st xfersize (= MAXBSIZE)
	 *	  |	1st xfersize (<= MAXBSIZE)
	 *	  mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary.  "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/* error may be non-zero for encrypted lofi */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		save_error = segmap_release(segkmap, mapaddr, smflags);
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}
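
/*
 * Worked example (illustrative, assuming MAXBSIZE == 8K and a large
 * file): for offset == 10K and b_bcount == 12K, the first pass maps the
 * MAXBSIZE chunk at alignedoffset == 8K and copies xfersize == 6K
 * starting at mapoffset == 2K; the second pass maps the chunk at 16K
 * and copies the remaining 6K from mapoffset == 0.
 */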

/*
 * Check if segment seg_index is present in the decompressed segment
 * data cache.
 *
 * Returns a pointer to the decompressed segment data cache entry if
 * found, and NULL when decompressed data for this segment is not yet
 * cached.
 */
static struct lofi_comp_cache *
lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
	    lc = list_next(&lsp->ls_comp_cache, lc)) {
		if (lc->lc_index == seg_index) {
			/*
			 * Decompressed segment data was found in the
			 * cache.
			 *
			 * The cache uses an LRU replacement strategy;
			 * move the entry to head of list.
			 */
			list_remove(&lsp->ls_comp_cache, lc);
			list_insert_head(&lsp->ls_comp_cache, lc);
			return (lc);
		}
	}
	return (NULL);
}

/*
 * Add the data for a decompressed segment at segment index
 * seg_index to the cache of the decompressed segments.
 *
 * Returns a pointer to the cache element structure in case
 * the data was added to the cache; returns NULL when the data
 * wasn't cached.
 */
static struct lofi_comp_cache *
lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
    uchar_t *data)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		ASSERT(lc != NULL);
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}

	/*
	 * Do not cache when disabled by tunable variable
	 */
	if (lofi_max_comp_cache == 0)
		return (NULL);

	/*
	 * When the cache has not yet reached the maximum allowed
	 * number of segments, allocate a new cache element.
	 * Otherwise the cache is full; reuse the last list element
	 * (LRU) for caching the decompressed segment data.
	 *
	 * The cache element for the new decompressed segment data is
	 * added to the head of the list.
	 */
	if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
		lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
		lc->lc_data = NULL;
		list_insert_head(&lsp->ls_comp_cache, lc);
		lsp->ls_comp_cache_count++;
	} else {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		if (lc == NULL)
			return (NULL);
		list_insert_head(&lsp->ls_comp_cache, lc);
	}

	/*
	 * Free old uncompressed segment data when reusing a cache
	 * entry.
	 */
	if (lc->lc_data != NULL)
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);

	lc->lc_data = data;
	lc->lc_index = seg_index;
	return (lc);
}


/*ARGSUSED*/
static int
gzip_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	ASSERT(*dstlen >= srclen);

	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
		return (-1);
	return (0);
}

#define	LZMA_HEADER_SIZE	(LZMA_PROPS_SIZE + 8)
/*ARGSUSED*/
static int
lzma_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	size_t insizepure;
	void *actual_src;
	ELzmaStatus status;

	insizepure = srclen - LZMA_HEADER_SIZE;
	actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);

	if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
	    (const Byte *)actual_src, &insizepure,
	    (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
	    &g_Alloc) != SZ_OK) {
		return (-1);
	}
	return (0);
}
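
/*
 * Note: both decompress routines follow the contract assumed by their
 * caller, lofi_strategy_task() -- *dstlen holds the size of the output
 * buffer on entry and the number of bytes actually produced on return,
 * and any nonzero result is mapped to EIO by the caller.
 */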

/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 */
static void
lofi_strategy_task(void *arg)
{
	struct buf *bp = (struct buf *)arg;
	int error;
	int syncflag = 0;
	struct lofi_state *lsp;
	offset_t offset;
	caddr_t	bufaddr;
	size_t	len;
	size_t	xfersize;
	boolean_t bufinited = B_FALSE;

	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
	if (lsp == NULL) {
		error = ENXIO;
		goto errout;
	}
	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp_mapin(bp);
	bufaddr = bp->b_un.b_addr;
	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}
	len = bp->b_bcount;
	bufinited = B_TRUE;

	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		error = EIO;
		goto errout;
	}

	/*
	 * If we're writing and the buffer was not B_ASYNC
	 * we'll follow up with a VOP_FSYNC() to force any
	 * asynchronous I/O to stable storage.
	 */
	if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
		syncflag = FSYNC;

	/*
	 * We used to always use vn_rdwr here, but we cannot do that because
	 * we might decide to read or write from the underlying
	 * file during this call, which would be a deadlock because
	 * we have the rw_lock. So instead we page, unless it's not
	 * mappable or it's a character device or it's an encrypted lofi.
	 */
	if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
	    lsp->ls_crypto_enabled) {
		error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
		    NULL);
	} else if (lsp->ls_uncomp_seg_sz == 0) {
		error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
	} else {
		uchar_t *compressed_seg = NULL, *cmpbuf;
		uchar_t *uncompressed_seg = NULL;
		lofi_compress_info_t *li;
		size_t oblkcount;
		ulong_t seglen;
		uint64_t sblkno, eblkno, cmpbytes;
		uint64_t uncompressed_seg_index;
		struct lofi_comp_cache *lc;
		offset_t sblkoff, eblkoff;
		u_offset_t salign, ealign;
		u_offset_t sdiff;
		uint32_t comp_data_sz;
		uint64_t i;
		int j;

		/*
		 * From here on we're dealing primarily with compressed files
		 */
		ASSERT(!lsp->ls_crypto_enabled);

		/*
		 * Compressed files can only be read from and
		 * not written to
		 */
		if (!(bp->b_flags & B_READ)) {
			bp->b_resid = bp->b_bcount;
			error = EROFS;
			goto done;
		}

		ASSERT(lsp->ls_comp_algorithm_index >= 0);
		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
		/*
		 * Compute starting and ending compressed segment numbers
		 * We use only bitwise operations avoiding division and
		 * modulus because we enforce the compression segment size
		 * to a power of 2
		 */
		sblkno = offset >> lsp->ls_comp_seg_shift;
		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);
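		/*
		 * Worked example (illustrative): assuming a 128K segment
		 * size (ls_comp_seg_shift == 17), a read of 8K at offset
		 * 200704 yields sblkno == eblkno == 1 and sblkoff == 69632,
		 * so the request lies entirely within segment 1 and is
		 * eligible for the decompressed-segment cache checked below.
		 */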

		/*
		 * Check the decompressed segment cache.
		 *
		 * The cache is used only when the requested data
		 * is within a segment. Requests that cross
		 * segment boundaries bypass the cache.
		 */
		if (sblkno == eblkno ||
		    (sblkno + 1 == eblkno && eblkoff == 0)) {
			/*
			 * Request doesn't cross a segment boundary,
			 * now check the cache.
			 */
			mutex_enter(&lsp->ls_comp_cache_lock);
			lc = lofi_find_comp_data(lsp, sblkno);
			if (lc != NULL) {
				/*
				 * We've found the decompressed segment
				 * data in the cache; reuse it.
				 */
				bcopy(lc->lc_data + sblkoff, bufaddr,
				    bp->b_bcount);
				mutex_exit(&lsp->ls_comp_cache_lock);
				bp->b_resid = 0;
				error = 0;
				goto done;
			}
			mutex_exit(&lsp->ls_comp_cache_lock);
		}

		/*
		 * Align start offset to block boundary for segmap
		 */
		salign = lsp->ls_comp_seg_index[sblkno];
		sdiff = salign & (DEV_BSIZE - 1);
		salign -= sdiff;
		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
			/*
			 * We're dealing with the last segment of
			 * the compressed file -- the size of this
			 * segment *may not* be the same as the
			 * segment size for the file
			 */
			eblkoff = (offset + bp->b_bcount) &
			    (lsp->ls_uncomp_last_seg_sz - 1);
			ealign = lsp->ls_vp_comp_size;
		} else {
			ealign = lsp->ls_comp_seg_index[eblkno + 1];
		}

		/*
		 * Preserve original request parameters
		 */
		oblkcount = bp->b_bcount;

		/*
		 * Assign the calculated parameters
		 */
		comp_data_sz = ealign - salign;
		bp->b_bcount = comp_data_sz;

		/*
		 * Buffers to hold compressed segments are pre-allocated
		 * on a per-thread basis. Find a pre-allocated buffer
		 * that is not currently in use and mark it for use.
		 */
		mutex_enter(&lsp->ls_comp_bufs_lock);
		for (j = 0; j < lofi_taskq_nthreads; j++) {
			if (lsp->ls_comp_bufs[j].inuse == 0) {
				lsp->ls_comp_bufs[j].inuse = 1;
				break;
			}
		}

		mutex_exit(&lsp->ls_comp_bufs_lock);
		ASSERT(j < lofi_taskq_nthreads);

		/*
		 * If the pre-allocated buffer size does not match
		 * the size of the I/O request, re-allocate it with
		 * the appropriate size
		 */
		if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
			if (lsp->ls_comp_bufs[j].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[j].buf,
				    lsp->ls_comp_bufs[j].bufsize);
			lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
			    KM_SLEEP);
			lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
		}
		compressed_seg = lsp->ls_comp_bufs[j].buf;

		/*
		 * Map in the calculated number of blocks
		 */
		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
		    bp, lsp);

		bp->b_bcount = oblkcount;
		bp->b_resid = oblkcount;
		if (error != 0)
			goto done;

		/*
		 * decompress compressed blocks start
		 */
		cmpbuf = compressed_seg + sdiff;
		for (i = sblkno; i <= eblkno; i++) {
			ASSERT(i < lsp->ls_comp_index_sz - 1);
			uchar_t *useg;

			/*
			 * The last segment is special in that it is
			 * most likely not going to be the same
			 * (uncompressed) size as the other segments.
			 */
			if (i == (lsp->ls_comp_index_sz - 2)) {
				seglen = lsp->ls_uncomp_last_seg_sz;
			} else {
				seglen = lsp->ls_uncomp_seg_sz;
			}

			/*
			 * Each of the segment index entries contains
			 * the starting block number for that segment.
			 * The number of compressed bytes in a segment
			 * is thus the difference between the starting
			 * block number of this segment and the starting
			 * block number of the next segment.
			 */
			cmpbytes = lsp->ls_comp_seg_index[i + 1] -
			    lsp->ls_comp_seg_index[i];
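
			/*
			 * For illustration: if ls_comp_seg_index[i] were
			 * 0x4000 and ls_comp_seg_index[i + 1] were 0x4800,
			 * this segment would occupy 0x800 bytes of the file,
			 * including the SEGHDR flag byte(s).
			 */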

			/*
			 * The first byte in a compressed segment is a flag
			 * that indicates whether this segment is compressed
			 * at all.
			 *
			 * The variable 'useg' is used (instead of
			 * uncompressed_seg) in this loop to keep a
			 * reference to the uncompressed segment.
			 *
			 * N.B. If 'useg' is replaced with uncompressed_seg,
			 * it leads to memory leaks and heap corruption in
			 * corner cases where compressed segments lie
			 * adjacent to uncompressed segments.
			 */
			if (*cmpbuf == UNCOMPRESSED) {
				useg = cmpbuf + SEGHDR;
			} else {
				if (uncompressed_seg == NULL)
					uncompressed_seg =
					    kmem_alloc(lsp->ls_uncomp_seg_sz,
					    KM_SLEEP);
				useg = uncompressed_seg;
				uncompressed_seg_index = i;

				if (li->l_decompress((cmpbuf + SEGHDR),
				    (cmpbytes - SEGHDR), uncompressed_seg,
				    &seglen, li->l_level) != 0) {
					error = EIO;
					goto done;
				}
			}

			/*
			 * Determine how much uncompressed data we
			 * have to copy and copy it
			 */
			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
			if (i == eblkno)
				xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);

			bcopy((useg + sblkoff), bufaddr, xfersize);

			cmpbuf += cmpbytes;
			bufaddr += xfersize;
			bp->b_resid -= xfersize;
			sblkoff = 0;

			if (bp->b_resid == 0)
				break;
		} /* decompress compressed blocks ends */

		/*
		 * Skip to done if there is no uncompressed data to cache
		 */
		if (uncompressed_seg == NULL)
			goto done;

		/*
		 * Add the data for the last decompressed segment to
		 * the cache.
		 *
		 * In case the uncompressed segment data was added to (and
		 * is referenced by) the cache, make sure we don't free it
		 * here.
		 */
		mutex_enter(&lsp->ls_comp_cache_lock);
		if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
		    uncompressed_seg)) != NULL) {
			uncompressed_seg = NULL;
		}
		mutex_exit(&lsp->ls_comp_cache_lock);

done:
		if (compressed_seg != NULL) {
			mutex_enter(&lsp->ls_comp_bufs_lock);
			lsp->ls_comp_bufs[j].inuse = 0;
			mutex_exit(&lsp->ls_comp_bufs_lock);
		}
		if (uncompressed_seg != NULL)
			kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
	} /* end of handling compressed files */

	if ((error == 0) && (syncflag != 0))
		error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);

errout:
	if (bufinited && lsp->ls_kstat) {
		size_t n_done = bp->b_bcount - bp->b_resid;
		kstat_io_t *kioptr;

		mutex_enter(lsp->ls_kstat->ks_lock);
		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
		if (bp->b_flags & B_READ) {
			kioptr->nread += n_done;
			kioptr->reads++;
		} else {
			kioptr->nwritten += n_done;
			kioptr->writes++;
		}
		kstat_runq_exit(kioptr);
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	mutex_enter(&lsp->ls_vp_lock);
	if (--lsp->ls_vp_iocount == 0)
		cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);

	bioerror(bp, error);
	biodone(bp);
}

static int
lofi_strategy(struct buf *bp)
{
	struct lofi_state *lsp;
	offset_t	offset;

	/*
	 * We cannot just do I/O here, because the current thread
	 * _might_ end up back in here because the underlying filesystem
	 * wants a buffer, which eventually gets into bio_recycle and
	 * might call into lofi to write out a delayed-write buffer.
	 * This is bad if the filesystem above lofi is the same as below.
	 *
	 * We could come up with a complex strategy using threads to
	 * do the I/O asynchronously, or we could use task queues. task
	 * queues were incredibly easy so they win.
	 */
	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
	if (lsp == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		bioerror(bp, EIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}
	if (offset == lsp->ls_vp_size) {
		/* EOF */
		if ((bp->b_flags & B_READ) != 0) {
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else {
			/* writes should fail */
			bioerror(bp, ENXIO);
		}
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	if (offset > lsp->ls_vp_size) {
		bioerror(bp, ENXIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	lsp->ls_vp_iocount++;
	mutex_exit(&lsp->ls_vp_lock);

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	return (0);
}

/*ARGSUSED2*/
static int
lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
}

/*ARGSUSED2*/
static int
lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
}

/*ARGSUSED2*/
static int
lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
}

/*ARGSUSED2*/
static int
lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
}

/*ARGSUSED*/
static int
lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = lofi_dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*result = 0;
		return (DDI_SUCCESS);
	}
	return (DDI_FAILURE);
}

static int
lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	error;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	lofi_minor_id = id_space_create("lofi_minor_id", 1, L_MAXMIN32 + 1);

	if (!lofi_minor_id)
		return (DDI_FAILURE);

	error = ddi_soft_state_zalloc(lofi_statep, 0);
	if (error == DDI_FAILURE) {
		id_space_destroy(lofi_minor_id);
		return (DDI_FAILURE);
	}
	error = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
	    DDI_PSEUDO, NULL);
	if (error == DDI_FAILURE) {
		ddi_soft_state_free(lofi_statep, 0);
		id_space_destroy(lofi_minor_id);
		return (DDI_FAILURE);
	}
	/* driver handles kernel-issued IOCTLs */
	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		ddi_remove_minor_node(dip, NULL);
		ddi_soft_state_free(lofi_statep, 0);
		id_space_destroy(lofi_minor_id);
		return (DDI_FAILURE);
	}

	zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL);

	lofi_dip = dip;
	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}

static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (DDI_FAILURE);
	}

	lofi_dip = NULL;
	ddi_remove_minor_node(dip, NULL);
	ddi_prop_remove_all(dip);

	mutex_exit(&lofi_lock);

	if (zone_key_delete(lofi_zone_key) != 0)
		cmn_err(CE_WARN, "failed to delete zone key");

	ddi_soft_state_free(lofi_statep, 0);

	id_space_destroy(lofi_minor_id);

	return (DDI_SUCCESS);
}

/*
 * With addition of encryption, be careful that encryption key is wiped before
 * kernel memory structures are freed, and also that key is not accidentally
 * passed out into userland structures.
 */
static void
free_lofi_ioctl(struct lofi_ioctl *klip)
{
	/* Make sure this encryption key doesn't stick around */
	bzero(klip->li_key, sizeof (klip->li_key));
	kmem_free(klip, sizeof (struct lofi_ioctl));
}

/*
 * These two just simplify the rest of the ioctls that need to copyin/out
 * the lofi_ioctl structure.
 */
int
copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
    int flag)
{
	struct lofi_ioctl *klip;
	int	error;

	klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
	if (error)
		goto err;

	/* ensure NULL termination */
	klip->li_filename[MAXPATHLEN-1] = '\0';
	klip->li_algorithm[MAXALGLEN-1] = '\0';
	klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
	klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';

	if (klip->li_minor > L_MAXMIN32) {
		error = EINVAL;
		goto err;
	}

	return (0);

err:
	free_lofi_ioctl(klip);
	return (error);
}

int
copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
    int flag)
{
	int	error;

	/*
	 * NOTE: Do NOT copy the crypto_key_t "back" to userland.
	 * This ensures that an attacker can't trivially find the
	 * key for a mapping just by issuing the ioctl.
	 *
	 * It can still be found by poking around in kmem with mdb(1),
	 * but there is no point in making it easy when the info isn't
	 * of any use in this direction anyway.
	 *
	 * Either way we don't actually have the raw key stored in
	 * a form that we can get it anyway, since we just used it
	 * to create a ctx template and didn't keep "the original".
	 */
	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
	if (error)
		return (EFAULT);
	return (0);
}

static int
lofi_access(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone)
		return (0);
	return (EPERM);
}

/*
 * Find the lofi state for the given filename. We compare by vnode to
 * allow the global zone visibility into NGZ lofi nodes.
 */
static int
file_to_lofi_nocheck(char *filename, boolean_t readonly,
    struct lofi_state **lspp)
{
	struct lofi_state *lsp;
	vnode_t	*vp = NULL;
	int	err = 0;
	int	rdfiles = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp)) != 0)
		goto out;

	if (vp->v_type == VREG) {
		vnode_t *realvp;
		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			VN_HOLD(realvp);
			VN_RELE(vp);
			vp = realvp;
		}
	}

	for (lsp = list_head(&lofi_list); lsp != NULL;
	    lsp = list_next(&lofi_list, lsp)) {
		if (lsp->ls_vp == vp) {
			if (lspp != NULL)
				*lspp = lsp;
			if (lsp->ls_readonly) {
				rdfiles++;
				/* Skip if '-r' is specified */
				if (readonly)
					continue;
			}
			goto out;
		}
	}

	err = ENOENT;

	/*
	 * If a filename is given as an argument for lofi_unmap, we shouldn't
	 * allow unmap if there are multiple read-only lofi devices associated
	 * with this file.
	 */
	if (lspp != NULL) {
		if (rdfiles == 1)
			err = 0;
		else if (rdfiles > 1)
			err = EBUSY;
	}

out:
	if (vp != NULL)
		VN_RELE(vp);
	return (err);
}

/*
 * Find the minor for the given filename, checking the zone can access
 * it.
 */
static int
file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
{
	int err = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
		return (err);

	if ((err = lofi_access(*lspp)) != 0)
		return (err);

	return (0);
}

/*
 * Fakes up a disk geometry, and one big partition, based on the size
 * of the file. This is needed because we allow newfs'ing the device,
 * and newfs will do several disk ioctls to figure out the geometry and
 * partition information. It uses that information to determine the parameters
 * to pass to mkfs. Geometry is pretty much irrelevant these days, but we
 * have to support it.
 */
static void
fake_disk_geometry(struct lofi_state *lsp)
{
	u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;

	/* dk_geom - see dkio(7I) */
	/*
	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
	 * of sectors), but that breaks programs like fdisk which want to
	 * partition a disk by cylinder. With one cylinder, you can't create
	 * an fdisk partition and put pcfs on it for testing (hard to pick
	 * a number between one and one).
	 *
	 * The cheezy floppy test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
	if (dsize < (2 * 1024 * 1024)) /* floppy? */
		lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
	else
		lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
	/* in case the file is < 100k */
	if (lsp->ls_dkg.dkg_ncyl == 0)
		lsp->ls_dkg.dkg_ncyl = 1;
	lsp->ls_dkg.dkg_acyl = 0;
	lsp->ls_dkg.dkg_bcyl = 0;
	lsp->ls_dkg.dkg_nhead = 1;
	lsp->ls_dkg.dkg_obs1 = 0;
	lsp->ls_dkg.dkg_intrlv = 0;
	lsp->ls_dkg.dkg_obs2 = 0;
	lsp->ls_dkg.dkg_obs3 = 0;
	lsp->ls_dkg.dkg_apc = 0;
	lsp->ls_dkg.dkg_rpm = 7200;
	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl + lsp->ls_dkg.dkg_acyl;
	lsp->ls_dkg.dkg_nsect = dsize / (DEV_BSIZE * lsp->ls_dkg.dkg_ncyl);
	lsp->ls_dkg.dkg_write_reinstruct = 0;
	lsp->ls_dkg.dkg_read_reinstruct = 0;

	/* vtoc - see dkio(7I) */
	bzero(&lsp->ls_vtoc, sizeof (struct vtoc));
	lsp->ls_vtoc.v_sanity = VTOC_SANE;
	lsp->ls_vtoc.v_version = V_VERSION;
	(void) strncpy(lsp->ls_vtoc.v_volume, LOFI_DRIVER_NAME,
	    sizeof (lsp->ls_vtoc.v_volume));
	lsp->ls_vtoc.v_sectorsz = DEV_BSIZE;
	lsp->ls_vtoc.v_nparts = 1;
	lsp->ls_vtoc.v_part[0].p_tag = V_UNASSIGNED;

	/*
	 * A compressed file is read-only, other files can
	 * be read-write
	 */
	if (lsp->ls_uncomp_seg_sz > 0) {
		lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT | V_RONLY;
	} else {
		lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT;
	}
	lsp->ls_vtoc.v_part[0].p_start = (daddr_t)0;
	/*
	 * The partition size cannot just be the number of sectors, because
	 * that might not end on a cylinder boundary. And if that's the case,
	 * newfs/mkfs will print a scary warning. So just figure the size
	 * based on the number of cylinders and sectors/cylinder.
	 */
	lsp->ls_vtoc.v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;

	/* dk_cinfo - see dkio(7I) */
	bzero(&lsp->ls_ci, sizeof (struct dk_cinfo));
	(void) strcpy(lsp->ls_ci.dki_cname, LOFI_DRIVER_NAME);
	lsp->ls_ci.dki_ctype = DKC_MD;
	lsp->ls_ci.dki_flags = 0;
	lsp->ls_ci.dki_cnum = 0;
	lsp->ls_ci.dki_addr = 0;
	lsp->ls_ci.dki_space = 0;
	lsp->ls_ci.dki_prio = 0;
	lsp->ls_ci.dki_vec = 0;
	(void) strcpy(lsp->ls_ci.dki_dname, LOFI_DRIVER_NAME);
	lsp->ls_ci.dki_unit = 0;
	lsp->ls_ci.dki_slave = 0;
	lsp->ls_ci.dki_partition = 0;
	/*
	 * newfs uses this to set maxcontig. Must not be < 16, or it
	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
	 * it by the block size. Then tunefs doesn't work because
	 * maxcontig is 0.
	 */
	lsp->ls_ci.dki_maxtransfer = 16;
}
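
/*
 * Worked example (illustrative): a 100 MB (104857600 byte) file gets
 * dkg_ncyl == 104857600 / (300 * 1024) == 341, dkg_nhead == 1 and
 * dkg_nsect == 104857600 / (512 * 341) == 600, so the faked partition
 * spans 341 * 600 * 1 == 204600 sectors -- slightly less than the
 * file's 204800 sectors, so that it ends exactly on a cylinder boundary.
 */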

/*
 * map in a compressed file
 *
 * Read in the header and the index that follows.
 *
 * The header is as follows -
 *
 * Signature (name of the compression algorithm)
 * Compression segment size (a multiple of 512)
 * Number of index entries
 * Size of the last block
 * The array containing the index entries
 *
 * The header information is always stored in
 * network byte order on disk.
 */
static int
lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
{
	uint32_t index_sz, header_len, i;
	ssize_t	resid;
	enum uio_rw rw;
	char *tbuf = buf;
	int error;

	/* The signature has already been read */
	tbuf += sizeof (lsp->ls_comp_algorithm);
	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);

	/*
	 * The compressed segment size must be a power of 2
	 */
	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
	    !ISP2(lsp->ls_uncomp_seg_sz))
		return (EINVAL);

	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
		;

	lsp->ls_comp_seg_shift = i;

	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);

	tbuf += sizeof (lsp->ls_comp_index_sz);
	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
	    sizeof (lsp->ls_uncomp_last_seg_sz));
	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);

	/*
	 * Compute the total size of the uncompressed data
	 * for use in fake_disk_geometry and other calculations.
	 * Disk geometry has to be faked with respect to the
	 * actual uncompressed data size rather than the
	 * compressed file size.
	 */
	lsp->ls_vp_size =
	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
	    + lsp->ls_uncomp_last_seg_sz;

	/*
	 * Index size is rounded up to DEV_BSIZE for ease
	 * of segmapping
	 */
	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
	header_len = sizeof (lsp->ls_comp_algorithm) +
	    sizeof (lsp->ls_uncomp_seg_sz) +
	    sizeof (lsp->ls_comp_index_sz) +
	    sizeof (lsp->ls_uncomp_last_seg_sz);
	lsp->ls_comp_offbase = header_len + index_sz;

	index_sz += header_len;
	index_sz = roundup(index_sz, DEV_BSIZE);

	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
	lsp->ls_comp_index_data_sz = index_sz;

	/*
	 * Read in the index -- this has a side-effect
	 * of reading in the header as well
	 */
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	/* Skip the header, this is where the index really begins */
	lsp->ls_comp_seg_index =
	    /*LINTED*/
	    (uint64_t *)(lsp->ls_comp_index_data + header_len);

	/*
	 * Now recompute offsets in the index to account for
	 * the header length
	 */
	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
		    BE_64(lsp->ls_comp_seg_index[i]);
	}

	return (error);
}
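
/*
 * Worked example (illustrative, assuming a 32-byte algorithm signature
 * and 1000 index entries): header_len == 32 + 4 + 4 + 4 == 44,
 * ls_comp_offbase == 44 + 8 * 1000 == 8044, and the vn_rdwr() above
 * reads roundup(8044, DEV_BSIZE) == 8192 bytes, covering both the
 * header and the entire index.
 */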
1927 */ 1928 if (klip->li_iv_len > CRYPTO_BITS2BYTES(512)) 1929 return (EINVAL); 1930 1931 if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key)) 1932 return (EINVAL); 1933 1934 lsp->ls_crypto_enabled = klip->li_crypto_enabled; 1935 1936 mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL); 1937 1938 lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher); 1939 if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) { 1940 cmn_err(CE_WARN, "invalid cipher %s requested for %s", 1941 klip->li_cipher, klip->li_filename); 1942 return (EINVAL); 1943 } 1944 1945 /* this is just initialization here */ 1946 lsp->ls_mech.cm_param = NULL; 1947 lsp->ls_mech.cm_param_len = 0; 1948 1949 lsp->ls_iv_type = klip->li_iv_type; 1950 lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher); 1951 if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) { 1952 cmn_err(CE_WARN, "invalid iv cipher %s requested" 1953 " for %s", klip->li_iv_cipher, klip->li_filename); 1954 return (EINVAL); 1955 } 1956 1957 /* iv mech must itself take a null iv */ 1958 lsp->ls_iv_mech.cm_param = NULL; 1959 lsp->ls_iv_mech.cm_param_len = 0; 1960 lsp->ls_iv_len = klip->li_iv_len; 1961 1962 /* 1963 * Create ctx using li_cipher & the raw li_key after checking 1964 * that it isn't a weak key. 1965 */ 1966 lsp->ls_key.ck_format = CRYPTO_KEY_RAW; 1967 lsp->ls_key.ck_length = klip->li_key_len; 1968 lsp->ls_key.ck_data = kmem_alloc( 1969 CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP); 1970 bcopy(klip->li_key, lsp->ls_key.ck_data, 1971 CRYPTO_BITS2BYTES(lsp->ls_key.ck_length)); 1972 1973 ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key); 1974 if (ret != CRYPTO_SUCCESS) { 1975 cmn_err(CE_WARN, "weak key check failed for cipher " 1976 "%s on file %s (0x%x)", klip->li_cipher, 1977 klip->li_filename, ret); 1978 return (EINVAL); 1979 } 1980 1981 error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 1982 CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 1983 if (error != 0) 1984 return (error); 1985 1986 /* 1987 * This is the case where the header in the lofi image is already 1988 * initialized to indicate it is encrypted. 1989 */ 1990 if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) { 1991 /* 1992 * The encryption header information is laid out this way: 1993 * 6 bytes: hex "CFLOFI" 1994 * 2 bytes: version = 0 ... for now 1995 * 96 bytes: reserved1 (not implemented yet) 1996 * 4 bytes: data_sector = 2 ... for now 1997 * more... 
not implemented yet 1998 */ 1999 2000 marker = buf; 2001 2002 /* copy the magic */ 2003 bcopy(marker, lsp->ls_crypto.magic, 2004 sizeof (lsp->ls_crypto.magic)); 2005 marker += sizeof (lsp->ls_crypto.magic); 2006 2007 /* read the encryption version number */ 2008 bcopy(marker, &(lsp->ls_crypto.version), 2009 sizeof (lsp->ls_crypto.version)); 2010 lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version); 2011 marker += sizeof (lsp->ls_crypto.version); 2012 2013 /* read a chunk of reserved data */ 2014 bcopy(marker, lsp->ls_crypto.reserved1, 2015 sizeof (lsp->ls_crypto.reserved1)); 2016 marker += sizeof (lsp->ls_crypto.reserved1); 2017 2018 /* read block number where encrypted data begins */ 2019 bcopy(marker, &(lsp->ls_crypto.data_sector), 2020 sizeof (lsp->ls_crypto.data_sector)); 2021 lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector); 2022 marker += sizeof (lsp->ls_crypto.data_sector); 2023 2024 /* and ignore the rest until it is implemented */ 2025 2026 lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE; 2027 return (0); 2028 } 2029 2030 /* 2031 * We've requested encryption, but no magic was found, so it must be 2032 * a new image. 2033 */ 2034 2035 for (i = 0; i < sizeof (struct crypto_meta); i++) { 2036 if (buf[i] != '\0') 2037 return (EINVAL); 2038 } 2039 2040 marker = buf; 2041 bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic)); 2042 marker += sizeof (lofi_crypto_magic); 2043 chead.version = htons(LOFI_CRYPTO_VERSION); 2044 bcopy(&(chead.version), marker, sizeof (chead.version)); 2045 marker += sizeof (chead.version); 2046 marker += sizeof (chead.reserved1); 2047 chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR); 2048 bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector)); 2049 2050 /* write the header */ 2051 error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE, 2052 CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 2053 if (error != 0) 2054 return (error); 2055 2056 /* fix things up so it looks like we read this info */ 2057 bcopy(lofi_crypto_magic, lsp->ls_crypto.magic, 2058 sizeof (lofi_crypto_magic)); 2059 lsp->ls_crypto.version = LOFI_CRYPTO_VERSION; 2060 lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR; 2061 lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE; 2062 return (0); 2063 } 2064 2065 /* 2066 * Check to see if the passed in signature is a valid one. If it is 2067 * valid, return the index into lofi_compress_table. 
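 *
 * For example (a sketch; the table entries live in the
 * lofi_compress_table[] defined elsewhere in this driver, and the
 * "gzip"/"bogus" names here are illustrative assumptions):
 *
 *	lofi_compress_select("gzip")	returns that entry's index
 *	lofi_compress_select("bogus")	returns -1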
2068 * 2069 * Return -1 if it is invalid 2070 */ 2071 static int 2072 lofi_compress_select(const char *signature) 2073 { 2074 int i; 2075 2076 for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) { 2077 if (strcmp(lofi_compress_table[i].l_name, signature) == 0) 2078 return (i); 2079 } 2080 2081 return (-1); 2082 } 2083 2084 static int 2085 lofi_init_compress(struct lofi_state *lsp) 2086 { 2087 char buf[DEV_BSIZE]; 2088 int compress_index; 2089 ssize_t resid; 2090 int error; 2091 2092 error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE, 2093 0, RLIM64_INFINITY, kcred, &resid); 2094 2095 if (error != 0) 2096 return (error); 2097 2098 if ((compress_index = lofi_compress_select(buf)) == -1) 2099 return (0); 2100 2101 /* compression and encryption are mutually exclusive */ 2102 if (lsp->ls_crypto_enabled) 2103 return (ENOTSUP); 2104 2105 /* initialize compression info for compressed lofi */ 2106 lsp->ls_comp_algorithm_index = compress_index; 2107 (void) strlcpy(lsp->ls_comp_algorithm, 2108 lofi_compress_table[compress_index].l_name, 2109 sizeof (lsp->ls_comp_algorithm)); 2110 2111 /* Finally setup per-thread pre-allocated buffers */ 2112 lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads * 2113 sizeof (struct compbuf), KM_SLEEP); 2114 2115 return (lofi_map_compressed_file(lsp, buf)); 2116 } 2117 2118 /* 2119 * map a file to a minor number. Return the minor number. 2120 */ 2121 static int 2122 lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor, 2123 int *rvalp, struct cred *credp, int ioctl_flag) 2124 { 2125 minor_t minor = (minor_t)-1; 2126 struct lofi_state *lsp = NULL; 2127 struct lofi_ioctl *klip; 2128 int error; 2129 struct vnode *vp = NULL; 2130 vattr_t vattr; 2131 int flag; 2132 dev_t newdev; 2133 char namebuf[50]; 2134 2135 error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 2136 if (error != 0) 2137 return (error); 2138 2139 mutex_enter(&lofi_lock); 2140 2141 mutex_enter(&curproc->p_lock); 2142 if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) { 2143 mutex_exit(&curproc->p_lock); 2144 mutex_exit(&lofi_lock); 2145 free_lofi_ioctl(klip); 2146 return (error); 2147 } 2148 mutex_exit(&curproc->p_lock); 2149 2150 if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly, 2151 NULL) == 0) { 2152 error = EBUSY; 2153 goto err; 2154 } 2155 2156 if (pickminor) { 2157 minor = (minor_t)id_allocff_nosleep(lofi_minor_id); 2158 if (minor == (minor_t)-1) { 2159 error = EAGAIN; 2160 goto err; 2161 } 2162 } else { 2163 if (ddi_get_soft_state(lofi_statep, klip->li_minor) != NULL) { 2164 error = EEXIST; 2165 goto err; 2166 } 2167 2168 minor = (minor_t) 2169 id_alloc_specific_nosleep(lofi_minor_id, klip->li_minor); 2170 ASSERT(minor != (minor_t)-1); 2171 } 2172 2173 flag = FREAD | FWRITE | FOFFMAX | FEXCL; 2174 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0); 2175 if (error) { 2176 /* try read-only */ 2177 flag &= ~FWRITE; 2178 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, 2179 &vp, 0, 0); 2180 if (error) 2181 goto err; 2182 } 2183 2184 if (!V_ISLOFIABLE(vp->v_type)) { 2185 error = EINVAL; 2186 goto err; 2187 } 2188 2189 vattr.va_mask = AT_SIZE; 2190 error = VOP_GETATTR(vp, &vattr, 0, credp, NULL); 2191 if (error) 2192 goto err; 2193 2194 /* the file needs to be a multiple of the block size */ 2195 if ((vattr.va_size % DEV_BSIZE) != 0) { 2196 error = EINVAL; 2197 goto err; 2198 } 2199 2200 /* lsp alloc+init */ 2201 2202 error = ddi_soft_state_zalloc(lofi_statep, minor); 2203 if (error == DDI_FAILURE) { 2204 error = ENOMEM; 2205 goto 
err; 2206 } 2207 2208 lsp = ddi_get_soft_state(lofi_statep, minor); 2209 list_insert_tail(&lofi_list, lsp); 2210 2211 newdev = makedevice(getmajor(dev), minor); 2212 lsp->ls_dev = newdev; 2213 zone_init_ref(&lsp->ls_zone); 2214 zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI); 2215 lsp->ls_uncomp_seg_sz = 0; 2216 lsp->ls_comp_algorithm[0] = '\0'; 2217 lsp->ls_crypto_offset = 0; 2218 2219 cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL); 2220 mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL); 2221 mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2222 mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL); 2223 mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL); 2224 2225 (void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d", 2226 LOFI_DRIVER_NAME, minor); 2227 lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads, 2228 minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0); 2229 2230 list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache), 2231 offsetof(struct lofi_comp_cache, lc_list)); 2232 2233 /* 2234 * save open mode so file can be closed properly and vnode counts 2235 * updated correctly. 2236 */ 2237 lsp->ls_openflag = flag; 2238 2239 lsp->ls_vp = vp; 2240 lsp->ls_stacked_vp = vp; 2241 /* 2242 * Try to handle stacked lofs vnodes. 2243 */ 2244 if (vp->v_type == VREG) { 2245 vnode_t *realvp; 2246 2247 if (VOP_REALVP(vp, &realvp, NULL) == 0) { 2248 /* 2249 * We need to use the realvp for uniqueness 2250 * checking, but keep the stacked vp for 2251 * LOFI_GET_FILENAME display. 2252 */ 2253 VN_HOLD(realvp); 2254 lsp->ls_vp = realvp; 2255 } 2256 } 2257 2258 lsp->ls_vp_size = vattr.va_size; 2259 lsp->ls_vp_comp_size = lsp->ls_vp_size; 2260 2261 lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, minor, 2262 NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid()); 2263 2264 if (lsp->ls_kstat == NULL) { 2265 error = ENOMEM; 2266 goto err; 2267 } 2268 2269 lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock; 2270 kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID); 2271 2272 lsp->ls_readonly = klip->li_readonly; 2273 2274 if ((error = lofi_init_crypto(lsp, klip)) != 0) 2275 goto err; 2276 2277 if ((error = lofi_init_compress(lsp)) != 0) 2278 goto err; 2279 2280 fake_disk_geometry(lsp); 2281 2282 /* create minor nodes */ 2283 2284 (void) snprintf(namebuf, sizeof (namebuf), "%d", minor); 2285 error = ddi_create_minor_node(lofi_dip, namebuf, S_IFBLK, minor, 2286 DDI_PSEUDO, NULL); 2287 if (error != DDI_SUCCESS) { 2288 error = ENXIO; 2289 goto err; 2290 } 2291 2292 (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor); 2293 error = ddi_create_minor_node(lofi_dip, namebuf, S_IFCHR, minor, 2294 DDI_PSEUDO, NULL); 2295 if (error != DDI_SUCCESS) { 2296 /* remove block node */ 2297 (void) snprintf(namebuf, sizeof (namebuf), "%d", minor); 2298 ddi_remove_minor_node(lofi_dip, namebuf); 2299 error = ENXIO; 2300 goto err; 2301 } 2302 2303 /* create DDI properties */ 2304 2305 if ((ddi_prop_update_int64(newdev, lofi_dip, SIZE_PROP_NAME, 2306 lsp->ls_vp_size - lsp->ls_crypto_offset)) != DDI_PROP_SUCCESS) { 2307 error = EINVAL; 2308 goto nodeerr; 2309 } 2310 2311 if ((ddi_prop_update_int64(newdev, lofi_dip, NBLOCKS_PROP_NAME, 2312 (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE)) 2313 != DDI_PROP_SUCCESS) { 2314 error = EINVAL; 2315 goto nodeerr; 2316 } 2317 2318 if (ddi_prop_update_string(newdev, lofi_dip, ZONE_PROP_NAME, 2319 (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) { 2320 error = EINVAL; 2321 goto nodeerr; 2322 } 2323 2324 
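	/*
	 * At this point the block and character minor nodes exist and
	 * the Size/Nblocks/Zone properties are published, so the new
	 * device is visible to userland.  Illustrative transcript (an
	 * assumption for clarity, not captured output):
	 *
	 *	# lofiadm -a /export/disk.img
	 *	/dev/lofi/1
	 *
	 * All that remains below is to install the kstat and copy the
	 * chosen minor number back out to the caller.
	 */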
kstat_install(lsp->ls_kstat); 2325 2326 mutex_exit(&lofi_lock); 2327 2328 if (rvalp) 2329 *rvalp = (int)minor; 2330 klip->li_minor = minor; 2331 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 2332 free_lofi_ioctl(klip); 2333 return (0); 2334 2335 nodeerr: 2336 lofi_free_dev(newdev); 2337 err: 2338 if (lsp != NULL) { 2339 lofi_destroy(lsp, credp); 2340 } else { 2341 if (vp != NULL) { 2342 (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL); 2343 VN_RELE(vp); 2344 } 2345 2346 if (minor != (minor_t)-1) 2347 id_free(lofi_minor_id, minor); 2348 2349 rctl_decr_lofi(curproc->p_zone, 1); 2350 } 2351 2352 mutex_exit(&lofi_lock); 2353 free_lofi_ioctl(klip); 2354 return (error); 2355 } 2356 2357 /* 2358 * unmap a file. 2359 */ 2360 static int 2361 lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename, 2362 struct cred *credp, int ioctl_flag) 2363 { 2364 struct lofi_state *lsp; 2365 struct lofi_ioctl *klip; 2366 int err; 2367 2368 err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 2369 if (err != 0) 2370 return (err); 2371 2372 mutex_enter(&lofi_lock); 2373 if (byfilename) { 2374 if ((err = file_to_lofi(klip->li_filename, klip->li_readonly, 2375 &lsp)) != 0) { 2376 mutex_exit(&lofi_lock); 2377 return (err); 2378 } 2379 } else if (klip->li_minor == 0) { 2380 mutex_exit(&lofi_lock); 2381 free_lofi_ioctl(klip); 2382 return (ENXIO); 2383 } else { 2384 lsp = ddi_get_soft_state(lofi_statep, klip->li_minor); 2385 } 2386 2387 if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) { 2388 mutex_exit(&lofi_lock); 2389 free_lofi_ioctl(klip); 2390 return (ENXIO); 2391 } 2392 2393 klip->li_minor = getminor(lsp->ls_dev); 2394 2395 /* 2396 * If it's still held open, we'll do one of three things: 2397 * 2398 * If no flag is set, just return EBUSY. 2399 * 2400 * If the 'cleanup' flag is set, unmap and remove the device when 2401 * the last user finishes. 2402 * 2403 * If the 'force' flag is set, then we forcibly close the underlying 2404 * file. Subsequent operations will fail, and the DKIOCSTATE ioctl 2405 * will return DKIO_DEV_GONE. When the device is last closed, the 2406 * device will be cleaned up appropriately. 2407 * 2408 * This is complicated by the fact that we may have outstanding 2409 * dispatched I/Os. Rather than having a single mutex to serialize all 2410 * I/O, we keep a count of the number of outstanding I/O requests 2411 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os 2412 * should be dispatched (ls_vp_closereq). 2413 * 2414 * We set the flag, wait for the number of outstanding I/Os to reach 0, 2415 * and then close the underlying vnode. 2416 */ 2417 if (is_opened(lsp)) { 2418 if (klip->li_force) { 2419 mutex_enter(&lsp->ls_vp_lock); 2420 lsp->ls_vp_closereq = B_TRUE; 2421 /* wake up any threads waiting on dkiocstate */ 2422 cv_broadcast(&lsp->ls_vp_cv); 2423 while (lsp->ls_vp_iocount > 0) 2424 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); 2425 mutex_exit(&lsp->ls_vp_lock); 2426 2427 goto out; 2428 } else if (klip->li_cleanup) { 2429 lsp->ls_cleanup = 1; 2430 mutex_exit(&lofi_lock); 2431 free_lofi_ioctl(klip); 2432 return (0); 2433 } 2434 2435 mutex_exit(&lofi_lock); 2436 free_lofi_ioctl(klip); 2437 return (EBUSY); 2438 } 2439 2440 out: 2441 lofi_free_dev(lsp->ls_dev); 2442 lofi_destroy(lsp, credp); 2443 2444 mutex_exit(&lofi_lock); 2445 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 2446 free_lofi_ioctl(klip); 2447 return (0); 2448 } 2449 2450 /* 2451 * get the filename given the minor number, or the minor number given 2452 * the name. 
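 *
 * A userland consumer (lofiadm is the expected one) drives this
 * via ioctls on the control device; roughly (a sketch, with error
 * handling omitted and the field names taken from the code below):
 *
 *	struct lofi_ioctl li;
 *	li.li_minor = minor;
 *	ioctl(ctl_fd, LOFI_GET_FILENAME, &li);
 *	printf("%s\n", li.li_filename);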
2453 */ 2454 /*ARGSUSED*/ 2455 static int 2456 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, 2457 struct cred *credp, int ioctl_flag) 2458 { 2459 struct lofi_ioctl *klip; 2460 struct lofi_state *lsp; 2461 int error; 2462 2463 error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 2464 if (error != 0) 2465 return (error); 2466 2467 switch (which) { 2468 case LOFI_GET_FILENAME: 2469 if (klip->li_minor == 0) { 2470 free_lofi_ioctl(klip); 2471 return (EINVAL); 2472 } 2473 2474 mutex_enter(&lofi_lock); 2475 lsp = ddi_get_soft_state(lofi_statep, klip->li_minor); 2476 if (lsp == NULL || lofi_access(lsp) != 0) { 2477 mutex_exit(&lofi_lock); 2478 free_lofi_ioctl(klip); 2479 return (ENXIO); 2480 } 2481 2482 /* 2483 * This may fail if, for example, we're trying to look 2484 * up a zoned NFS path from the global zone. 2485 */ 2486 if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename, 2487 sizeof (klip->li_filename), CRED()) != 0) { 2488 (void) strlcpy(klip->li_filename, "?", 2489 sizeof (klip->li_filename)); 2490 } 2491 2492 klip->li_readonly = lsp->ls_readonly; 2493 2494 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 2495 sizeof (klip->li_algorithm)); 2496 klip->li_crypto_enabled = lsp->ls_crypto_enabled; 2497 mutex_exit(&lofi_lock); 2498 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 2499 free_lofi_ioctl(klip); 2500 return (error); 2501 case LOFI_GET_MINOR: 2502 mutex_enter(&lofi_lock); 2503 error = file_to_lofi(klip->li_filename, 2504 klip->li_readonly, &lsp); 2505 if (error == 0) 2506 klip->li_minor = getminor(lsp->ls_dev); 2507 mutex_exit(&lofi_lock); 2508 2509 if (error == 0) 2510 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 2511 2512 free_lofi_ioctl(klip); 2513 return (error); 2514 case LOFI_CHECK_COMPRESSED: 2515 mutex_enter(&lofi_lock); 2516 error = file_to_lofi(klip->li_filename, 2517 klip->li_readonly, &lsp); 2518 if (error != 0) { 2519 mutex_exit(&lofi_lock); 2520 free_lofi_ioctl(klip); 2521 return (error); 2522 } 2523 2524 klip->li_minor = getminor(lsp->ls_dev); 2525 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 2526 sizeof (klip->li_algorithm)); 2527 2528 mutex_exit(&lofi_lock); 2529 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 2530 free_lofi_ioctl(klip); 2531 return (error); 2532 default: 2533 free_lofi_ioctl(klip); 2534 return (EINVAL); 2535 } 2536 } 2537 2538 static int 2539 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, 2540 int *rvalp) 2541 { 2542 int error; 2543 enum dkio_state dkstate; 2544 struct lofi_state *lsp; 2545 minor_t minor; 2546 2547 minor = getminor(dev); 2548 /* lofi ioctls only apply to the master device */ 2549 if (minor == 0) { 2550 struct lofi_ioctl *lip = (struct lofi_ioctl *)arg; 2551 2552 /* 2553 * the query command only need read-access - i.e., normal 2554 * users are allowed to do those on the ctl device as 2555 * long as they can open it read-only. 
2556 */ 2557 switch (cmd) { 2558 case LOFI_MAP_FILE: 2559 if ((flag & FWRITE) == 0) 2560 return (EPERM); 2561 return (lofi_map_file(dev, lip, 1, rvalp, credp, flag)); 2562 case LOFI_MAP_FILE_MINOR: 2563 if ((flag & FWRITE) == 0) 2564 return (EPERM); 2565 return (lofi_map_file(dev, lip, 0, rvalp, credp, flag)); 2566 case LOFI_UNMAP_FILE: 2567 if ((flag & FWRITE) == 0) 2568 return (EPERM); 2569 return (lofi_unmap_file(lip, 1, credp, flag)); 2570 case LOFI_UNMAP_FILE_MINOR: 2571 if ((flag & FWRITE) == 0) 2572 return (EPERM); 2573 return (lofi_unmap_file(lip, 0, credp, flag)); 2574 case LOFI_GET_FILENAME: 2575 return (lofi_get_info(dev, lip, LOFI_GET_FILENAME, 2576 credp, flag)); 2577 case LOFI_GET_MINOR: 2578 return (lofi_get_info(dev, lip, LOFI_GET_MINOR, 2579 credp, flag)); 2580 2581 /* 2582 * This API made limited sense when this value was fixed 2583 * at LOFI_MAX_FILES. However, its use to iterate 2584 * across all possible devices in lofiadm means we don't 2585 * want to return L_MAXMIN32, but the highest 2586 * *allocated* minor. 2587 */ 2588 case LOFI_GET_MAXMINOR: 2589 minor = 0; 2590 2591 mutex_enter(&lofi_lock); 2592 2593 for (lsp = list_head(&lofi_list); lsp != NULL; 2594 lsp = list_next(&lofi_list, lsp)) { 2595 if (lofi_access(lsp) != 0) 2596 continue; 2597 2598 if (getminor(lsp->ls_dev) > minor) 2599 minor = getminor(lsp->ls_dev); 2600 } 2601 2602 mutex_exit(&lofi_lock); 2603 2604 error = ddi_copyout(&minor, &lip->li_minor, 2605 sizeof (minor), flag); 2606 if (error) 2607 return (EFAULT); 2608 return (0); 2609 2610 case LOFI_CHECK_COMPRESSED: 2611 return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED, 2612 credp, flag)); 2613 default: 2614 return (EINVAL); 2615 } 2616 } 2617 2618 mutex_enter(&lofi_lock); 2619 lsp = ddi_get_soft_state(lofi_statep, minor); 2620 if (lsp == NULL || lsp->ls_vp_closereq) { 2621 mutex_exit(&lofi_lock); 2622 return (ENXIO); 2623 } 2624 mutex_exit(&lofi_lock); 2625 2626 /* 2627 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with 2628 * EIO as if the device was no longer present. 2629 */ 2630 if (lsp->ls_vp == NULL && cmd != DKIOCSTATE) 2631 return (EIO); 2632 2633 /* these are for faking out utilities like newfs */ 2634 switch (cmd) { 2635 case DKIOCGVTOC: 2636 switch (ddi_model_convert_from(flag & FMODELS)) { 2637 case DDI_MODEL_ILP32: { 2638 struct vtoc32 vtoc32; 2639 2640 vtoctovtoc32(lsp->ls_vtoc, vtoc32); 2641 if (ddi_copyout(&vtoc32, (void *)arg, 2642 sizeof (struct vtoc32), flag)) 2643 return (EFAULT); 2644 break; 2645 } 2646 2647 case DDI_MODEL_NONE: 2648 if (ddi_copyout(&lsp->ls_vtoc, (void *)arg, 2649 sizeof (struct vtoc), flag)) 2650 return (EFAULT); 2651 break; 2652 } 2653 return (0); 2654 case DKIOCINFO: 2655 error = ddi_copyout(&lsp->ls_ci, (void *)arg, 2656 sizeof (struct dk_cinfo), flag); 2657 if (error) 2658 return (EFAULT); 2659 return (0); 2660 case DKIOCG_VIRTGEOM: 2661 case DKIOCG_PHYGEOM: 2662 case DKIOCGGEOM: 2663 error = ddi_copyout(&lsp->ls_dkg, (void *)arg, 2664 sizeof (struct dk_geom), flag); 2665 if (error) 2666 return (EFAULT); 2667 return (0); 2668 case DKIOCSTATE: 2669 /* 2670 * Normally, lofi devices are always in the INSERTED state. If 2671 * a device is forcefully unmapped, then the device transitions 2672 * to the DKIO_DEV_GONE state. 
2673 */ 2674 if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate), 2675 flag) != 0) 2676 return (EFAULT); 2677 2678 mutex_enter(&lsp->ls_vp_lock); 2679 lsp->ls_vp_iocount++; 2680 while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) || 2681 (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) && 2682 !lsp->ls_vp_closereq) { 2683 /* 2684 * By virtue of having the device open, we know that 2685 * 'lsp' will remain valid when we return. 2686 */ 2687 if (!cv_wait_sig(&lsp->ls_vp_cv, 2688 &lsp->ls_vp_lock)) { 2689 lsp->ls_vp_iocount--; 2690 cv_broadcast(&lsp->ls_vp_cv); 2691 mutex_exit(&lsp->ls_vp_lock); 2692 return (EINTR); 2693 } 2694 } 2695 2696 dkstate = (!lsp->ls_vp_closereq && lsp->ls_vp != NULL ? 2697 DKIO_INSERTED : DKIO_DEV_GONE); 2698 lsp->ls_vp_iocount--; 2699 cv_broadcast(&lsp->ls_vp_cv); 2700 mutex_exit(&lsp->ls_vp_lock); 2701 2702 if (ddi_copyout(&dkstate, (void *)arg, 2703 sizeof (dkstate), flag) != 0) 2704 return (EFAULT); 2705 return (0); 2706 default: 2707 return (ENOTTY); 2708 } 2709 } 2710 2711 static struct cb_ops lofi_cb_ops = { 2712 lofi_open, /* open */ 2713 lofi_close, /* close */ 2714 lofi_strategy, /* strategy */ 2715 nodev, /* print */ 2716 nodev, /* dump */ 2717 lofi_read, /* read */ 2718 lofi_write, /* write */ 2719 lofi_ioctl, /* ioctl */ 2720 nodev, /* devmap */ 2721 nodev, /* mmap */ 2722 nodev, /* segmap */ 2723 nochpoll, /* poll */ 2724 ddi_prop_op, /* prop_op */ 2725 0, /* streamtab */ 2726 D_64BIT | D_NEW | D_MP, /* Driver compatibility flag */ 2727 CB_REV, 2728 lofi_aread, 2729 lofi_awrite 2730 }; 2731 2732 static struct dev_ops lofi_ops = { 2733 DEVO_REV, /* devo_rev, */ 2734 0, /* refcnt */ 2735 lofi_info, /* info */ 2736 nulldev, /* identify */ 2737 nulldev, /* probe */ 2738 lofi_attach, /* attach */ 2739 lofi_detach, /* detach */ 2740 nodev, /* reset */ 2741 &lofi_cb_ops, /* driver operations */ 2742 NULL, /* no bus operations */ 2743 NULL, /* power */ 2744 ddi_quiesce_not_needed, /* quiesce */ 2745 }; 2746 2747 static struct modldrv modldrv = { 2748 &mod_driverops, 2749 "loopback file driver", 2750 &lofi_ops, 2751 }; 2752 2753 static struct modlinkage modlinkage = { 2754 MODREV_1, 2755 &modldrv, 2756 NULL 2757 }; 2758 2759 int 2760 _init(void) 2761 { 2762 int error; 2763 2764 list_create(&lofi_list, sizeof (struct lofi_state), 2765 offsetof(struct lofi_state, ls_list)); 2766 2767 error = ddi_soft_state_init(&lofi_statep, 2768 sizeof (struct lofi_state), 0); 2769 if (error) 2770 return (error); 2771 2772 mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL); 2773 2774 error = mod_install(&modlinkage); 2775 if (error) { 2776 mutex_destroy(&lofi_lock); 2777 ddi_soft_state_fini(&lofi_statep); 2778 list_destroy(&lofi_list); 2779 } 2780 2781 return (error); 2782 } 2783 2784 int 2785 _fini(void) 2786 { 2787 int error; 2788 2789 mutex_enter(&lofi_lock); 2790 2791 if (!list_is_empty(&lofi_list)) { 2792 mutex_exit(&lofi_lock); 2793 return (EBUSY); 2794 } 2795 2796 mutex_exit(&lofi_lock); 2797 2798 error = mod_remove(&modlinkage); 2799 if (error) 2800 return (error); 2801 2802 mutex_destroy(&lofi_lock); 2803 ddi_soft_state_fini(&lofi_statep); 2804 list_destroy(&lofi_list); 2805 2806 return (error); 2807 } 2808 2809 int 2810 _info(struct modinfo *modinfop) 2811 { 2812 return (mod_info(&modlinkage, modinfop)); 2813 }