1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2016 Andrey Sokolov 26 * Copyright 2016 Toomas Soome <tsoome@me.com> 27 * Copyright 2019 Joyent, Inc. 28 */ 29 30 /* 31 * lofi (loopback file) driver - allows you to attach a file to a device, 32 * which can then be accessed through that device. The simple model is that 33 * you tell lofi to open a file, and then use the block device you get as 34 * you would any block device. lofi translates access to the block device 35 * into I/O on the underlying file. This is mostly useful for 36 * mounting images of filesystems. 37 * 38 * lofi is controlled through /dev/lofictl - this is the only device exported 39 * during attach, and is instance number 0. lofiadm communicates with lofi 40 * through ioctls on this device. When a file is attached to lofi, block and 41 * character devices are exported in /dev/lofi and /dev/rlofi. These devices 42 * are identified by lofi instance number, and the instance number is also used 43 * as the name in /dev/lofi. 44 * 45 * Virtual disks, or, labeled lofi, implements virtual disk support to 46 * support partition table and related tools. Such mappings will cause 47 * block and character devices to be exported in /dev/dsk and /dev/rdsk 48 * directories. 49 * 50 * To support virtual disks, the instance number space is divided to two 51 * parts, upper part for instance number and lower part for minor number 52 * space to identify partitions and slices. The virtual disk support is 53 * implemented by stacking cmlb module. For virtual disks, the partition 54 * related ioctl calls are routed to cmlb module. Compression and encryption 55 * is not supported for virtual disks. 56 * 57 * Mapped devices are tracked with state structures handled with 58 * ddi_soft_state(9F) for simplicity. 59 * 60 * A file attached to lofi is opened when attached and not closed until 61 * explicitly detached from lofi. This seems more sensible than deferring 62 * the open until the /dev/lofi device is opened, for a number of reasons. 63 * One is that any failure is likely to be noticed by the person (or script) 64 * running lofiadm. Another is that it would be a security problem if the 65 * file was replaced by another one after being added but before being opened. 66 * 67 * The only hard part about lofi is the ioctls. In order to support things 68 * like 'newfs' on a lofi device, it needs to support certain disk ioctls. 69 * So it has to fake disk geometry and partition information. More may need 70 * to be faked if your favorite utility doesn't work and you think it should 71 * (fdformat doesn't work because it really wants to know the type of floppy 72 * controller to talk to, and that didn't seem easy to fake. Or possibly even 73 * necessary, since we have mkfs_pcfs now). 74 * 75 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To 76 * support simulation of hotplug events, an optional force flag is provided. 77 * If a lofi device is open when a force detach is requested, then the 78 * underlying file is closed and any subsequent operations return EIO. When the 79 * device is closed for the last time, it will be cleaned up at that time. In 80 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is 81 * detached but not removed. 82 * 83 * If detach was requested and lofi device is not open, we will perform 84 * unmap and remove the lofi instance. 85 * 86 * If the lofi device is open and the li_cleanup is set on ioctl request, 87 * we set ls_cleanup flag to notify the cleanup is requested, and the 88 * last lofi_close will perform the unmapping and this lofi instance will be 89 * removed. 90 * 91 * If the lofi device is open and the li_force is set on ioctl request, 92 * we set ls_cleanup flag to notify the cleanup is requested, 93 * we also set ls_vp_closereq to notify IO tasks to return EIO on new 94 * IO requests and wait in process IO count to become 0, indicating there 95 * are no more IO requests. Since ls_cleanup is set, the last lofi_close 96 * will perform unmap and this lofi instance will be removed. 97 * See also lofi_unmap_file() for details. 98 * 99 * Once ls_cleanup is set for the instance, we do not allow lofi_open() 100 * calls to succeed and can have last lofi_close() to remove the instance. 101 * 102 * Known problems: 103 * 104 * UFS logging. Mounting a UFS filesystem image "logging" 105 * works for basic copy testing but wedges during a build of ON through 106 * that image. Some deadlock in lufs holding the log mutex and then 107 * getting stuck on a buf. So for now, don't do that. 108 * 109 * Direct I/O. Since the filesystem data is being cached in the buffer 110 * cache, _and_ again in the underlying filesystem, it's tempting to 111 * enable direct I/O on the underlying file. Don't, because that deadlocks. 112 * I think to fix the cache-twice problem we might need filesystem support. 113 * 114 * Interesting things to do: 115 * 116 * Allow multiple files for each device. A poor-man's metadisk, basically. 117 * 118 * Pass-through ioctls on block devices. You can (though it's not 119 * documented), give lofi a block device as a file name. Then we shouldn't 120 * need to fake a geometry, however, it may be relevant if you're replacing 121 * metadisk, or using lofi to get crypto. 122 * It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1 123 * and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home. 124 * In fact this even makes sense if you have lofi "above" metadisk. 125 * 126 * Encryption: 127 * Each lofi device can have its own symmetric key and cipher. 128 * They are passed to us by lofiadm(1m) in the correct format for use 129 * with the misc/kcf crypto_* routines. 130 * 131 * Each block has its own IV, that is calculated in lofi_blk_mech(), based 132 * on the "master" key held in the lsp and the block number of the buffer. 133 */ 134 135 #include <sys/types.h> 136 #include <netinet/in.h> 137 #include <sys/sysmacros.h> 138 #include <sys/uio.h> 139 #include <sys/kmem.h> 140 #include <sys/cred.h> 141 #include <sys/mman.h> 142 #include <sys/errno.h> 143 #include <sys/aio_req.h> 144 #include <sys/stat.h> 145 #include <sys/file.h> 146 #include <sys/modctl.h> 147 #include <sys/conf.h> 148 #include <sys/debug.h> 149 #include <sys/vnode.h> 150 #include <sys/lofi.h> 151 #include <sys/lofi_impl.h> /* for cache structure */ 152 #include <sys/fcntl.h> 153 #include <sys/pathname.h> 154 #include <sys/filio.h> 155 #include <sys/fdio.h> 156 #include <sys/open.h> 157 #include <sys/disp.h> 158 #include <vm/seg_map.h> 159 #include <sys/ddi.h> 160 #include <sys/sunddi.h> 161 #include <sys/zmod.h> 162 #include <sys/id_space.h> 163 #include <sys/mkdev.h> 164 #include <sys/crypto/common.h> 165 #include <sys/crypto/api.h> 166 #include <sys/rctl.h> 167 #include <sys/vtoc.h> 168 #include <sys/scsi/scsi.h> /* for DTYPE_DIRECT */ 169 #include <sys/scsi/impl/uscsi.h> 170 #include <sys/sysevent/dev.h> 171 #include <sys/efi_partition.h> 172 #include <sys/note.h> 173 #include <LzmaDec.h> 174 175 #define NBLOCKS_PROP_NAME "Nblocks" 176 #define SIZE_PROP_NAME "Size" 177 #define ZONE_PROP_NAME "zone" 178 179 #define SETUP_C_DATA(cd, buf, len) \ 180 (cd).cd_format = CRYPTO_DATA_RAW; \ 181 (cd).cd_offset = 0; \ 182 (cd).cd_miscdata = NULL; \ 183 (cd).cd_length = (len); \ 184 (cd).cd_raw.iov_base = (buf); \ 185 (cd).cd_raw.iov_len = (len); 186 187 #define UIO_CHECK(uio) \ 188 if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \ 189 ((uio)->uio_resid % DEV_BSIZE) != 0) { \ 190 return (EINVAL); \ 191 } 192 193 #define LOFI_TIMEOUT 30 194 195 static void *lofi_statep; 196 static kmutex_t lofi_lock; /* state lock */ 197 static id_space_t *lofi_id; /* lofi ID values */ 198 static list_t lofi_list; 199 static zone_key_t lofi_zone_key; 200 201 /* 202 * Because lofi_taskq_nthreads limits the actual swamping of the device, the 203 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively 204 * high. If we want to be assured that the underlying device is always busy, 205 * we must be sure that the number of bytes enqueued when the number of 206 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for 207 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should 208 * set maxalloc to be the maximum throughput (in bytes per second) of the 209 * underlying device divided by the minimum I/O size. We assume a realistic 210 * maximum throughput of one hundred megabytes per second; we set maxalloc on 211 * the lofi task queue to be 104857600 divided by DEV_BSIZE. 212 */ 213 static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE; 214 static int lofi_taskq_nthreads = 4; /* # of taskq threads per device */ 215 216 const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC; 217 218 /* 219 * To avoid decompressing data in a compressed segment multiple times 220 * when accessing small parts of a segment's data, we cache and reuse 221 * the uncompressed segment's data. 222 * 223 * A single cached segment is sufficient to avoid lots of duplicate 224 * segment decompress operations. A small cache size also reduces the 225 * memory footprint. 226 * 227 * lofi_max_comp_cache is the maximum number of decompressed data segments 228 * cached for each compressed lofi image. It can be set to 0 to disable 229 * caching. 230 */ 231 232 uint32_t lofi_max_comp_cache = 1; 233 234 static int gzip_decompress(void *src, size_t srclen, void *dst, 235 size_t *destlen, int level); 236 237 static int lzma_decompress(void *src, size_t srclen, void *dst, 238 size_t *dstlen, int level); 239 240 lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = { 241 {gzip_decompress, NULL, 6, "gzip"}, /* default */ 242 {gzip_decompress, NULL, 6, "gzip-6"}, 243 {gzip_decompress, NULL, 9, "gzip-9"}, 244 {lzma_decompress, NULL, 0, "lzma"} 245 }; 246 247 static void lofi_strategy_task(void *); 248 static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, 249 size_t, void *); 250 static int lofi_tg_getinfo(dev_info_t *, int, void *, void *); 251 252 struct cmlb_tg_ops lofi_tg_ops = { 253 TG_DK_OPS_VERSION_1, 254 lofi_tg_rdwr, 255 lofi_tg_getinfo 256 }; 257 258 /*ARGSUSED*/ 259 static void 260 *SzAlloc(void *p, size_t size) 261 { 262 return (kmem_alloc(size, KM_SLEEP)); 263 } 264 265 /*ARGSUSED*/ 266 static void 267 SzFree(void *p, void *address, size_t size) 268 { 269 kmem_free(address, size); 270 } 271 272 static ISzAlloc g_Alloc = { SzAlloc, SzFree }; 273 274 /* 275 * Free data referenced by the linked list of cached uncompressed 276 * segments. 277 */ 278 static void 279 lofi_free_comp_cache(struct lofi_state *lsp) 280 { 281 struct lofi_comp_cache *lc; 282 283 while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) { 284 kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz); 285 kmem_free(lc, sizeof (struct lofi_comp_cache)); 286 lsp->ls_comp_cache_count--; 287 } 288 ASSERT(lsp->ls_comp_cache_count == 0); 289 } 290 291 static int 292 is_opened(struct lofi_state *lsp) 293 { 294 int i; 295 boolean_t last = B_TRUE; 296 297 ASSERT(MUTEX_HELD(&lofi_lock)); 298 for (i = 0; i < LOFI_PART_MAX; i++) { 299 if (lsp->ls_open_lyr[i]) { 300 last = B_FALSE; 301 break; 302 } 303 } 304 305 for (i = 0; last && (i < OTYP_LYR); i++) { 306 if (lsp->ls_open_reg[i]) { 307 last = B_FALSE; 308 } 309 } 310 311 return (!last); 312 } 313 314 static void 315 lofi_set_cleanup(struct lofi_state *lsp) 316 { 317 ASSERT(MUTEX_HELD(&lofi_lock)); 318 319 lsp->ls_cleanup = B_TRUE; 320 321 /* wake up any threads waiting on dkiocstate */ 322 cv_broadcast(&lsp->ls_vp_cv); 323 } 324 325 static void 326 lofi_free_crypto(struct lofi_state *lsp) 327 { 328 ASSERT(MUTEX_HELD(&lofi_lock)); 329 330 if (lsp->ls_crypto_enabled) { 331 /* 332 * Clean up the crypto state so that it doesn't hang around 333 * in memory after we are done with it. 334 */ 335 if (lsp->ls_key.ck_data != NULL) { 336 bzero(lsp->ls_key.ck_data, 337 CRYPTO_BITS2BYTES(lsp->ls_key.ck_length)); 338 kmem_free(lsp->ls_key.ck_data, 339 CRYPTO_BITS2BYTES(lsp->ls_key.ck_length)); 340 lsp->ls_key.ck_data = NULL; 341 lsp->ls_key.ck_length = 0; 342 } 343 344 if (lsp->ls_mech.cm_param != NULL) { 345 kmem_free(lsp->ls_mech.cm_param, 346 lsp->ls_mech.cm_param_len); 347 lsp->ls_mech.cm_param = NULL; 348 lsp->ls_mech.cm_param_len = 0; 349 } 350 351 if (lsp->ls_iv_mech.cm_param != NULL) { 352 kmem_free(lsp->ls_iv_mech.cm_param, 353 lsp->ls_iv_mech.cm_param_len); 354 lsp->ls_iv_mech.cm_param = NULL; 355 lsp->ls_iv_mech.cm_param_len = 0; 356 } 357 358 mutex_destroy(&lsp->ls_crypto_lock); 359 } 360 } 361 362 /* ARGSUSED */ 363 static int 364 lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start, 365 size_t length, void *tg_cookie) 366 { 367 struct lofi_state *lsp; 368 buf_t *bp; 369 int instance; 370 int rv = 0; 371 372 instance = ddi_get_instance(dip); 373 if (instance == 0) /* control node does not have disk */ 374 return (ENXIO); 375 376 lsp = ddi_get_soft_state(lofi_statep, instance); 377 378 if (lsp == NULL) 379 return (ENXIO); 380 381 if (cmd != TG_READ && cmd != TG_WRITE) 382 return (EINVAL); 383 384 /* 385 * Make sure the mapping is set up by checking lsp->ls_vp_ready. 386 */ 387 mutex_enter(&lsp->ls_vp_lock); 388 while (lsp->ls_vp_ready == B_FALSE) 389 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); 390 mutex_exit(&lsp->ls_vp_lock); 391 392 if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) { 393 /* We can only transfer whole blocks at a time! */ 394 return (EINVAL); 395 } 396 397 bp = getrbuf(KM_SLEEP); 398 399 if (cmd == TG_READ) { 400 bp->b_flags = B_READ; 401 } else { 402 if (lsp->ls_readonly == B_TRUE) { 403 freerbuf(bp); 404 return (EROFS); 405 } 406 bp->b_flags = B_WRITE; 407 } 408 409 bp->b_un.b_addr = bufaddr; 410 bp->b_bcount = length; 411 bp->b_lblkno = start; 412 bp->b_private = NULL; 413 bp->b_edev = lsp->ls_dev; 414 415 if (lsp->ls_kstat) { 416 mutex_enter(lsp->ls_kstat->ks_lock); 417 kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat)); 418 mutex_exit(lsp->ls_kstat->ks_lock); 419 } 420 (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP); 421 (void) biowait(bp); 422 423 rv = geterror(bp); 424 freerbuf(bp); 425 return (rv); 426 } 427 428 /* 429 * Get device geometry info for cmlb. 430 * 431 * We have mapped disk image as virtual block device and have to report 432 * physical/virtual geometry to cmlb. 433 * 434 * So we have two principal cases: 435 * 1. Uninitialised image without any existing labels, 436 * for this case we fabricate the data based on mapped image. 437 * 2. Image with existing label information. 438 * Since we have no information how the image was created (it may be 439 * dump from some physical device), we need to rely on label information 440 * from image, or we get "corrupted label" errors. 441 * NOTE: label can be MBR, MBR+SMI, GPT 442 */ 443 static int 444 lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie) 445 { 446 struct lofi_state *lsp; 447 int instance; 448 int ashift; 449 450 _NOTE(ARGUNUSED(tg_cookie)); 451 instance = ddi_get_instance(dip); 452 if (instance == 0) /* control device has no storage */ 453 return (ENXIO); 454 455 lsp = ddi_get_soft_state(lofi_statep, instance); 456 457 if (lsp == NULL) 458 return (ENXIO); 459 460 /* 461 * Make sure the mapping is set up by checking lsp->ls_vp_ready. 462 * 463 * When mapping is created, new lofi instance is created and 464 * lofi_attach() will call cmlb_attach() as part of the procedure 465 * to set the mapping up. This chain of events will happen in 466 * the same thread. 467 * Since cmlb_attach() will call lofi_tg_getinfo to get 468 * capacity, we return error on that call if cookie is set, 469 * otherwise lofi_attach will be stuck as the mapping is not yet 470 * finalized and lofi is not yet ready. 471 * Note, such error is not fatal for cmlb, as the label setup 472 * will be finalized when cmlb_validate() is called. 473 */ 474 mutex_enter(&lsp->ls_vp_lock); 475 if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) { 476 mutex_exit(&lsp->ls_vp_lock); 477 return (ENXIO); 478 } 479 while (lsp->ls_vp_ready == B_FALSE) 480 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); 481 mutex_exit(&lsp->ls_vp_lock); 482 483 ashift = lsp->ls_lbshift; 484 485 switch (cmd) { 486 case TG_GETPHYGEOM: { 487 cmlb_geom_t *geomp = arg; 488 489 geomp->g_capacity = 490 (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift; 491 geomp->g_nsect = lsp->ls_dkg.dkg_nsect; 492 geomp->g_nhead = lsp->ls_dkg.dkg_nhead; 493 geomp->g_acyl = lsp->ls_dkg.dkg_acyl; 494 geomp->g_ncyl = lsp->ls_dkg.dkg_ncyl; 495 geomp->g_secsize = (1U << ashift); 496 geomp->g_intrlv = lsp->ls_dkg.dkg_intrlv; 497 geomp->g_rpm = lsp->ls_dkg.dkg_rpm; 498 return (0); 499 } 500 501 case TG_GETCAPACITY: 502 *(diskaddr_t *)arg = 503 (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift; 504 return (0); 505 506 case TG_GETBLOCKSIZE: 507 *(uint32_t *)arg = (1U << ashift); 508 return (0); 509 510 case TG_GETATTR: { 511 tg_attribute_t *tgattr = arg; 512 513 tgattr->media_is_writable = !lsp->ls_readonly; 514 tgattr->media_is_solid_state = B_FALSE; 515 tgattr->media_is_rotational = B_FALSE; 516 return (0); 517 } 518 519 default: 520 return (EINVAL); 521 } 522 } 523 524 static void 525 lofi_destroy(struct lofi_state *lsp, cred_t *credp) 526 { 527 int id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 528 int i; 529 530 ASSERT(MUTEX_HELD(&lofi_lock)); 531 532 /* 533 * Before we can start to release the other resources, 534 * make sure we have all tasks completed and taskq removed. 535 */ 536 if (lsp->ls_taskq != NULL) { 537 taskq_destroy(lsp->ls_taskq); 538 lsp->ls_taskq = NULL; 539 } 540 541 list_remove(&lofi_list, lsp); 542 543 lofi_free_crypto(lsp); 544 545 /* 546 * Free pre-allocated compressed buffers 547 */ 548 if (lsp->ls_comp_bufs != NULL) { 549 for (i = 0; i < lofi_taskq_nthreads; i++) { 550 if (lsp->ls_comp_bufs[i].bufsize > 0) 551 kmem_free(lsp->ls_comp_bufs[i].buf, 552 lsp->ls_comp_bufs[i].bufsize); 553 } 554 kmem_free(lsp->ls_comp_bufs, 555 sizeof (struct compbuf) * lofi_taskq_nthreads); 556 } 557 558 if (lsp->ls_vp != NULL) { 559 (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_FREE, credp, NULL); 560 (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 561 1, 0, credp, NULL); 562 VN_RELE(lsp->ls_vp); 563 } 564 if (lsp->ls_stacked_vp != lsp->ls_vp) 565 VN_RELE(lsp->ls_stacked_vp); 566 lsp->ls_vp = lsp->ls_stacked_vp = NULL; 567 568 if (lsp->ls_kstat != NULL) { 569 kstat_delete(lsp->ls_kstat); 570 lsp->ls_kstat = NULL; 571 } 572 573 /* 574 * Free cached decompressed segment data 575 */ 576 lofi_free_comp_cache(lsp); 577 list_destroy(&lsp->ls_comp_cache); 578 579 if (lsp->ls_uncomp_seg_sz > 0) { 580 kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz); 581 lsp->ls_uncomp_seg_sz = 0; 582 } 583 584 rctl_decr_lofi(lsp->ls_zone.zref_zone, 1); 585 zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI); 586 587 mutex_destroy(&lsp->ls_comp_cache_lock); 588 mutex_destroy(&lsp->ls_comp_bufs_lock); 589 mutex_destroy(&lsp->ls_kstat_lock); 590 mutex_destroy(&lsp->ls_vp_lock); 591 cv_destroy(&lsp->ls_vp_cv); 592 lsp->ls_vp_ready = B_FALSE; 593 lsp->ls_vp_closereq = B_FALSE; 594 595 ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp); 596 (void) ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE); 597 id_free(lofi_id, id); 598 } 599 600 static void 601 lofi_free_dev(struct lofi_state *lsp) 602 { 603 ASSERT(MUTEX_HELD(&lofi_lock)); 604 605 if (lsp->ls_cmlbhandle != NULL) { 606 cmlb_invalidate(lsp->ls_cmlbhandle, 0); 607 cmlb_detach(lsp->ls_cmlbhandle, 0); 608 cmlb_free_handle(&lsp->ls_cmlbhandle); 609 lsp->ls_cmlbhandle = NULL; 610 } 611 (void) ddi_prop_remove_all(lsp->ls_dip); 612 ddi_remove_minor_node(lsp->ls_dip, NULL); 613 } 614 615 /*ARGSUSED*/ 616 static void 617 lofi_zone_shutdown(zoneid_t zoneid, void *arg) 618 { 619 struct lofi_state *lsp; 620 struct lofi_state *next; 621 622 mutex_enter(&lofi_lock); 623 624 for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) { 625 626 /* lofi_destroy() frees lsp */ 627 next = list_next(&lofi_list, lsp); 628 629 if (lsp->ls_zone.zref_zone->zone_id != zoneid) 630 continue; 631 632 /* 633 * No in-zone processes are running, but something has this 634 * open. It's either a global zone process, or a lofi 635 * mount. In either case we set ls_cleanup so the last 636 * user destroys the device. 637 */ 638 if (is_opened(lsp)) { 639 lofi_set_cleanup(lsp); 640 } else { 641 lofi_free_dev(lsp); 642 lofi_destroy(lsp, kcred); 643 } 644 } 645 646 mutex_exit(&lofi_lock); 647 } 648 649 /*ARGSUSED*/ 650 static int 651 lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) 652 { 653 int id; 654 minor_t part; 655 uint64_t mask; 656 diskaddr_t nblks; 657 diskaddr_t lba; 658 boolean_t ndelay; 659 660 struct lofi_state *lsp; 661 662 if (otyp >= OTYPCNT) 663 return (EINVAL); 664 665 ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE; 666 667 /* 668 * lofiadm -a /dev/lofi/1 gets us here. 669 */ 670 if (mutex_owner(&lofi_lock) == curthread) 671 return (EINVAL); 672 673 mutex_enter(&lofi_lock); 674 675 id = LOFI_MINOR2ID(getminor(*devp)); 676 part = LOFI_PART(getminor(*devp)); 677 mask = (1U << part); 678 679 /* master control device */ 680 if (id == 0) { 681 mutex_exit(&lofi_lock); 682 return (0); 683 } 684 685 /* otherwise, the mapping should already exist */ 686 lsp = ddi_get_soft_state(lofi_statep, id); 687 if (lsp == NULL) { 688 mutex_exit(&lofi_lock); 689 return (EINVAL); 690 } 691 692 if (lsp->ls_cleanup == B_TRUE) { 693 mutex_exit(&lofi_lock); 694 return (ENXIO); 695 } 696 697 if (lsp->ls_vp == NULL) { 698 mutex_exit(&lofi_lock); 699 return (ENXIO); 700 } 701 702 if (lsp->ls_readonly && (flag & FWRITE)) { 703 mutex_exit(&lofi_lock); 704 return (EROFS); 705 } 706 707 if ((lsp->ls_open_excl) & (mask)) { 708 mutex_exit(&lofi_lock); 709 return (EBUSY); 710 } 711 712 if (flag & FEXCL) { 713 if (lsp->ls_open_lyr[part]) { 714 mutex_exit(&lofi_lock); 715 return (EBUSY); 716 } 717 for (int i = 0; i < OTYP_LYR; i++) { 718 if (lsp->ls_open_reg[i] & mask) { 719 mutex_exit(&lofi_lock); 720 return (EBUSY); 721 } 722 } 723 } 724 725 if (lsp->ls_cmlbhandle != NULL) { 726 if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) { 727 /* 728 * non-blocking opens are allowed to succeed to 729 * support format and fdisk to create partitioning. 730 */ 731 if (!ndelay) { 732 mutex_exit(&lofi_lock); 733 return (ENXIO); 734 } 735 } else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba, 736 NULL, NULL, 0) == 0) { 737 if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) { 738 mutex_exit(&lofi_lock); 739 return (ENXIO); 740 } 741 } else if (!ndelay) { 742 mutex_exit(&lofi_lock); 743 return (ENXIO); 744 } 745 } 746 747 if (otyp == OTYP_LYR) { 748 lsp->ls_open_lyr[part]++; 749 } else { 750 lsp->ls_open_reg[otyp] |= mask; 751 } 752 if (flag & FEXCL) { 753 lsp->ls_open_excl |= mask; 754 } 755 756 mutex_exit(&lofi_lock); 757 return (0); 758 } 759 760 /*ARGSUSED*/ 761 static int 762 lofi_close(dev_t dev, int flag, int otyp, struct cred *credp) 763 { 764 minor_t part; 765 int id; 766 uint64_t mask; 767 struct lofi_state *lsp; 768 769 id = LOFI_MINOR2ID(getminor(dev)); 770 part = LOFI_PART(getminor(dev)); 771 mask = (1U << part); 772 773 mutex_enter(&lofi_lock); 774 lsp = ddi_get_soft_state(lofi_statep, id); 775 if (lsp == NULL) { 776 mutex_exit(&lofi_lock); 777 return (EINVAL); 778 } 779 780 if (id == 0) { 781 mutex_exit(&lofi_lock); 782 return (0); 783 } 784 785 if (lsp->ls_open_excl & mask) 786 lsp->ls_open_excl &= ~mask; 787 788 if (otyp == OTYP_LYR) { 789 lsp->ls_open_lyr[part]--; 790 } else { 791 lsp->ls_open_reg[otyp] &= ~mask; 792 } 793 794 /* 795 * If we forcibly closed the underlying device (li_force), or 796 * asked for cleanup (li_cleanup), finish up if we're the last 797 * out of the door. 798 */ 799 if (!is_opened(lsp) && 800 (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL)) { 801 lofi_free_dev(lsp); 802 lofi_destroy(lsp, credp); 803 } 804 805 mutex_exit(&lofi_lock); 806 return (0); 807 } 808 809 /* 810 * Sets the mechanism's initialization vector (IV) if one is needed. 811 * The IV is computed from the data block number. lsp->ls_mech is 812 * altered so that: 813 * lsp->ls_mech.cm_param_len is set to the IV len. 814 * lsp->ls_mech.cm_param is set to the IV. 815 */ 816 static int 817 lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno) 818 { 819 int ret; 820 crypto_data_t cdata; 821 char *iv; 822 size_t iv_len; 823 size_t min; 824 void *data; 825 size_t datasz; 826 827 ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock)); 828 829 if (lsp == NULL) 830 return (CRYPTO_DEVICE_ERROR); 831 832 /* lsp->ls_mech.cm_param{_len} has already been set for static iv */ 833 if (lsp->ls_iv_type == IVM_NONE) { 834 return (CRYPTO_SUCCESS); 835 } 836 837 /* 838 * if kmem already alloced from previous call and it's the same size 839 * we need now, just recycle it; allocate new kmem only if we have to 840 */ 841 if (lsp->ls_mech.cm_param == NULL || 842 lsp->ls_mech.cm_param_len != lsp->ls_iv_len) { 843 iv_len = lsp->ls_iv_len; 844 iv = kmem_zalloc(iv_len, KM_SLEEP); 845 } else { 846 iv_len = lsp->ls_mech.cm_param_len; 847 iv = lsp->ls_mech.cm_param; 848 bzero(iv, iv_len); 849 } 850 851 switch (lsp->ls_iv_type) { 852 case IVM_ENC_BLKNO: 853 /* iv is not static, lblkno changes each time */ 854 data = &lblkno; 855 datasz = sizeof (lblkno); 856 break; 857 default: 858 data = 0; 859 datasz = 0; 860 break; 861 } 862 863 /* 864 * write blkno into the iv buffer padded on the left in case 865 * blkno ever grows bigger than its current longlong_t size 866 * or a variation other than blkno is used for the iv data 867 */ 868 min = MIN(datasz, iv_len); 869 bcopy(data, iv + (iv_len - min), min); 870 871 /* encrypt the data in-place to get the IV */ 872 SETUP_C_DATA(cdata, iv, iv_len); 873 874 ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key, 875 NULL, NULL, NULL); 876 if (ret != CRYPTO_SUCCESS) { 877 cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)", 878 lblkno, ret); 879 if (lsp->ls_mech.cm_param != iv) 880 kmem_free(iv, iv_len); 881 882 return (ret); 883 } 884 885 /* clean up the iv from the last computation */ 886 if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv) 887 kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len); 888 889 lsp->ls_mech.cm_param_len = iv_len; 890 lsp->ls_mech.cm_param = iv; 891 892 return (CRYPTO_SUCCESS); 893 } 894 895 /* 896 * Performs encryption and decryption of a chunk of data of size "len", 897 * one DEV_BSIZE block at a time. "len" is assumed to be a multiple of 898 * DEV_BSIZE. 899 */ 900 static int 901 lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext, 902 caddr_t ciphertext, size_t len, boolean_t op_encrypt) 903 { 904 crypto_data_t cdata; 905 crypto_data_t wdata; 906 int ret; 907 longlong_t lblkno = bp->b_lblkno; 908 909 mutex_enter(&lsp->ls_crypto_lock); 910 911 /* 912 * though we could encrypt/decrypt entire "len" chunk of data, we need 913 * to break it into DEV_BSIZE pieces to capture blkno incrementing 914 */ 915 SETUP_C_DATA(cdata, plaintext, len); 916 cdata.cd_length = DEV_BSIZE; 917 if (ciphertext != NULL) { /* not in-place crypto */ 918 SETUP_C_DATA(wdata, ciphertext, len); 919 wdata.cd_length = DEV_BSIZE; 920 } 921 922 do { 923 ret = lofi_blk_mech(lsp, lblkno); 924 if (ret != CRYPTO_SUCCESS) 925 continue; 926 927 if (op_encrypt) { 928 ret = crypto_encrypt(&lsp->ls_mech, &cdata, 929 &lsp->ls_key, NULL, 930 ((ciphertext != NULL) ? &wdata : NULL), NULL); 931 } else { 932 ret = crypto_decrypt(&lsp->ls_mech, &cdata, 933 &lsp->ls_key, NULL, 934 ((ciphertext != NULL) ? &wdata : NULL), NULL); 935 } 936 937 cdata.cd_offset += DEV_BSIZE; 938 if (ciphertext != NULL) 939 wdata.cd_offset += DEV_BSIZE; 940 lblkno++; 941 } while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len); 942 943 mutex_exit(&lsp->ls_crypto_lock); 944 945 if (ret != CRYPTO_SUCCESS) { 946 cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)", 947 op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()", 948 lblkno, ret); 949 } 950 951 return (ret); 952 } 953 954 #define RDWR_RAW 1 955 #define RDWR_BCOPY 2 956 957 static int 958 lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp, 959 struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn) 960 { 961 ssize_t resid; 962 int isread; 963 int error; 964 965 /* 966 * Handles reads/writes for both plain and encrypted lofi 967 * Note: offset is already shifted by lsp->ls_crypto_offset 968 * when it gets here. 969 */ 970 971 isread = bp->b_flags & B_READ; 972 if (isread) { 973 if (method == RDWR_BCOPY) { 974 /* DO NOT update bp->b_resid for bcopy */ 975 bcopy(bcopy_locn, bufaddr, len); 976 error = 0; 977 } else { /* RDWR_RAW */ 978 error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len, 979 offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, 980 &resid); 981 bp->b_resid = resid; 982 } 983 if (lsp->ls_crypto_enabled && error == 0) { 984 if (lofi_crypto(lsp, bp, bufaddr, NULL, len, 985 B_FALSE) != CRYPTO_SUCCESS) { 986 /* 987 * XXX: original code didn't set residual 988 * back to len because no error was expected 989 * from bcopy() if encryption is not enabled 990 */ 991 if (method != RDWR_BCOPY) 992 bp->b_resid = len; 993 error = EIO; 994 } 995 } 996 return (error); 997 } else { 998 void *iobuf = bufaddr; 999 1000 if (lsp->ls_crypto_enabled) { 1001 /* don't do in-place crypto to keep bufaddr intact */ 1002 iobuf = kmem_alloc(len, KM_SLEEP); 1003 if (lofi_crypto(lsp, bp, bufaddr, iobuf, len, 1004 B_TRUE) != CRYPTO_SUCCESS) { 1005 kmem_free(iobuf, len); 1006 if (method != RDWR_BCOPY) 1007 bp->b_resid = len; 1008 return (EIO); 1009 } 1010 } 1011 if (method == RDWR_BCOPY) { 1012 /* DO NOT update bp->b_resid for bcopy */ 1013 bcopy(iobuf, bcopy_locn, len); 1014 error = 0; 1015 } else { /* RDWR_RAW */ 1016 error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len, 1017 offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, 1018 &resid); 1019 bp->b_resid = resid; 1020 } 1021 if (lsp->ls_crypto_enabled) { 1022 kmem_free(iobuf, len); 1023 } 1024 return (error); 1025 } 1026 } 1027 1028 static int 1029 lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp, 1030 struct lofi_state *lsp) 1031 { 1032 int error; 1033 offset_t alignedoffset, mapoffset; 1034 size_t xfersize; 1035 int isread; 1036 int smflags; 1037 caddr_t mapaddr; 1038 size_t len; 1039 enum seg_rw srw; 1040 int save_error; 1041 1042 /* 1043 * Note: offset is already shifted by lsp->ls_crypto_offset 1044 * when it gets here. 1045 */ 1046 if (lsp->ls_crypto_enabled) 1047 ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size); 1048 1049 /* 1050 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on 1051 * an 8K boundary, but the buf transfer address may not be 1052 * aligned on more than a 512-byte boundary (we don't enforce 1053 * that even though we could). This matters since the initial 1054 * part of the transfer may not start at offset 0 within the 1055 * segmap'd chunk. So we have to compensate for that with 1056 * 'mapoffset'. Subsequent chunks always start off at the 1057 * beginning, and the last is capped by b_resid 1058 * 1059 * Visually, where "|" represents page map boundaries: 1060 * alignedoffset (mapaddr begins at this segmap boundary) 1061 * | offset (from beginning of file) 1062 * | | len 1063 * v v v 1064 * ===|====X========|====...======|========X====|==== 1065 * /-------------...---------------/ 1066 * ^ bp->b_bcount/bp->b_resid at start 1067 * /----/--------/----...------/--------/ 1068 * ^ ^ ^ ^ ^ 1069 * | | | | nth xfersize (<= MAXBSIZE) 1070 * | | 2nd thru n-1st xfersize (= MAXBSIZE) 1071 * | 1st xfersize (<= MAXBSIZE) 1072 * mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter) 1073 * 1074 * Notes: "alignedoffset" is "offset" rounded down to nearest 1075 * MAXBSIZE boundary. "len" is next page boundary of size 1076 * PAGESIZE after "alignedoffset". 1077 */ 1078 mapoffset = offset & MAXBOFFSET; 1079 alignedoffset = offset - mapoffset; 1080 bp->b_resid = bp->b_bcount; 1081 isread = bp->b_flags & B_READ; 1082 srw = isread ? S_READ : S_WRITE; 1083 do { 1084 xfersize = MIN(lsp->ls_vp_comp_size - offset, 1085 MIN(MAXBSIZE - mapoffset, bp->b_resid)); 1086 len = roundup(mapoffset + xfersize, PAGESIZE); 1087 mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp, 1088 alignedoffset, MAXBSIZE, 1, srw); 1089 /* 1090 * Now fault in the pages. This lets us check 1091 * for errors before we reference mapaddr and 1092 * try to resolve the fault in bcopy (which would 1093 * panic instead). And this can easily happen, 1094 * particularly if you've lofi'd a file over NFS 1095 * and someone deletes the file on the server. 1096 */ 1097 error = segmap_fault(kas.a_hat, segkmap, mapaddr, 1098 len, F_SOFTLOCK, srw); 1099 if (error) { 1100 (void) segmap_release(segkmap, mapaddr, 0); 1101 if (FC_CODE(error) == FC_OBJERR) 1102 error = FC_ERRNO(error); 1103 else 1104 error = EIO; 1105 break; 1106 } 1107 /* error may be non-zero for encrypted lofi */ 1108 error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize, 1109 RDWR_BCOPY, mapaddr + mapoffset); 1110 if (error == 0) { 1111 bp->b_resid -= xfersize; 1112 bufaddr += xfersize; 1113 offset += xfersize; 1114 } 1115 smflags = 0; 1116 if (isread) { 1117 smflags |= SM_FREE; 1118 /* 1119 * If we're reading an entire page starting 1120 * at a page boundary, there's a good chance 1121 * we won't need it again. Put it on the 1122 * head of the freelist. 1123 */ 1124 if (mapoffset == 0 && xfersize == MAXBSIZE) 1125 smflags |= SM_DONTNEED; 1126 } else { 1127 /* 1128 * Write back good pages, it is okay to 1129 * always release asynchronous here as we'll 1130 * follow with VOP_FSYNC for B_SYNC buffers. 1131 */ 1132 if (error == 0) 1133 smflags |= SM_WRITE | SM_ASYNC; 1134 } 1135 (void) segmap_fault(kas.a_hat, segkmap, mapaddr, 1136 len, F_SOFTUNLOCK, srw); 1137 save_error = segmap_release(segkmap, mapaddr, smflags); 1138 if (error == 0) 1139 error = save_error; 1140 /* only the first map may start partial */ 1141 mapoffset = 0; 1142 alignedoffset += MAXBSIZE; 1143 } while ((error == 0) && (bp->b_resid > 0) && 1144 (offset < lsp->ls_vp_comp_size)); 1145 1146 return (error); 1147 } 1148 1149 /* 1150 * Check if segment seg_index is present in the decompressed segment 1151 * data cache. 1152 * 1153 * Returns a pointer to the decompressed segment data cache entry if 1154 * found, and NULL when decompressed data for this segment is not yet 1155 * cached. 1156 */ 1157 static struct lofi_comp_cache * 1158 lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index) 1159 { 1160 struct lofi_comp_cache *lc; 1161 1162 ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock)); 1163 1164 for (lc = list_head(&lsp->ls_comp_cache); lc != NULL; 1165 lc = list_next(&lsp->ls_comp_cache, lc)) { 1166 if (lc->lc_index == seg_index) { 1167 /* 1168 * Decompressed segment data was found in the 1169 * cache. 1170 * 1171 * The cache uses an LRU replacement strategy; 1172 * move the entry to head of list. 1173 */ 1174 list_remove(&lsp->ls_comp_cache, lc); 1175 list_insert_head(&lsp->ls_comp_cache, lc); 1176 return (lc); 1177 } 1178 } 1179 return (NULL); 1180 } 1181 1182 /* 1183 * Add the data for a decompressed segment at segment index 1184 * seg_index to the cache of the decompressed segments. 1185 * 1186 * Returns a pointer to the cache element structure in case 1187 * the data was added to the cache; returns NULL when the data 1188 * wasn't cached. 1189 */ 1190 static struct lofi_comp_cache * 1191 lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index, 1192 uchar_t *data) 1193 { 1194 struct lofi_comp_cache *lc; 1195 1196 ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock)); 1197 1198 while (lsp->ls_comp_cache_count > lofi_max_comp_cache) { 1199 lc = list_remove_tail(&lsp->ls_comp_cache); 1200 ASSERT(lc != NULL); 1201 kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz); 1202 kmem_free(lc, sizeof (struct lofi_comp_cache)); 1203 lsp->ls_comp_cache_count--; 1204 } 1205 1206 /* 1207 * Do not cache when disabled by tunable variable 1208 */ 1209 if (lofi_max_comp_cache == 0) 1210 return (NULL); 1211 1212 /* 1213 * When the cache has not yet reached the maximum allowed 1214 * number of segments, allocate a new cache element. 1215 * Otherwise the cache is full; reuse the last list element 1216 * (LRU) for caching the decompressed segment data. 1217 * 1218 * The cache element for the new decompressed segment data is 1219 * added to the head of the list. 1220 */ 1221 if (lsp->ls_comp_cache_count < lofi_max_comp_cache) { 1222 lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP); 1223 lc->lc_data = NULL; 1224 list_insert_head(&lsp->ls_comp_cache, lc); 1225 lsp->ls_comp_cache_count++; 1226 } else { 1227 lc = list_remove_tail(&lsp->ls_comp_cache); 1228 if (lc == NULL) 1229 return (NULL); 1230 list_insert_head(&lsp->ls_comp_cache, lc); 1231 } 1232 1233 /* 1234 * Free old uncompressed segment data when reusing a cache 1235 * entry. 1236 */ 1237 if (lc->lc_data != NULL) 1238 kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz); 1239 1240 lc->lc_data = data; 1241 lc->lc_index = seg_index; 1242 return (lc); 1243 } 1244 1245 1246 /*ARGSUSED*/ 1247 static int 1248 gzip_decompress(void *src, size_t srclen, void *dst, 1249 size_t *dstlen, int level) 1250 { 1251 ASSERT(*dstlen >= srclen); 1252 1253 if (z_uncompress(dst, dstlen, src, srclen) != Z_OK) 1254 return (-1); 1255 return (0); 1256 } 1257 1258 #define LZMA_HEADER_SIZE (LZMA_PROPS_SIZE + 8) 1259 /*ARGSUSED*/ 1260 static int 1261 lzma_decompress(void *src, size_t srclen, void *dst, 1262 size_t *dstlen, int level) 1263 { 1264 size_t insizepure; 1265 void *actual_src; 1266 ELzmaStatus status; 1267 1268 insizepure = srclen - LZMA_HEADER_SIZE; 1269 actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE); 1270 1271 if (LzmaDecode((Byte *)dst, (size_t *)dstlen, 1272 (const Byte *)actual_src, &insizepure, 1273 (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status, 1274 &g_Alloc) != SZ_OK) { 1275 return (-1); 1276 } 1277 return (0); 1278 } 1279 1280 /* 1281 * This is basically what strategy used to be before we found we 1282 * needed task queues. 1283 */ 1284 static void 1285 lofi_strategy_task(void *arg) 1286 { 1287 struct buf *bp = (struct buf *)arg; 1288 int error; 1289 int syncflag = 0; 1290 struct lofi_state *lsp; 1291 offset_t offset; 1292 caddr_t bufaddr; 1293 size_t len; 1294 size_t xfersize; 1295 boolean_t bufinited = B_FALSE; 1296 1297 lsp = ddi_get_soft_state(lofi_statep, 1298 LOFI_MINOR2ID(getminor(bp->b_edev))); 1299 1300 if (lsp == NULL) { 1301 error = ENXIO; 1302 goto errout; 1303 } 1304 if (lsp->ls_kstat) { 1305 mutex_enter(lsp->ls_kstat->ks_lock); 1306 kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat)); 1307 mutex_exit(lsp->ls_kstat->ks_lock); 1308 } 1309 1310 mutex_enter(&lsp->ls_vp_lock); 1311 lsp->ls_vp_iocount++; 1312 mutex_exit(&lsp->ls_vp_lock); 1313 1314 bp_mapin(bp); 1315 bufaddr = bp->b_un.b_addr; 1316 offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private) 1317 << lsp->ls_lbshift; /* offset within file */ 1318 if (lsp->ls_crypto_enabled) { 1319 /* encrypted data really begins after crypto header */ 1320 offset += lsp->ls_crypto_offset; 1321 } 1322 len = bp->b_bcount; 1323 bufinited = B_TRUE; 1324 1325 if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { 1326 error = EIO; 1327 goto errout; 1328 } 1329 1330 /* 1331 * If we're writing and the buffer was not B_ASYNC 1332 * we'll follow up with a VOP_FSYNC() to force any 1333 * asynchronous I/O to stable storage. 1334 */ 1335 if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC)) 1336 syncflag = FSYNC; 1337 1338 /* 1339 * We used to always use vn_rdwr here, but we cannot do that because 1340 * we might decide to read or write from the the underlying 1341 * file during this call, which would be a deadlock because 1342 * we have the rw_lock. So instead we page, unless it's not 1343 * mapable or it's a character device or it's an encrypted lofi. 1344 */ 1345 if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) || 1346 lsp->ls_crypto_enabled) { 1347 error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW, 1348 NULL); 1349 } else if (lsp->ls_uncomp_seg_sz == 0) { 1350 error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp); 1351 } else { 1352 uchar_t *compressed_seg = NULL, *cmpbuf; 1353 uchar_t *uncompressed_seg = NULL; 1354 lofi_compress_info_t *li; 1355 size_t oblkcount; 1356 ulong_t seglen; 1357 uint64_t sblkno, eblkno, cmpbytes; 1358 uint64_t uncompressed_seg_index; 1359 struct lofi_comp_cache *lc; 1360 offset_t sblkoff, eblkoff; 1361 u_offset_t salign, ealign; 1362 u_offset_t sdiff; 1363 uint32_t comp_data_sz; 1364 uint64_t i; 1365 int j; 1366 1367 /* 1368 * From here on we're dealing primarily with compressed files 1369 */ 1370 ASSERT(!lsp->ls_crypto_enabled); 1371 1372 /* 1373 * Compressed files can only be read from and 1374 * not written to 1375 */ 1376 if (!(bp->b_flags & B_READ)) { 1377 bp->b_resid = bp->b_bcount; 1378 error = EROFS; 1379 goto done; 1380 } 1381 1382 ASSERT(lsp->ls_comp_algorithm_index >= 0); 1383 li = &lofi_compress_table[lsp->ls_comp_algorithm_index]; 1384 /* 1385 * Compute starting and ending compressed segment numbers 1386 * We use only bitwise operations avoiding division and 1387 * modulus because we enforce the compression segment size 1388 * to a power of 2 1389 */ 1390 sblkno = offset >> lsp->ls_comp_seg_shift; 1391 sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1); 1392 eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift; 1393 eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1); 1394 1395 /* 1396 * Check the decompressed segment cache. 1397 * 1398 * The cache is used only when the requested data 1399 * is within a segment. Requests that cross 1400 * segment boundaries bypass the cache. 1401 */ 1402 if (sblkno == eblkno || 1403 (sblkno + 1 == eblkno && eblkoff == 0)) { 1404 /* 1405 * Request doesn't cross a segment boundary, 1406 * now check the cache. 1407 */ 1408 mutex_enter(&lsp->ls_comp_cache_lock); 1409 lc = lofi_find_comp_data(lsp, sblkno); 1410 if (lc != NULL) { 1411 /* 1412 * We've found the decompressed segment 1413 * data in the cache; reuse it. 1414 */ 1415 bcopy(lc->lc_data + sblkoff, bufaddr, 1416 bp->b_bcount); 1417 mutex_exit(&lsp->ls_comp_cache_lock); 1418 bp->b_resid = 0; 1419 error = 0; 1420 goto done; 1421 } 1422 mutex_exit(&lsp->ls_comp_cache_lock); 1423 } 1424 1425 /* 1426 * Align start offset to block boundary for segmap 1427 */ 1428 salign = lsp->ls_comp_seg_index[sblkno]; 1429 sdiff = salign & (DEV_BSIZE - 1); 1430 salign -= sdiff; 1431 if (eblkno >= (lsp->ls_comp_index_sz - 1)) { 1432 /* 1433 * We're dealing with the last segment of 1434 * the compressed file -- the size of this 1435 * segment *may not* be the same as the 1436 * segment size for the file 1437 */ 1438 eblkoff = (offset + bp->b_bcount) & 1439 (lsp->ls_uncomp_last_seg_sz - 1); 1440 ealign = lsp->ls_vp_comp_size; 1441 } else { 1442 ealign = lsp->ls_comp_seg_index[eblkno + 1]; 1443 } 1444 1445 /* 1446 * Preserve original request paramaters 1447 */ 1448 oblkcount = bp->b_bcount; 1449 1450 /* 1451 * Assign the calculated parameters 1452 */ 1453 comp_data_sz = ealign - salign; 1454 bp->b_bcount = comp_data_sz; 1455 1456 /* 1457 * Buffers to hold compressed segments are pre-allocated 1458 * on a per-thread basis. Find a pre-allocated buffer 1459 * that is not currently in use and mark it for use. 1460 */ 1461 mutex_enter(&lsp->ls_comp_bufs_lock); 1462 for (j = 0; j < lofi_taskq_nthreads; j++) { 1463 if (lsp->ls_comp_bufs[j].inuse == 0) { 1464 lsp->ls_comp_bufs[j].inuse = 1; 1465 break; 1466 } 1467 } 1468 1469 mutex_exit(&lsp->ls_comp_bufs_lock); 1470 ASSERT(j < lofi_taskq_nthreads); 1471 1472 /* 1473 * If the pre-allocated buffer size does not match 1474 * the size of the I/O request, re-allocate it with 1475 * the appropriate size 1476 */ 1477 if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) { 1478 if (lsp->ls_comp_bufs[j].bufsize > 0) 1479 kmem_free(lsp->ls_comp_bufs[j].buf, 1480 lsp->ls_comp_bufs[j].bufsize); 1481 lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount, 1482 KM_SLEEP); 1483 lsp->ls_comp_bufs[j].bufsize = bp->b_bcount; 1484 } 1485 compressed_seg = lsp->ls_comp_bufs[j].buf; 1486 1487 /* 1488 * Map in the calculated number of blocks 1489 */ 1490 error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign, 1491 bp, lsp); 1492 1493 bp->b_bcount = oblkcount; 1494 bp->b_resid = oblkcount; 1495 if (error != 0) 1496 goto done; 1497 1498 /* 1499 * decompress compressed blocks start 1500 */ 1501 cmpbuf = compressed_seg + sdiff; 1502 for (i = sblkno; i <= eblkno; i++) { 1503 ASSERT(i < lsp->ls_comp_index_sz - 1); 1504 uchar_t *useg; 1505 1506 /* 1507 * The last segment is special in that it is 1508 * most likely not going to be the same 1509 * (uncompressed) size as the other segments. 1510 */ 1511 if (i == (lsp->ls_comp_index_sz - 2)) { 1512 seglen = lsp->ls_uncomp_last_seg_sz; 1513 } else { 1514 seglen = lsp->ls_uncomp_seg_sz; 1515 } 1516 1517 /* 1518 * Each of the segment index entries contains 1519 * the starting block number for that segment. 1520 * The number of compressed bytes in a segment 1521 * is thus the difference between the starting 1522 * block number of this segment and the starting 1523 * block number of the next segment. 1524 */ 1525 cmpbytes = lsp->ls_comp_seg_index[i + 1] - 1526 lsp->ls_comp_seg_index[i]; 1527 1528 /* 1529 * The first byte in a compressed segment is a flag 1530 * that indicates whether this segment is compressed 1531 * at all. 1532 * 1533 * The variable 'useg' is used (instead of 1534 * uncompressed_seg) in this loop to keep a 1535 * reference to the uncompressed segment. 1536 * 1537 * N.B. If 'useg' is replaced with uncompressed_seg, 1538 * it leads to memory leaks and heap corruption in 1539 * corner cases where compressed segments lie 1540 * adjacent to uncompressed segments. 1541 */ 1542 if (*cmpbuf == UNCOMPRESSED) { 1543 useg = cmpbuf + SEGHDR; 1544 } else { 1545 if (uncompressed_seg == NULL) 1546 uncompressed_seg = 1547 kmem_alloc(lsp->ls_uncomp_seg_sz, 1548 KM_SLEEP); 1549 useg = uncompressed_seg; 1550 uncompressed_seg_index = i; 1551 1552 if (li->l_decompress((cmpbuf + SEGHDR), 1553 (cmpbytes - SEGHDR), uncompressed_seg, 1554 &seglen, li->l_level) != 0) { 1555 error = EIO; 1556 goto done; 1557 } 1558 } 1559 1560 /* 1561 * Determine how much uncompressed data we 1562 * have to copy and copy it 1563 */ 1564 xfersize = lsp->ls_uncomp_seg_sz - sblkoff; 1565 if (i == eblkno) 1566 xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff); 1567 1568 bcopy((useg + sblkoff), bufaddr, xfersize); 1569 1570 cmpbuf += cmpbytes; 1571 bufaddr += xfersize; 1572 bp->b_resid -= xfersize; 1573 sblkoff = 0; 1574 1575 if (bp->b_resid == 0) 1576 break; 1577 } /* decompress compressed blocks ends */ 1578 1579 /* 1580 * Skip to done if there is no uncompressed data to cache 1581 */ 1582 if (uncompressed_seg == NULL) 1583 goto done; 1584 1585 /* 1586 * Add the data for the last decompressed segment to 1587 * the cache. 1588 * 1589 * In case the uncompressed segment data was added to (and 1590 * is referenced by) the cache, make sure we don't free it 1591 * here. 1592 */ 1593 mutex_enter(&lsp->ls_comp_cache_lock); 1594 if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index, 1595 uncompressed_seg)) != NULL) { 1596 uncompressed_seg = NULL; 1597 } 1598 mutex_exit(&lsp->ls_comp_cache_lock); 1599 1600 done: 1601 if (compressed_seg != NULL) { 1602 mutex_enter(&lsp->ls_comp_bufs_lock); 1603 lsp->ls_comp_bufs[j].inuse = 0; 1604 mutex_exit(&lsp->ls_comp_bufs_lock); 1605 } 1606 if (uncompressed_seg != NULL) 1607 kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz); 1608 } /* end of handling compressed files */ 1609 1610 if ((error == 0) && (syncflag != 0)) 1611 error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL); 1612 1613 errout: 1614 if (bufinited && lsp->ls_kstat) { 1615 size_t n_done = bp->b_bcount - bp->b_resid; 1616 kstat_io_t *kioptr; 1617 1618 mutex_enter(lsp->ls_kstat->ks_lock); 1619 kioptr = KSTAT_IO_PTR(lsp->ls_kstat); 1620 if (bp->b_flags & B_READ) { 1621 kioptr->nread += n_done; 1622 kioptr->reads++; 1623 } else { 1624 kioptr->nwritten += n_done; 1625 kioptr->writes++; 1626 } 1627 kstat_runq_exit(kioptr); 1628 mutex_exit(lsp->ls_kstat->ks_lock); 1629 } 1630 1631 mutex_enter(&lsp->ls_vp_lock); 1632 if (--lsp->ls_vp_iocount == 0) 1633 cv_broadcast(&lsp->ls_vp_cv); 1634 mutex_exit(&lsp->ls_vp_lock); 1635 1636 bioerror(bp, error); 1637 biodone(bp); 1638 } 1639 1640 static int 1641 lofi_strategy(struct buf *bp) 1642 { 1643 struct lofi_state *lsp; 1644 offset_t offset; 1645 minor_t part; 1646 diskaddr_t p_lba; 1647 diskaddr_t p_nblks; 1648 int shift; 1649 1650 /* 1651 * We cannot just do I/O here, because the current thread 1652 * _might_ end up back in here because the underlying filesystem 1653 * wants a buffer, which eventually gets into bio_recycle and 1654 * might call into lofi to write out a delayed-write buffer. 1655 * This is bad if the filesystem above lofi is the same as below. 1656 * 1657 * We could come up with a complex strategy using threads to 1658 * do the I/O asynchronously, or we could use task queues. task 1659 * queues were incredibly easy so they win. 1660 */ 1661 1662 lsp = ddi_get_soft_state(lofi_statep, 1663 LOFI_MINOR2ID(getminor(bp->b_edev))); 1664 part = LOFI_PART(getminor(bp->b_edev)); 1665 1666 if (lsp == NULL) { 1667 bioerror(bp, ENXIO); 1668 biodone(bp); 1669 return (0); 1670 } 1671 1672 /* Check if we are closing. */ 1673 mutex_enter(&lsp->ls_vp_lock); 1674 if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { 1675 mutex_exit(&lsp->ls_vp_lock); 1676 bioerror(bp, EIO); 1677 biodone(bp); 1678 return (0); 1679 } 1680 mutex_exit(&lsp->ls_vp_lock); 1681 1682 shift = lsp->ls_lbshift; 1683 p_lba = 0; 1684 p_nblks = lsp->ls_vp_size >> shift; 1685 1686 if (lsp->ls_cmlbhandle != NULL) { 1687 if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba, 1688 NULL, NULL, 0)) { 1689 bioerror(bp, ENXIO); 1690 biodone(bp); 1691 return (0); 1692 } 1693 } 1694 1695 /* start block past partition end? */ 1696 if (bp->b_lblkno > p_nblks) { 1697 bioerror(bp, ENXIO); 1698 biodone(bp); 1699 return (0); 1700 } 1701 1702 offset = (bp->b_lblkno+p_lba) << shift; /* offset within file */ 1703 1704 mutex_enter(&lsp->ls_vp_lock); 1705 if (lsp->ls_crypto_enabled) { 1706 /* encrypted data really begins after crypto header */ 1707 offset += lsp->ls_crypto_offset; 1708 } 1709 1710 /* make sure we will not pass the file or partition size */ 1711 if (offset == lsp->ls_vp_size || 1712 offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) { 1713 /* EOF */ 1714 if ((bp->b_flags & B_READ) != 0) { 1715 bp->b_resid = bp->b_bcount; 1716 bioerror(bp, 0); 1717 } else { 1718 /* writes should fail */ 1719 bioerror(bp, ENXIO); 1720 } 1721 biodone(bp); 1722 mutex_exit(&lsp->ls_vp_lock); 1723 return (0); 1724 } 1725 if ((offset > lsp->ls_vp_size) || 1726 (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) || 1727 ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) { 1728 bioerror(bp, ENXIO); 1729 biodone(bp); 1730 mutex_exit(&lsp->ls_vp_lock); 1731 return (0); 1732 } 1733 1734 mutex_exit(&lsp->ls_vp_lock); 1735 1736 if (lsp->ls_kstat) { 1737 mutex_enter(lsp->ls_kstat->ks_lock); 1738 kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat)); 1739 mutex_exit(lsp->ls_kstat->ks_lock); 1740 } 1741 bp->b_private = (void *)(uintptr_t)p_lba; /* partition start */ 1742 (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP); 1743 return (0); 1744 } 1745 1746 static int 1747 lofi_read(dev_t dev, struct uio *uio, struct cred *credp) 1748 { 1749 _NOTE(ARGUNUSED(credp)); 1750 1751 if (getminor(dev) == 0) 1752 return (EINVAL); 1753 UIO_CHECK(uio); 1754 return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio)); 1755 } 1756 1757 static int 1758 lofi_write(dev_t dev, struct uio *uio, struct cred *credp) 1759 { 1760 _NOTE(ARGUNUSED(credp)); 1761 1762 if (getminor(dev) == 0) 1763 return (EINVAL); 1764 UIO_CHECK(uio); 1765 return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio)); 1766 } 1767 1768 static int 1769 lofi_urw(struct lofi_state *lsp, uint16_t fmode, diskaddr_t off, size_t size, 1770 intptr_t arg, int flag, cred_t *credp) 1771 { 1772 struct uio uio; 1773 iovec_t iov; 1774 1775 /* 1776 * 1024 * 1024 apes cmlb_tg_max_efi_xfer as a reasonable max. 1777 */ 1778 if (size == 0 || size > 1024 * 1024 || 1779 (size % (1 << lsp->ls_lbshift)) != 0) 1780 return (EINVAL); 1781 1782 iov.iov_base = (void *)arg; 1783 iov.iov_len = size; 1784 uio.uio_iov = &iov; 1785 uio.uio_iovcnt = 1; 1786 uio.uio_loffset = off; 1787 uio.uio_segflg = (flag & FKIOCTL) ? UIO_SYSSPACE : UIO_USERSPACE; 1788 uio.uio_llimit = MAXOFFSET_T; 1789 uio.uio_resid = size; 1790 uio.uio_fmode = fmode; 1791 uio.uio_extflg = 0; 1792 1793 return (fmode == FREAD ? 1794 lofi_read(lsp->ls_dev, &uio, credp) : 1795 lofi_write(lsp->ls_dev, &uio, credp)); 1796 } 1797 1798 /*ARGSUSED2*/ 1799 static int 1800 lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp) 1801 { 1802 if (getminor(dev) == 0) 1803 return (EINVAL); 1804 UIO_CHECK(aio->aio_uio); 1805 return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio)); 1806 } 1807 1808 /*ARGSUSED2*/ 1809 static int 1810 lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp) 1811 { 1812 if (getminor(dev) == 0) 1813 return (EINVAL); 1814 UIO_CHECK(aio->aio_uio); 1815 return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio)); 1816 } 1817 1818 /*ARGSUSED*/ 1819 static int 1820 lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 1821 { 1822 struct lofi_state *lsp; 1823 dev_t dev = (dev_t)arg; 1824 int instance; 1825 1826 instance = LOFI_MINOR2ID(getminor(dev)); 1827 switch (infocmd) { 1828 case DDI_INFO_DEVT2DEVINFO: 1829 lsp = ddi_get_soft_state(lofi_statep, instance); 1830 if (lsp == NULL) 1831 return (DDI_FAILURE); 1832 *result = lsp->ls_dip; 1833 return (DDI_SUCCESS); 1834 case DDI_INFO_DEVT2INSTANCE: 1835 *result = (void *) (intptr_t)instance; 1836 return (DDI_SUCCESS); 1837 } 1838 return (DDI_FAILURE); 1839 } 1840 1841 static int 1842 lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled) 1843 { 1844 int error = 0; 1845 int instance = ddi_get_instance(lsp->ls_dip); 1846 1847 if (labeled == B_TRUE) { 1848 cmlb_alloc_handle(&lsp->ls_cmlbhandle); 1849 error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT, 1850 B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN, 1851 CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1); 1852 1853 if (error != DDI_SUCCESS) { 1854 cmlb_free_handle(&lsp->ls_cmlbhandle); 1855 lsp->ls_cmlbhandle = NULL; 1856 error = ENXIO; 1857 } 1858 } else { 1859 /* create minor nodes */ 1860 error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE, 1861 S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0); 1862 if (error == DDI_SUCCESS) { 1863 error = ddi_create_minor_node(lsp->ls_dip, 1864 LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance), 1865 DDI_PSEUDO, 0); 1866 if (error != DDI_SUCCESS) { 1867 ddi_remove_minor_node(lsp->ls_dip, 1868 LOFI_BLOCK_NODE); 1869 error = ENXIO; 1870 } 1871 } else 1872 error = ENXIO; 1873 } 1874 return (error); 1875 } 1876 1877 static int 1878 lofi_zone_bind(struct lofi_state *lsp) 1879 { 1880 int error = 0; 1881 1882 mutex_enter(&curproc->p_lock); 1883 if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) { 1884 mutex_exit(&curproc->p_lock); 1885 return (error); 1886 } 1887 mutex_exit(&curproc->p_lock); 1888 1889 if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME, 1890 (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) { 1891 rctl_decr_lofi(curproc->p_zone, 1); 1892 error = EINVAL; 1893 } else { 1894 zone_init_ref(&lsp->ls_zone); 1895 zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI); 1896 } 1897 return (error); 1898 } 1899 1900 static void 1901 lofi_zone_unbind(struct lofi_state *lsp) 1902 { 1903 (void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME); 1904 rctl_decr_lofi(curproc->p_zone, 1); 1905 zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI); 1906 } 1907 1908 static int 1909 lofi_online_dev(dev_info_t *dip) 1910 { 1911 boolean_t labeled; 1912 int error; 1913 int instance = ddi_get_instance(dip); 1914 struct lofi_state *lsp; 1915 1916 labeled = B_FALSE; 1917 if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled")) 1918 labeled = B_TRUE; 1919 1920 /* lsp alloc+init, soft state is freed in lofi_detach */ 1921 error = ddi_soft_state_zalloc(lofi_statep, instance); 1922 if (error == DDI_FAILURE) { 1923 return (ENOMEM); 1924 } 1925 1926 lsp = ddi_get_soft_state(lofi_statep, instance); 1927 lsp->ls_dip = dip; 1928 1929 if ((error = lofi_zone_bind(lsp)) != 0) 1930 goto err; 1931 1932 cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL); 1933 mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL); 1934 mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL); 1935 mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL); 1936 mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL); 1937 1938 if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) { 1939 lofi_zone_unbind(lsp); 1940 goto lerr; 1941 } 1942 1943 /* driver handles kernel-issued IOCTLs */ 1944 if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, 1945 DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { 1946 error = DDI_FAILURE; 1947 goto merr; 1948 } 1949 1950 lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance, 1951 NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid()); 1952 if (lsp->ls_kstat == NULL) { 1953 (void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, 1954 DDI_KERNEL_IOCTL); 1955 error = ENOMEM; 1956 goto merr; 1957 } 1958 1959 lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock; 1960 kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID); 1961 kstat_install(lsp->ls_kstat); 1962 return (DDI_SUCCESS); 1963 merr: 1964 if (lsp->ls_cmlbhandle != NULL) { 1965 cmlb_detach(lsp->ls_cmlbhandle, 0); 1966 cmlb_free_handle(&lsp->ls_cmlbhandle); 1967 } 1968 ddi_remove_minor_node(dip, NULL); 1969 lofi_zone_unbind(lsp); 1970 lerr: 1971 mutex_destroy(&lsp->ls_comp_cache_lock); 1972 mutex_destroy(&lsp->ls_comp_bufs_lock); 1973 mutex_destroy(&lsp->ls_kstat_lock); 1974 mutex_destroy(&lsp->ls_vp_lock); 1975 cv_destroy(&lsp->ls_vp_cv); 1976 err: 1977 ddi_soft_state_free(lofi_statep, instance); 1978 return (error); 1979 } 1980 1981 static int 1982 lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 1983 { 1984 int rv; 1985 int instance = ddi_get_instance(dip); 1986 struct lofi_state *lsp; 1987 1988 if (cmd != DDI_ATTACH) 1989 return (DDI_FAILURE); 1990 1991 /* 1992 * Instance 0 is control instance, attaching control instance 1993 * will set the lofi up and ready. 1994 */ 1995 if (instance == 0) { 1996 rv = ddi_soft_state_zalloc(lofi_statep, 0); 1997 if (rv == DDI_FAILURE) { 1998 return (DDI_FAILURE); 1999 } 2000 lsp = ddi_get_soft_state(lofi_statep, instance); 2001 rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0, 2002 DDI_PSEUDO, 0); 2003 if (rv == DDI_FAILURE) { 2004 ddi_soft_state_free(lofi_statep, 0); 2005 return (DDI_FAILURE); 2006 } 2007 /* driver handles kernel-issued IOCTLs */ 2008 if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, 2009 DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { 2010 ddi_remove_minor_node(dip, NULL); 2011 ddi_soft_state_free(lofi_statep, 0); 2012 return (DDI_FAILURE); 2013 } 2014 2015 zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL); 2016 2017 lsp->ls_dip = dip; 2018 } else { 2019 if (lofi_online_dev(dip) == DDI_FAILURE) 2020 return (DDI_FAILURE); 2021 } 2022 2023 ddi_report_dev(dip); 2024 return (DDI_SUCCESS); 2025 } 2026 2027 static int 2028 lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2029 { 2030 struct lofi_state *lsp; 2031 int instance = ddi_get_instance(dip); 2032 2033 if (cmd != DDI_DETACH) 2034 return (DDI_FAILURE); 2035 2036 /* 2037 * If the instance is not 0, release state. 2038 * The instance 0 is control device, we can not detach it 2039 * before other instances are detached. 2040 */ 2041 if (instance != 0) { 2042 lsp = ddi_get_soft_state(lofi_statep, instance); 2043 if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) { 2044 ddi_soft_state_free(lofi_statep, instance); 2045 return (DDI_SUCCESS); 2046 } else 2047 return (DDI_FAILURE); 2048 } 2049 mutex_enter(&lofi_lock); 2050 2051 if (!list_is_empty(&lofi_list)) { 2052 mutex_exit(&lofi_lock); 2053 return (DDI_FAILURE); 2054 } 2055 2056 ddi_remove_minor_node(dip, NULL); 2057 ddi_prop_remove_all(dip); 2058 2059 mutex_exit(&lofi_lock); 2060 2061 if (zone_key_delete(lofi_zone_key) != 0) 2062 cmn_err(CE_WARN, "failed to delete zone key"); 2063 2064 ddi_soft_state_free(lofi_statep, 0); 2065 2066 return (DDI_SUCCESS); 2067 } 2068 2069 /* 2070 * With the addition of encryption, we must be careful that encryption key is 2071 * wiped before kernel's data structures are freed so it cannot accidentally 2072 * slip out to userland through uninitialized data elsewhere. 2073 */ 2074 static void 2075 free_lofi_ioctl(struct lofi_ioctl *klip) 2076 { 2077 /* Make sure this encryption key doesn't stick around */ 2078 bzero(klip->li_key, sizeof (klip->li_key)); 2079 kmem_free(klip, sizeof (struct lofi_ioctl)); 2080 } 2081 2082 /* 2083 * These two functions simplify the rest of the ioctls that need to copyin/out 2084 * the lofi_ioctl structure. 2085 */ 2086 int 2087 copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp, 2088 int flag) 2089 { 2090 struct lofi_ioctl *klip; 2091 int error; 2092 2093 klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP); 2094 error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag); 2095 if (error) 2096 goto err; 2097 2098 /* ensure NULL termination */ 2099 klip->li_filename[MAXPATHLEN-1] = '\0'; 2100 klip->li_devpath[MAXPATHLEN-1] = '\0'; 2101 klip->li_algorithm[MAXALGLEN-1] = '\0'; 2102 klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0'; 2103 klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0'; 2104 2105 if (klip->li_id > L_MAXMIN32) { 2106 error = EINVAL; 2107 goto err; 2108 } 2109 2110 return (0); 2111 2112 err: 2113 free_lofi_ioctl(klip); 2114 return (error); 2115 } 2116 2117 int 2118 copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip, 2119 int flag) 2120 { 2121 int error; 2122 2123 /* 2124 * NOTE: Do NOT copy the crypto_key_t "back" to userland. 2125 * This ensures that an attacker can't trivially find the 2126 * key for a mapping just by issuing the ioctl. 2127 * 2128 * It can still be found by poking around in kmem with mdb(1), 2129 * but there is no point in making it easy when the info isn't 2130 * of any use in this direction anyway. 2131 * 2132 * Either way we don't actually have the raw key stored in 2133 * a form that we can get it anyway, since we just used it 2134 * to create a ctx template and didn't keep "the original". 2135 */ 2136 error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag); 2137 if (error) 2138 return (EFAULT); 2139 return (0); 2140 } 2141 2142 static int 2143 lofi_access(struct lofi_state *lsp) 2144 { 2145 ASSERT(MUTEX_HELD(&lofi_lock)); 2146 if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone) 2147 return (0); 2148 return (EPERM); 2149 } 2150 2151 /* 2152 * Find the lofi state for the given filename. We compare by vnode to 2153 * allow the global zone visibility into NGZ lofi nodes. 2154 */ 2155 static int 2156 file_to_lofi_nocheck(char *filename, boolean_t readonly, 2157 struct lofi_state **lspp) 2158 { 2159 struct lofi_state *lsp; 2160 vnode_t *vp = NULL; 2161 int err = 0; 2162 int rdfiles = 0; 2163 2164 ASSERT(MUTEX_HELD(&lofi_lock)); 2165 2166 if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW, 2167 NULLVPP, &vp)) != 0) 2168 goto out; 2169 2170 if (vp->v_type == VREG) { 2171 vnode_t *realvp; 2172 if (VOP_REALVP(vp, &realvp, NULL) == 0) { 2173 VN_HOLD(realvp); 2174 VN_RELE(vp); 2175 vp = realvp; 2176 } 2177 } 2178 2179 for (lsp = list_head(&lofi_list); lsp != NULL; 2180 lsp = list_next(&lofi_list, lsp)) { 2181 if (lsp->ls_vp == vp) { 2182 if (lspp != NULL) 2183 *lspp = lsp; 2184 if (lsp->ls_readonly) { 2185 rdfiles++; 2186 /* Skip if '-r' is specified */ 2187 if (readonly) 2188 continue; 2189 } 2190 goto out; 2191 } 2192 } 2193 2194 err = ENOENT; 2195 2196 /* 2197 * If a filename is given as an argument for lofi_unmap, we shouldn't 2198 * allow unmap if there are multiple read-only lofi devices associated 2199 * with this file. 2200 */ 2201 if (lspp != NULL) { 2202 if (rdfiles == 1) 2203 err = 0; 2204 else if (rdfiles > 1) 2205 err = EBUSY; 2206 } 2207 2208 out: 2209 if (vp != NULL) 2210 VN_RELE(vp); 2211 return (err); 2212 } 2213 2214 /* 2215 * Find the minor for the given filename, checking the zone can access 2216 * it. 2217 */ 2218 static int 2219 file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp) 2220 { 2221 int err = 0; 2222 2223 ASSERT(MUTEX_HELD(&lofi_lock)); 2224 2225 if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0) 2226 return (err); 2227 2228 if ((err = lofi_access(*lspp)) != 0) 2229 return (err); 2230 2231 return (0); 2232 } 2233 2234 /* 2235 * Fakes up a disk geometry based on the size of the file. This is needed 2236 * to support newfs on traditional lofi device, but also will provide 2237 * geometry hint for cmlb. 2238 */ 2239 static void 2240 fake_disk_geometry(struct lofi_state *lsp) 2241 { 2242 u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset; 2243 2244 /* dk_geom - see dkio(7I) */ 2245 /* 2246 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs 2247 * of sectors), but that breaks programs like fdisk which want to 2248 * partition a disk by cylinder. With one cylinder, you can't create 2249 * an fdisk partition and put pcfs on it for testing (hard to pick 2250 * a number between one and one). 2251 * 2252 * The cheezy floppy test is an attempt to not have too few cylinders 2253 * for a small file, or so many on a big file that you waste space 2254 * for backup superblocks or cylinder group structures. 2255 */ 2256 bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg)); 2257 if (dsize < (2 * 1024 * 1024)) /* floppy? */ 2258 lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024); 2259 else 2260 lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024); 2261 /* in case file file is < 100k */ 2262 if (lsp->ls_dkg.dkg_ncyl == 0) 2263 lsp->ls_dkg.dkg_ncyl = 1; 2264 2265 lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl; 2266 lsp->ls_dkg.dkg_nhead = 1; 2267 lsp->ls_dkg.dkg_rpm = 7200; 2268 2269 lsp->ls_dkg.dkg_nsect = dsize / 2270 (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift); 2271 } 2272 2273 /* 2274 * build vtoc - see dkio(7I) 2275 * 2276 * Fakes one big partition based on the size of the file. This is needed 2277 * because we allow newfs'ing the traditional lofi device and newfs will 2278 * do several disk ioctls to figure out the geometry and partition information. 2279 * It uses that information to determine the parameters to pass to mkfs. 2280 */ 2281 static void 2282 fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt) 2283 { 2284 bzero(vt, sizeof (struct vtoc)); 2285 vt->v_sanity = VTOC_SANE; 2286 vt->v_version = V_VERSION; 2287 (void) strncpy(vt->v_volume, LOFI_DRIVER_NAME, 2288 sizeof (vt->v_volume)); 2289 vt->v_sectorsz = 1 << lsp->ls_pbshift; 2290 vt->v_nparts = 1; 2291 vt->v_part[0].p_tag = V_UNASSIGNED; 2292 2293 /* 2294 * A compressed file is read-only, other files can 2295 * be read-write 2296 */ 2297 if (lsp->ls_uncomp_seg_sz > 0) { 2298 vt->v_part[0].p_flag = V_UNMNT | V_RONLY; 2299 } else { 2300 vt->v_part[0].p_flag = V_UNMNT; 2301 } 2302 vt->v_part[0].p_start = (daddr_t)0; 2303 /* 2304 * The partition size cannot just be the number of sectors, because 2305 * that might not end on a cylinder boundary. And if that's the case, 2306 * newfs/mkfs will print a scary warning. So just figure the size 2307 * based on the number of cylinders and sectors/cylinder. 2308 */ 2309 vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl * 2310 lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead; 2311 } 2312 2313 /* 2314 * build dk_cinfo - see dkio(7I) 2315 */ 2316 static void 2317 fake_disk_info(dev_t dev, struct dk_cinfo *ci) 2318 { 2319 bzero(ci, sizeof (struct dk_cinfo)); 2320 (void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname)); 2321 ci->dki_ctype = DKC_SCSI_CCS; 2322 (void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname)); 2323 ci->dki_unit = LOFI_MINOR2ID(getminor(dev)); 2324 ci->dki_partition = LOFI_PART(getminor(dev)); 2325 /* 2326 * newfs uses this to set maxcontig. Must not be < 16, or it 2327 * will be 0 when newfs multiplies it by DEV_BSIZE and divides 2328 * it by the block size. Then tunefs doesn't work because 2329 * maxcontig is 0. 2330 */ 2331 ci->dki_maxtransfer = 16; 2332 } 2333 2334 /* 2335 * map in a compressed file 2336 * 2337 * Read in the header and the index that follows. 2338 * 2339 * The header is as follows - 2340 * 2341 * Signature (name of the compression algorithm) 2342 * Compression segment size (a multiple of 512) 2343 * Number of index entries 2344 * Size of the last block 2345 * The array containing the index entries 2346 * 2347 * The header information is always stored in 2348 * network byte order on disk. 2349 */ 2350 static int 2351 lofi_map_compressed_file(struct lofi_state *lsp, char *buf) 2352 { 2353 uint32_t index_sz, header_len, i; 2354 ssize_t resid; 2355 enum uio_rw rw; 2356 char *tbuf = buf; 2357 int error; 2358 2359 /* The signature has already been read */ 2360 tbuf += sizeof (lsp->ls_comp_algorithm); 2361 bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz)); 2362 lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz); 2363 2364 /* 2365 * The compressed segment size must be a power of 2 2366 */ 2367 if (lsp->ls_uncomp_seg_sz < DEV_BSIZE || 2368 !ISP2(lsp->ls_uncomp_seg_sz)) 2369 return (EINVAL); 2370 2371 for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++) 2372 ; 2373 2374 lsp->ls_comp_seg_shift = i; 2375 2376 tbuf += sizeof (lsp->ls_uncomp_seg_sz); 2377 bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz)); 2378 lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz); 2379 2380 tbuf += sizeof (lsp->ls_comp_index_sz); 2381 bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz), 2382 sizeof (lsp->ls_uncomp_last_seg_sz)); 2383 lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz); 2384 2385 /* 2386 * Compute the total size of the uncompressed data 2387 * for use in fake_disk_geometry and other calculations. 2388 * Disk geometry has to be faked with respect to the 2389 * actual uncompressed data size rather than the 2390 * compressed file size. 2391 */ 2392 lsp->ls_vp_size = 2393 (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz 2394 + lsp->ls_uncomp_last_seg_sz; 2395 2396 /* 2397 * Index size is rounded up to DEV_BSIZE for ease 2398 * of segmapping 2399 */ 2400 index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz; 2401 header_len = sizeof (lsp->ls_comp_algorithm) + 2402 sizeof (lsp->ls_uncomp_seg_sz) + 2403 sizeof (lsp->ls_comp_index_sz) + 2404 sizeof (lsp->ls_uncomp_last_seg_sz); 2405 lsp->ls_comp_offbase = header_len + index_sz; 2406 2407 index_sz += header_len; 2408 index_sz = roundup(index_sz, DEV_BSIZE); 2409 2410 lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP); 2411 lsp->ls_comp_index_data_sz = index_sz; 2412 2413 /* 2414 * Read in the index -- this has a side-effect 2415 * of reading in the header as well 2416 */ 2417 rw = UIO_READ; 2418 error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz, 2419 0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 2420 2421 if (error != 0) 2422 return (error); 2423 2424 /* Skip the header, this is where the index really begins */ 2425 lsp->ls_comp_seg_index = 2426 /*LINTED*/ 2427 (uint64_t *)(lsp->ls_comp_index_data + header_len); 2428 2429 /* 2430 * Now recompute offsets in the index to account for 2431 * the header length 2432 */ 2433 for (i = 0; i < lsp->ls_comp_index_sz; i++) { 2434 lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase + 2435 BE_64(lsp->ls_comp_seg_index[i]); 2436 } 2437 2438 return (error); 2439 } 2440 2441 static int 2442 lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip) 2443 { 2444 struct crypto_meta chead; 2445 char buf[DEV_BSIZE]; 2446 ssize_t resid; 2447 char *marker; 2448 int error; 2449 int ret; 2450 int i; 2451 2452 if (!klip->li_crypto_enabled) 2453 return (0); 2454 2455 /* 2456 * All current algorithms have a max of 448 bits. 2457 */ 2458 if (klip->li_iv_len > CRYPTO_BITS2BYTES(512)) 2459 return (EINVAL); 2460 2461 if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key)) 2462 return (EINVAL); 2463 2464 lsp->ls_crypto_enabled = klip->li_crypto_enabled; 2465 2466 mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL); 2467 2468 lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher); 2469 if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) { 2470 cmn_err(CE_WARN, "invalid cipher %s requested for %s", 2471 klip->li_cipher, klip->li_filename); 2472 return (EINVAL); 2473 } 2474 2475 /* this is just initialization here */ 2476 lsp->ls_mech.cm_param = NULL; 2477 lsp->ls_mech.cm_param_len = 0; 2478 2479 lsp->ls_iv_type = klip->li_iv_type; 2480 lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher); 2481 if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) { 2482 cmn_err(CE_WARN, "invalid iv cipher %s requested" 2483 " for %s", klip->li_iv_cipher, klip->li_filename); 2484 return (EINVAL); 2485 } 2486 2487 /* iv mech must itself take a null iv */ 2488 lsp->ls_iv_mech.cm_param = NULL; 2489 lsp->ls_iv_mech.cm_param_len = 0; 2490 lsp->ls_iv_len = klip->li_iv_len; 2491 2492 /* 2493 * Create ctx using li_cipher & the raw li_key after checking 2494 * that it isn't a weak key. 2495 */ 2496 lsp->ls_key.ck_format = CRYPTO_KEY_RAW; 2497 lsp->ls_key.ck_length = klip->li_key_len; 2498 lsp->ls_key.ck_data = kmem_alloc( 2499 CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP); 2500 bcopy(klip->li_key, lsp->ls_key.ck_data, 2501 CRYPTO_BITS2BYTES(lsp->ls_key.ck_length)); 2502 2503 ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key); 2504 if (ret != CRYPTO_SUCCESS) { 2505 cmn_err(CE_WARN, "weak key check failed for cipher " 2506 "%s on file %s (0x%x)", klip->li_cipher, 2507 klip->li_filename, ret); 2508 return (EINVAL); 2509 } 2510 2511 error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 2512 CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 2513 if (error != 0) 2514 return (error); 2515 2516 /* 2517 * This is the case where the header in the lofi image is already 2518 * initialized to indicate it is encrypted. 2519 */ 2520 if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) { 2521 /* 2522 * The encryption header information is laid out this way: 2523 * 6 bytes: hex "CFLOFI" 2524 * 2 bytes: version = 0 ... for now 2525 * 96 bytes: reserved1 (not implemented yet) 2526 * 4 bytes: data_sector = 2 ... for now 2527 * more... not implemented yet 2528 */ 2529 2530 marker = buf; 2531 2532 /* copy the magic */ 2533 bcopy(marker, lsp->ls_crypto.magic, 2534 sizeof (lsp->ls_crypto.magic)); 2535 marker += sizeof (lsp->ls_crypto.magic); 2536 2537 /* read the encryption version number */ 2538 bcopy(marker, &(lsp->ls_crypto.version), 2539 sizeof (lsp->ls_crypto.version)); 2540 lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version); 2541 marker += sizeof (lsp->ls_crypto.version); 2542 2543 /* read a chunk of reserved data */ 2544 bcopy(marker, lsp->ls_crypto.reserved1, 2545 sizeof (lsp->ls_crypto.reserved1)); 2546 marker += sizeof (lsp->ls_crypto.reserved1); 2547 2548 /* read block number where encrypted data begins */ 2549 bcopy(marker, &(lsp->ls_crypto.data_sector), 2550 sizeof (lsp->ls_crypto.data_sector)); 2551 lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector); 2552 marker += sizeof (lsp->ls_crypto.data_sector); 2553 2554 /* and ignore the rest until it is implemented */ 2555 2556 lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE; 2557 return (0); 2558 } 2559 2560 /* 2561 * We've requested encryption, but no magic was found, so it must be 2562 * a new image. 2563 */ 2564 2565 for (i = 0; i < sizeof (struct crypto_meta); i++) { 2566 if (buf[i] != '\0') 2567 return (EINVAL); 2568 } 2569 2570 marker = buf; 2571 bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic)); 2572 marker += sizeof (lofi_crypto_magic); 2573 chead.version = htons(LOFI_CRYPTO_VERSION); 2574 bcopy(&(chead.version), marker, sizeof (chead.version)); 2575 marker += sizeof (chead.version); 2576 marker += sizeof (chead.reserved1); 2577 chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR); 2578 bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector)); 2579 2580 /* write the header */ 2581 error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE, 2582 CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 2583 if (error != 0) 2584 return (error); 2585 2586 /* fix things up so it looks like we read this info */ 2587 bcopy(lofi_crypto_magic, lsp->ls_crypto.magic, 2588 sizeof (lofi_crypto_magic)); 2589 lsp->ls_crypto.version = LOFI_CRYPTO_VERSION; 2590 lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR; 2591 lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE; 2592 return (0); 2593 } 2594 2595 /* 2596 * Check to see if the passed in signature is a valid one. If it is 2597 * valid, return the index into lofi_compress_table. 2598 * 2599 * Return -1 if it is invalid 2600 */ 2601 static int 2602 lofi_compress_select(const char *signature) 2603 { 2604 int i; 2605 2606 for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) { 2607 if (strcmp(lofi_compress_table[i].l_name, signature) == 0) 2608 return (i); 2609 } 2610 2611 return (-1); 2612 } 2613 2614 static int 2615 lofi_init_compress(struct lofi_state *lsp) 2616 { 2617 char buf[DEV_BSIZE]; 2618 int compress_index; 2619 ssize_t resid; 2620 int error; 2621 2622 error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE, 2623 0, RLIM64_INFINITY, kcred, &resid); 2624 2625 if (error != 0) 2626 return (error); 2627 2628 if ((compress_index = lofi_compress_select(buf)) == -1) 2629 return (0); 2630 2631 /* compression and encryption are mutually exclusive */ 2632 if (lsp->ls_crypto_enabled) 2633 return (ENOTSUP); 2634 2635 /* initialize compression info for compressed lofi */ 2636 lsp->ls_comp_algorithm_index = compress_index; 2637 (void) strlcpy(lsp->ls_comp_algorithm, 2638 lofi_compress_table[compress_index].l_name, 2639 sizeof (lsp->ls_comp_algorithm)); 2640 2641 /* Finally setup per-thread pre-allocated buffers */ 2642 lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads * 2643 sizeof (struct compbuf), KM_SLEEP); 2644 2645 return (lofi_map_compressed_file(lsp, buf)); 2646 } 2647 2648 /* 2649 * Allocate new or proposed id from lofi_id. 2650 * 2651 * Special cases for proposed id: 2652 * 0: not allowed, 0 is id for control device. 2653 * -1: allocate first usable id from lofi_id. 2654 * any other value is proposed value from userland 2655 * 2656 * returns DDI_SUCCESS or errno. 2657 */ 2658 static int 2659 lofi_alloc_id(int *idp) 2660 { 2661 int id, error = DDI_SUCCESS; 2662 2663 if (*idp == -1) { 2664 id = id_allocff_nosleep(lofi_id); 2665 if (id == -1) { 2666 error = EAGAIN; 2667 goto err; 2668 } 2669 } else if (*idp == 0) { 2670 error = EINVAL; 2671 goto err; 2672 } else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) { 2673 error = ERANGE; 2674 goto err; 2675 } else { 2676 if (ddi_get_soft_state(lofi_statep, *idp) != NULL) { 2677 error = EEXIST; 2678 goto err; 2679 } 2680 2681 id = id_alloc_specific_nosleep(lofi_id, *idp); 2682 if (id == -1) { 2683 error = EAGAIN; 2684 goto err; 2685 } 2686 } 2687 *idp = id; 2688 err: 2689 return (error); 2690 } 2691 2692 static int 2693 lofi_create_dev(struct lofi_ioctl *klip) 2694 { 2695 dev_info_t *parent, *child; 2696 struct lofi_state *lsp = NULL; 2697 char namebuf[MAXNAMELEN]; 2698 int error, circ; 2699 2700 /* get control device */ 2701 lsp = ddi_get_soft_state(lofi_statep, 0); 2702 parent = ddi_get_parent(lsp->ls_dip); 2703 2704 if ((error = lofi_alloc_id((int *)&klip->li_id))) 2705 return (error); 2706 2707 (void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d", 2708 klip->li_id); 2709 2710 ndi_devi_enter(parent, &circ); 2711 child = ndi_devi_findchild(parent, namebuf); 2712 ndi_devi_exit(parent, circ); 2713 2714 if (child == NULL) { 2715 child = ddi_add_child(parent, LOFI_DRIVER_NAME, 2716 (pnode_t)DEVI_SID_NODEID, klip->li_id); 2717 if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child, 2718 "instance", klip->li_id)) != DDI_PROP_SUCCESS) 2719 goto err; 2720 2721 if (klip->li_labeled == B_TRUE) { 2722 if ((error = ddi_prop_create(DDI_DEV_T_NONE, child, 2723 DDI_PROP_CANSLEEP, "labeled", 0, 0)) 2724 != DDI_PROP_SUCCESS) 2725 goto err; 2726 } 2727 2728 if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH)) 2729 != NDI_SUCCESS) 2730 goto err; 2731 } else { 2732 id_free(lofi_id, klip->li_id); 2733 error = EEXIST; 2734 return (error); 2735 } 2736 2737 goto done; 2738 2739 err: 2740 ddi_prop_remove_all(child); 2741 (void) ndi_devi_offline(child, NDI_DEVI_REMOVE); 2742 id_free(lofi_id, klip->li_id); 2743 done: 2744 2745 return (error); 2746 } 2747 2748 static void 2749 lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq) 2750 { 2751 char *p = NULL; 2752 2753 (void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid)); 2754 2755 mutex_enter(&lsp->ls_vp_lock); 2756 if (lsp->ls_vp != NULL) 2757 p = strrchr(lsp->ls_vp->v_path, '/'); 2758 if (p != NULL) 2759 (void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid)); 2760 mutex_exit(&lsp->ls_vp_lock); 2761 (void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision)); 2762 } 2763 2764 /* 2765 * copy devlink name from event cache 2766 */ 2767 static void 2768 lofi_copy_devpath(struct lofi_ioctl *klip) 2769 { 2770 int error; 2771 char namebuf[MAXNAMELEN], *str; 2772 clock_t ticks; 2773 nvlist_t *nvl = NULL; 2774 2775 if (klip->li_labeled == B_TRUE) 2776 klip->li_devpath[0] = '\0'; 2777 else { 2778 /* no need to wait for messages */ 2779 (void) snprintf(klip->li_devpath, sizeof (klip->li_devpath), 2780 "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id); 2781 return; 2782 } 2783 2784 (void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id); 2785 ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000); 2786 2787 mutex_enter(&lofi_devlink_cache.ln_lock); 2788 error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, namebuf, &nvl); 2789 while (error != 0) { 2790 error = cv_timedwait(&lofi_devlink_cache.ln_cv, 2791 &lofi_devlink_cache.ln_lock, ticks); 2792 if (error == -1) 2793 break; 2794 error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, 2795 namebuf, &nvl); 2796 } 2797 2798 if (nvl != NULL) { 2799 if (nvlist_lookup_string(nvl, DEV_NAME, &str) == 0) { 2800 (void) strlcpy(klip->li_devpath, str, 2801 sizeof (klip->li_devpath)); 2802 } 2803 } 2804 mutex_exit(&lofi_devlink_cache.ln_lock); 2805 } 2806 2807 /* 2808 * map a file to a minor number. Return the minor number. 2809 */ 2810 static int 2811 lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor, 2812 int *rvalp, struct cred *credp, int ioctl_flag) 2813 { 2814 int id = -1; 2815 struct lofi_state *lsp = NULL; 2816 struct lofi_ioctl *klip; 2817 int error; 2818 struct vnode *vp = NULL; 2819 vattr_t vattr; 2820 int flag; 2821 char namebuf[MAXNAMELEN]; 2822 2823 error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 2824 if (error != 0) 2825 return (error); 2826 2827 mutex_enter(&lofi_lock); 2828 2829 if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly, 2830 NULL) == 0) { 2831 error = EBUSY; 2832 goto err; 2833 } 2834 2835 flag = FREAD | FWRITE | FOFFMAX | FEXCL; 2836 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0); 2837 if (error) { 2838 /* try read-only */ 2839 flag &= ~FWRITE; 2840 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, 2841 &vp, 0, 0); 2842 if (error) 2843 goto err; 2844 } 2845 2846 if (!V_ISLOFIABLE(vp->v_type)) { 2847 error = EINVAL; 2848 goto err; 2849 } 2850 2851 vattr.va_mask = AT_SIZE; 2852 error = VOP_GETATTR(vp, &vattr, 0, credp, NULL); 2853 if (error) 2854 goto err; 2855 2856 /* the file needs to be a multiple of the block size */ 2857 if ((vattr.va_size % DEV_BSIZE) != 0) { 2858 error = EINVAL; 2859 goto err; 2860 } 2861 2862 if (pickminor) { 2863 klip->li_id = (uint32_t)-1; 2864 } 2865 if ((error = lofi_create_dev(klip)) != 0) 2866 goto err; 2867 2868 id = klip->li_id; 2869 lsp = ddi_get_soft_state(lofi_statep, id); 2870 if (lsp == NULL) 2871 goto err; 2872 2873 /* 2874 * from this point lofi_destroy() is used to clean up on error 2875 * make sure the basic data is set 2876 */ 2877 list_insert_tail(&lofi_list, lsp); 2878 lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id)); 2879 2880 list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache), 2881 offsetof(struct lofi_comp_cache, lc_list)); 2882 2883 /* 2884 * save open mode so file can be closed properly and vnode counts 2885 * updated correctly. 2886 */ 2887 lsp->ls_openflag = flag; 2888 2889 lsp->ls_vp = vp; 2890 lsp->ls_stacked_vp = vp; 2891 2892 lsp->ls_vp_size = vattr.va_size; 2893 lsp->ls_vp_comp_size = lsp->ls_vp_size; 2894 2895 /* 2896 * Try to handle stacked lofs vnodes. 2897 */ 2898 if (vp->v_type == VREG) { 2899 vnode_t *realvp; 2900 2901 if (VOP_REALVP(vp, &realvp, NULL) == 0) { 2902 /* 2903 * We need to use the realvp for uniqueness 2904 * checking, but keep the stacked vp for 2905 * LOFI_GET_FILENAME display. 2906 */ 2907 VN_HOLD(realvp); 2908 lsp->ls_vp = realvp; 2909 } 2910 } 2911 2912 lsp->ls_lbshift = highbit(DEV_BSIZE) - 1; 2913 lsp->ls_pbshift = lsp->ls_lbshift; 2914 2915 lsp->ls_readonly = klip->li_readonly; 2916 lsp->ls_uncomp_seg_sz = 0; 2917 lsp->ls_comp_algorithm[0] = '\0'; 2918 lsp->ls_crypto_offset = 0; 2919 2920 (void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d", 2921 LOFI_DRIVER_NAME, id); 2922 lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads, 2923 minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0); 2924 2925 if ((error = lofi_init_crypto(lsp, klip)) != 0) 2926 goto err; 2927 2928 if ((error = lofi_init_compress(lsp)) != 0) 2929 goto err; 2930 2931 fake_disk_geometry(lsp); 2932 2933 /* For unlabeled lofi add Nblocks and Size */ 2934 if (klip->li_labeled == B_FALSE) { 2935 error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip, 2936 SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset); 2937 if (error != DDI_PROP_SUCCESS) { 2938 error = EINVAL; 2939 goto err; 2940 } 2941 error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip, 2942 NBLOCKS_PROP_NAME, 2943 (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE); 2944 if (error != DDI_PROP_SUCCESS) { 2945 error = EINVAL; 2946 goto err; 2947 } 2948 } 2949 2950 /* 2951 * Notify we are ready to rock. 2952 */ 2953 mutex_enter(&lsp->ls_vp_lock); 2954 lsp->ls_vp_ready = B_TRUE; 2955 cv_broadcast(&lsp->ls_vp_cv); 2956 mutex_exit(&lsp->ls_vp_lock); 2957 mutex_exit(&lofi_lock); 2958 2959 lofi_copy_devpath(klip); 2960 2961 if (rvalp) 2962 *rvalp = id; 2963 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 2964 free_lofi_ioctl(klip); 2965 return (0); 2966 2967 err: 2968 if (lsp != NULL) { 2969 lofi_destroy(lsp, credp); 2970 } else { 2971 if (vp != NULL) { 2972 (void) VOP_PUTPAGE(vp, 0, 0, B_FREE, credp, NULL); 2973 (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL); 2974 VN_RELE(vp); 2975 } 2976 } 2977 2978 mutex_exit(&lofi_lock); 2979 free_lofi_ioctl(klip); 2980 return (error); 2981 } 2982 2983 /* 2984 * unmap a file. 2985 */ 2986 static int 2987 lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename, 2988 struct cred *credp, int ioctl_flag) 2989 { 2990 struct lofi_state *lsp; 2991 struct lofi_ioctl *klip; 2992 char namebuf[MAXNAMELEN]; 2993 int err; 2994 2995 err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 2996 if (err != 0) 2997 return (err); 2998 2999 mutex_enter(&lofi_lock); 3000 if (byfilename) { 3001 if ((err = file_to_lofi(klip->li_filename, klip->li_readonly, 3002 &lsp)) != 0) { 3003 goto done; 3004 } 3005 } else if (klip->li_id == 0) { 3006 err = ENXIO; 3007 goto done; 3008 } else { 3009 lsp = ddi_get_soft_state(lofi_statep, klip->li_id); 3010 } 3011 3012 if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) { 3013 err = ENXIO; 3014 goto done; 3015 } 3016 3017 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3018 (void) snprintf(namebuf, sizeof (namebuf), "%u", klip->li_id); 3019 3020 /* 3021 * If it's still held open, we'll do one of three things: 3022 * 3023 * If no flag is set, just return EBUSY. 3024 * 3025 * If the 'cleanup' flag is set, unmap and remove the device when 3026 * the last user finishes. 3027 * 3028 * If the 'force' flag is set, then we forcibly close the underlying 3029 * file. Subsequent operations will fail, and the DKIOCSTATE ioctl 3030 * will return DKIO_DEV_GONE. When the device is last closed, the 3031 * device will be cleaned up appropriately. 3032 * 3033 * This is complicated by the fact that we may have outstanding 3034 * dispatched I/Os. Rather than having a single mutex to serialize all 3035 * I/O, we keep a count of the number of outstanding I/O requests 3036 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os 3037 * should be dispatched (ls_vp_closereq). 3038 * 3039 * We set the flag, wait for the number of outstanding I/Os to reach 0, 3040 * and then close the underlying vnode. 3041 */ 3042 if (is_opened(lsp)) { 3043 if (klip->li_force) { 3044 /* Mark the device for cleanup. */ 3045 lofi_set_cleanup(lsp); 3046 mutex_enter(&lsp->ls_vp_lock); 3047 lsp->ls_vp_closereq = B_TRUE; 3048 /* Wake up any threads waiting on dkiocstate. */ 3049 cv_broadcast(&lsp->ls_vp_cv); 3050 while (lsp->ls_vp_iocount > 0) 3051 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); 3052 mutex_exit(&lsp->ls_vp_lock); 3053 } else if (klip->li_cleanup) { 3054 lofi_set_cleanup(lsp); 3055 } else { 3056 err = EBUSY; 3057 } 3058 } else { 3059 lofi_free_dev(lsp); 3060 lofi_destroy(lsp, credp); 3061 } 3062 3063 /* Remove name from devlink cache */ 3064 mutex_enter(&lofi_devlink_cache.ln_lock); 3065 (void) nvlist_remove_all(lofi_devlink_cache.ln_data, namebuf); 3066 mutex_exit(&lofi_devlink_cache.ln_lock); 3067 done: 3068 mutex_exit(&lofi_lock); 3069 if (err == 0) 3070 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3071 free_lofi_ioctl(klip); 3072 return (err); 3073 } 3074 3075 /* 3076 * get the filename given the minor number, or the minor number given 3077 * the name. 3078 */ 3079 /*ARGSUSED*/ 3080 static int 3081 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, 3082 struct cred *credp, int ioctl_flag) 3083 { 3084 struct lofi_ioctl *klip; 3085 struct lofi_state *lsp; 3086 int error; 3087 3088 error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 3089 if (error != 0) 3090 return (error); 3091 3092 switch (which) { 3093 case LOFI_GET_FILENAME: 3094 if (klip->li_id == 0) { 3095 free_lofi_ioctl(klip); 3096 return (EINVAL); 3097 } 3098 3099 mutex_enter(&lofi_lock); 3100 lsp = ddi_get_soft_state(lofi_statep, klip->li_id); 3101 if (lsp == NULL || lofi_access(lsp) != 0) { 3102 mutex_exit(&lofi_lock); 3103 free_lofi_ioctl(klip); 3104 return (ENXIO); 3105 } 3106 3107 /* 3108 * This may fail if, for example, we're trying to look 3109 * up a zoned NFS path from the global zone. 3110 */ 3111 if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename, 3112 sizeof (klip->li_filename), CRED()) != 0) { 3113 (void) strlcpy(klip->li_filename, "?", 3114 sizeof (klip->li_filename)); 3115 } 3116 3117 klip->li_readonly = lsp->ls_readonly; 3118 klip->li_labeled = lsp->ls_cmlbhandle != NULL; 3119 3120 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 3121 sizeof (klip->li_algorithm)); 3122 klip->li_crypto_enabled = lsp->ls_crypto_enabled; 3123 mutex_exit(&lofi_lock); 3124 3125 lofi_copy_devpath(klip); 3126 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3127 free_lofi_ioctl(klip); 3128 return (error); 3129 case LOFI_GET_MINOR: 3130 mutex_enter(&lofi_lock); 3131 error = file_to_lofi(klip->li_filename, 3132 klip->li_readonly, &lsp); 3133 if (error != 0) { 3134 mutex_exit(&lofi_lock); 3135 free_lofi_ioctl(klip); 3136 return (error); 3137 } 3138 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3139 3140 klip->li_readonly = lsp->ls_readonly; 3141 klip->li_labeled = lsp->ls_cmlbhandle != NULL; 3142 mutex_exit(&lofi_lock); 3143 3144 lofi_copy_devpath(klip); 3145 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3146 3147 free_lofi_ioctl(klip); 3148 return (error); 3149 case LOFI_CHECK_COMPRESSED: 3150 mutex_enter(&lofi_lock); 3151 error = file_to_lofi(klip->li_filename, 3152 klip->li_readonly, &lsp); 3153 if (error != 0) { 3154 mutex_exit(&lofi_lock); 3155 free_lofi_ioctl(klip); 3156 return (error); 3157 } 3158 3159 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3160 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 3161 sizeof (klip->li_algorithm)); 3162 3163 mutex_exit(&lofi_lock); 3164 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3165 free_lofi_ioctl(klip); 3166 return (error); 3167 default: 3168 free_lofi_ioctl(klip); 3169 return (EINVAL); 3170 } 3171 } 3172 3173 static int 3174 uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb, 3175 struct uscsi_cmd *uscmd) 3176 { 3177 int rval; 3178 3179 #ifdef _MULTI_DATAMODEL 3180 switch (ddi_model_convert_from(flag & FMODELS)) { 3181 case DDI_MODEL_ILP32: { 3182 struct uscsi_cmd32 ucmd32; 3183 3184 if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) { 3185 rval = EFAULT; 3186 goto err; 3187 } 3188 uscsi_cmd32touscsi_cmd((&ucmd32), uscmd); 3189 break; 3190 } 3191 case DDI_MODEL_NONE: 3192 if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) { 3193 rval = EFAULT; 3194 goto err; 3195 } 3196 break; 3197 default: 3198 rval = EFAULT; 3199 goto err; 3200 } 3201 #else 3202 if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) { 3203 rval = EFAULT; 3204 goto err; 3205 } 3206 #endif /* _MULTI_DATAMODEL */ 3207 if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) { 3208 rval = EFAULT; 3209 goto err; 3210 } 3211 if (cdb->scc_cmd == SCMD_INQUIRY) { 3212 return (0); 3213 } 3214 err: 3215 return (rval); 3216 } 3217 3218 static int 3219 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, 3220 int *rvalp) 3221 { 3222 int error; 3223 enum dkio_state dkstate; 3224 struct lofi_state *lsp; 3225 dk_efi_t user_efi; 3226 int id; 3227 3228 id = LOFI_MINOR2ID(getminor(dev)); 3229 3230 /* lofi ioctls only apply to the master device */ 3231 if (id == 0) { 3232 struct lofi_ioctl *lip = (struct lofi_ioctl *)arg; 3233 3234 /* 3235 * the query command only need read-access - i.e., normal 3236 * users are allowed to do those on the ctl device as 3237 * long as they can open it read-only. 3238 */ 3239 switch (cmd) { 3240 case LOFI_MAP_FILE: 3241 if ((flag & FWRITE) == 0) 3242 return (EPERM); 3243 return (lofi_map_file(dev, lip, 1, rvalp, credp, flag)); 3244 case LOFI_MAP_FILE_MINOR: 3245 if ((flag & FWRITE) == 0) 3246 return (EPERM); 3247 return (lofi_map_file(dev, lip, 0, rvalp, credp, flag)); 3248 case LOFI_UNMAP_FILE: 3249 if ((flag & FWRITE) == 0) 3250 return (EPERM); 3251 return (lofi_unmap_file(lip, 1, credp, flag)); 3252 case LOFI_UNMAP_FILE_MINOR: 3253 if ((flag & FWRITE) == 0) 3254 return (EPERM); 3255 return (lofi_unmap_file(lip, 0, credp, flag)); 3256 case LOFI_GET_FILENAME: 3257 return (lofi_get_info(dev, lip, LOFI_GET_FILENAME, 3258 credp, flag)); 3259 case LOFI_GET_MINOR: 3260 return (lofi_get_info(dev, lip, LOFI_GET_MINOR, 3261 credp, flag)); 3262 3263 /* 3264 * This API made limited sense when this value was fixed 3265 * at LOFI_MAX_FILES. However, its use to iterate 3266 * across all possible devices in lofiadm means we don't 3267 * want to return L_MAXMIN, but the highest 3268 * *allocated* id. 3269 */ 3270 case LOFI_GET_MAXMINOR: 3271 id = 0; 3272 3273 mutex_enter(&lofi_lock); 3274 3275 for (lsp = list_head(&lofi_list); lsp != NULL; 3276 lsp = list_next(&lofi_list, lsp)) { 3277 int i; 3278 if (lofi_access(lsp) != 0) 3279 continue; 3280 3281 i = ddi_get_instance(lsp->ls_dip); 3282 if (i > id) 3283 id = i; 3284 } 3285 3286 mutex_exit(&lofi_lock); 3287 3288 error = ddi_copyout(&id, &lip->li_id, 3289 sizeof (id), flag); 3290 if (error) 3291 return (EFAULT); 3292 return (0); 3293 3294 case LOFI_CHECK_COMPRESSED: 3295 return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED, 3296 credp, flag)); 3297 default: 3298 return (EINVAL); 3299 } 3300 } 3301 3302 mutex_enter(&lofi_lock); 3303 lsp = ddi_get_soft_state(lofi_statep, id); 3304 if (lsp == NULL || lsp->ls_cleanup) { 3305 mutex_exit(&lofi_lock); 3306 return (ENXIO); 3307 } 3308 mutex_exit(&lofi_lock); 3309 3310 if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS, 3311 "labeled") == 1) { 3312 error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag, 3313 credp, rvalp, 0); 3314 if (error != ENOTTY) 3315 return (error); 3316 } 3317 3318 /* 3319 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with 3320 * EIO as if the device was no longer present. 3321 */ 3322 if (lsp->ls_vp == NULL && cmd != DKIOCSTATE) 3323 return (EIO); 3324 3325 /* these are for faking out utilities like newfs */ 3326 switch (cmd) { 3327 case DKIOCGMEDIAINFO: 3328 case DKIOCGMEDIAINFOEXT: { 3329 struct dk_minfo_ext media_info; 3330 int shift = lsp->ls_lbshift; 3331 int size; 3332 3333 if (cmd == DKIOCGMEDIAINFOEXT) { 3334 media_info.dki_pbsize = 1U << lsp->ls_pbshift; 3335 size = sizeof (struct dk_minfo_ext); 3336 } else { 3337 size = sizeof (struct dk_minfo); 3338 } 3339 3340 media_info.dki_media_type = DK_FIXED_DISK; 3341 media_info.dki_lbsize = 1U << shift; 3342 media_info.dki_capacity = 3343 (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift; 3344 3345 if (ddi_copyout(&media_info, (void *)arg, size, flag)) 3346 return (EFAULT); 3347 return (0); 3348 } 3349 case DKIOCREMOVABLE: { 3350 int i = 0; 3351 if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag)) 3352 return (EFAULT); 3353 return (0); 3354 } 3355 3356 case DKIOCGVTOC: { 3357 struct vtoc vt; 3358 fake_disk_vtoc(lsp, &vt); 3359 3360 switch (ddi_model_convert_from(flag & FMODELS)) { 3361 case DDI_MODEL_ILP32: { 3362 struct vtoc32 vtoc32; 3363 3364 vtoctovtoc32(vt, vtoc32); 3365 if (ddi_copyout(&vtoc32, (void *)arg, 3366 sizeof (struct vtoc32), flag)) 3367 return (EFAULT); 3368 break; 3369 } 3370 3371 case DDI_MODEL_NONE: 3372 if (ddi_copyout(&vt, (void *)arg, 3373 sizeof (struct vtoc), flag)) 3374 return (EFAULT); 3375 break; 3376 } 3377 return (0); 3378 } 3379 case DKIOCINFO: { 3380 struct dk_cinfo ci; 3381 fake_disk_info(dev, &ci); 3382 if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag)) 3383 return (EFAULT); 3384 return (0); 3385 } 3386 case DKIOCG_VIRTGEOM: 3387 case DKIOCG_PHYGEOM: 3388 case DKIOCGGEOM: 3389 error = ddi_copyout(&lsp->ls_dkg, (void *)arg, 3390 sizeof (struct dk_geom), flag); 3391 if (error) 3392 return (EFAULT); 3393 return (0); 3394 case DKIOCSTATE: 3395 /* 3396 * Normally, lofi devices are always in the INSERTED state. If 3397 * a device is forcefully unmapped, then the device transitions 3398 * to the DKIO_DEV_GONE state. 3399 */ 3400 if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate), 3401 flag) != 0) 3402 return (EFAULT); 3403 3404 mutex_enter(&lsp->ls_vp_lock); 3405 while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) || 3406 (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) && 3407 !lsp->ls_cleanup) { 3408 /* 3409 * By virtue of having the device open, we know that 3410 * 'lsp' will remain valid when we return. 3411 */ 3412 if (!cv_wait_sig(&lsp->ls_vp_cv, &lsp->ls_vp_lock)) { 3413 mutex_exit(&lsp->ls_vp_lock); 3414 return (EINTR); 3415 } 3416 } 3417 3418 dkstate = (!lsp->ls_cleanup && lsp->ls_vp != NULL ? 3419 DKIO_INSERTED : DKIO_DEV_GONE); 3420 mutex_exit(&lsp->ls_vp_lock); 3421 3422 if (ddi_copyout(&dkstate, (void *)arg, 3423 sizeof (dkstate), flag) != 0) 3424 return (EFAULT); 3425 return (0); 3426 case USCSICMD: { 3427 struct uscsi_cmd uscmd; 3428 union scsi_cdb cdb; 3429 3430 if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) { 3431 struct scsi_inquiry inq = {0}; 3432 3433 lofi_create_inquiry(lsp, &inq); 3434 if (ddi_copyout(&inq, uscmd.uscsi_bufaddr, 3435 uscmd.uscsi_buflen, flag) != 0) 3436 return (EFAULT); 3437 return (0); 3438 } else if (cdb.scc_cmd == SCMD_READ_CAPACITY) { 3439 struct scsi_capacity capacity; 3440 3441 capacity.capacity = 3442 BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >> 3443 lsp->ls_lbshift); 3444 capacity.lbasize = BE_32(1 << lsp->ls_lbshift); 3445 if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr, 3446 uscmd.uscsi_buflen, flag) != 0) 3447 return (EFAULT); 3448 return (0); 3449 } 3450 3451 uscmd.uscsi_rqstatus = 0xff; 3452 #ifdef _MULTI_DATAMODEL 3453 switch (ddi_model_convert_from(flag & FMODELS)) { 3454 case DDI_MODEL_ILP32: { 3455 struct uscsi_cmd32 ucmd32; 3456 uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32)); 3457 if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32), 3458 flag) != 0) 3459 return (EFAULT); 3460 break; 3461 } 3462 case DDI_MODEL_NONE: 3463 if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), 3464 flag) != 0) 3465 return (EFAULT); 3466 break; 3467 default: 3468 return (EFAULT); 3469 } 3470 #else 3471 if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0) 3472 return (EFAULT); 3473 #endif /* _MULTI_DATAMODEL */ 3474 return (0); 3475 } 3476 3477 case DKIOCGMBOOT: 3478 return (lofi_urw(lsp, FREAD, 0, 1 << lsp->ls_lbshift, 3479 arg, flag, credp)); 3480 3481 case DKIOCSMBOOT: 3482 return (lofi_urw(lsp, FWRITE, 0, 1 << lsp->ls_lbshift, 3483 arg, flag, credp)); 3484 3485 case DKIOCGETEFI: 3486 if (ddi_copyin((void *)arg, &user_efi, 3487 sizeof (dk_efi_t), flag) != 0) 3488 return (EFAULT); 3489 3490 return (lofi_urw(lsp, FREAD, 3491 user_efi.dki_lba * (1 << lsp->ls_lbshift), 3492 user_efi.dki_length, (intptr_t)user_efi.dki_data, 3493 flag, credp)); 3494 3495 case DKIOCSETEFI: 3496 if (ddi_copyin((void *)arg, &user_efi, 3497 sizeof (dk_efi_t), flag) != 0) 3498 return (EFAULT); 3499 3500 return (lofi_urw(lsp, FWRITE, 3501 user_efi.dki_lba * (1 << lsp->ls_lbshift), 3502 user_efi.dki_length, (intptr_t)user_efi.dki_data, 3503 flag, credp)); 3504 3505 default: 3506 #ifdef DEBUG 3507 cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd); 3508 #endif /* DEBUG */ 3509 return (ENOTTY); 3510 } 3511 } 3512 3513 static int 3514 lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags, 3515 char *name, caddr_t valuep, int *lengthp) 3516 { 3517 struct lofi_state *lsp; 3518 int rc; 3519 3520 lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip)); 3521 if (lsp == NULL) { 3522 return (ddi_prop_op(dev, dip, prop_op, mod_flags, 3523 name, valuep, lengthp)); 3524 } 3525 3526 rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags, 3527 name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL); 3528 if (rc == DDI_PROP_SUCCESS) 3529 return (rc); 3530 3531 return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags, 3532 name, valuep, lengthp)); 3533 } 3534 3535 static struct cb_ops lofi_cb_ops = { 3536 lofi_open, /* open */ 3537 lofi_close, /* close */ 3538 lofi_strategy, /* strategy */ 3539 nodev, /* print */ 3540 nodev, /* dump */ 3541 lofi_read, /* read */ 3542 lofi_write, /* write */ 3543 lofi_ioctl, /* ioctl */ 3544 nodev, /* devmap */ 3545 nodev, /* mmap */ 3546 nodev, /* segmap */ 3547 nochpoll, /* poll */ 3548 lofi_prop_op, /* prop_op */ 3549 0, /* streamtab */ 3550 D_64BIT | D_NEW | D_MP, /* Driver compatibility flag */ 3551 CB_REV, 3552 lofi_aread, 3553 lofi_awrite 3554 }; 3555 3556 static struct dev_ops lofi_ops = { 3557 DEVO_REV, /* devo_rev, */ 3558 0, /* refcnt */ 3559 lofi_info, /* info */ 3560 nulldev, /* identify */ 3561 nulldev, /* probe */ 3562 lofi_attach, /* attach */ 3563 lofi_detach, /* detach */ 3564 nodev, /* reset */ 3565 &lofi_cb_ops, /* driver operations */ 3566 NULL, /* no bus operations */ 3567 NULL, /* power */ 3568 ddi_quiesce_not_needed, /* quiesce */ 3569 }; 3570 3571 static struct modldrv modldrv = { 3572 &mod_driverops, 3573 "loopback file driver", 3574 &lofi_ops, 3575 }; 3576 3577 static struct modlinkage modlinkage = { 3578 MODREV_1, 3579 &modldrv, 3580 NULL 3581 }; 3582 3583 int 3584 _init(void) 3585 { 3586 int error; 3587 3588 list_create(&lofi_list, sizeof (struct lofi_state), 3589 offsetof(struct lofi_state, ls_list)); 3590 3591 error = ddi_soft_state_init((void **)&lofi_statep, 3592 sizeof (struct lofi_state), 0); 3593 if (error) { 3594 list_destroy(&lofi_list); 3595 return (error); 3596 } 3597 3598 /* 3599 * The minor number is stored as id << LOFI_CMLB_SHIFT as 3600 * we need to reserve space for cmlb minor numbers. 3601 * This will leave out 4096 id values on 32bit kernel, which should 3602 * still suffice. 3603 */ 3604 lofi_id = id_space_create("lofi_id", 1, 3605 (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT))); 3606 3607 if (lofi_id == NULL) { 3608 ddi_soft_state_fini((void **)&lofi_statep); 3609 list_destroy(&lofi_list); 3610 return (DDI_FAILURE); 3611 } 3612 3613 mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL); 3614 3615 error = mod_install(&modlinkage); 3616 3617 if (error) { 3618 id_space_destroy(lofi_id); 3619 mutex_destroy(&lofi_lock); 3620 ddi_soft_state_fini((void **)&lofi_statep); 3621 list_destroy(&lofi_list); 3622 } 3623 3624 return (error); 3625 } 3626 3627 int 3628 _fini(void) 3629 { 3630 int error; 3631 3632 mutex_enter(&lofi_lock); 3633 3634 if (!list_is_empty(&lofi_list)) { 3635 mutex_exit(&lofi_lock); 3636 return (EBUSY); 3637 } 3638 3639 mutex_exit(&lofi_lock); 3640 3641 error = mod_remove(&modlinkage); 3642 if (error) 3643 return (error); 3644 3645 mutex_destroy(&lofi_lock); 3646 id_space_destroy(lofi_id); 3647 ddi_soft_state_fini((void **)&lofi_statep); 3648 list_destroy(&lofi_list); 3649 3650 return (error); 3651 } 3652 3653 int 3654 _info(struct modinfo *modinfop) 3655 { 3656 return (mod_info(&modlinkage, modinfop)); 3657 }