1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  29  * Copyright (c) 2014 Integros [integros.com]
  30  */
  31 
  32 /*
  33  * ZFS volume emulation driver.
  34  *
  35  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  36  * Volumes are accessed through the symbolic links named:
  37  *
  38  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  39  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  40  *
  41  * These links are created by the /dev filesystem (sdev_zvolops.c).
  42  * Volumes are persistent through reboot.  No user command needs to be
  43  * run before opening and using a device.
  44  */
  45 
  46 #include <sys/types.h>
  47 #include <sys/param.h>
  48 #include <sys/errno.h>
  49 #include <sys/uio.h>
  50 #include <sys/buf.h>
  51 #include <sys/modctl.h>
  52 #include <sys/open.h>
  53 #include <sys/kmem.h>
  54 #include <sys/conf.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/stat.h>
  57 #include <sys/zap.h>
  58 #include <sys/spa.h>
  59 #include <sys/spa_impl.h>
  60 #include <sys/zio.h>
  61 #include <sys/dmu_traverse.h>
  62 #include <sys/dnode.h>
  63 #include <sys/dsl_dataset.h>
  64 #include <sys/dsl_prop.h>
  65 #include <sys/dkio.h>
  66 #include <sys/efi_partition.h>
  67 #include <sys/byteorder.h>
  68 #include <sys/pathname.h>
  69 #include <sys/ddi.h>
  70 #include <sys/sunddi.h>
  71 #include <sys/crc32.h>
  72 #include <sys/dirent.h>
  73 #include <sys/policy.h>
  74 #include <sys/fs/zfs.h>
  75 #include <sys/zfs_ioctl.h>
  76 #include <sys/mkdev.h>
  77 #include <sys/zil.h>
  78 #include <sys/refcount.h>
  79 #include <sys/zfs_znode.h>
  80 #include <sys/zfs_rlock.h>
  81 #include <sys/vdev_disk.h>
  82 #include <sys/vdev_impl.h>
  83 #include <sys/vdev_raidz.h>
  84 #include <sys/zvol.h>
  85 #include <sys/dumphdr.h>
  86 #include <sys/zil_impl.h>
  87 #include <sys/dbuf.h>
  88 #include <sys/dmu_tx.h>
  89 #include <sys/zfeature.h>
  90 #include <sys/zio_checksum.h>
  91 #include <sys/zil_impl.h>
  92 #include <sys/dkioc_free_util.h>
  93 #include <sys/zfs_rlock.h>
  94 
  95 #include "zfs_namecheck.h"
  96 
  97 void *zfsdev_state;
  98 static char *zvol_tag = "zvol_tag";
  99 
 100 #define ZVOL_DUMPSIZE           "dumpsize"
 101 
 102 /*
 103  * This lock protects the zfsdev_state structure from being modified
 104  * while it's being used, e.g. an open that comes in before a create
 105  * finishes.  It also protects temporary opens of the dataset so that,
 106  * e.g., an open doesn't get a spurious EBUSY.
 107  */
 108 kmutex_t zfsdev_state_lock;
 109 static uint32_t zvol_minors;
 110 
 111 typedef struct zvol_extent {
 112         list_node_t     ze_node;
 113         dva_t           ze_dva;         /* dva associated with this extent */
 114         uint64_t        ze_nblks;       /* number of blocks in extent */
 115 } zvol_extent_t;
 116 
 117 /*
 118  * The in-core state of each volume.
 119  */
 120 typedef struct zvol_state {
 121         char            zv_name[MAXPATHLEN]; /* pool/dd name */
 122         uint64_t        zv_volsize;     /* amount of space we advertise */
 123         uint64_t        zv_volblocksize; /* volume block size */
 124         minor_t         zv_minor;       /* minor number */
 125         uint8_t         zv_min_bs;      /* minimum addressable block shift */
 126         uint8_t         zv_flags;       /* readonly, dumpified, etc. */
 127         objset_t        *zv_objset;     /* objset handle */
 128         uint32_t        zv_open_count[OTYPCNT]; /* open counts */
 129         uint32_t        zv_total_opens; /* total open count */
 130         zilog_t         *zv_zilog;      /* ZIL handle */
 131         list_t          zv_extents;     /* List of extents for dump */
 132         rangelock_t     zv_rangelock;
 133         dnode_t         *zv_dn;         /* dnode hold */
 134 } zvol_state_t;
 135 
 136 /*
 137  * zvol specific flags
 138  */
 139 #define ZVOL_RDONLY     0x1
 140 #define ZVOL_DUMPIFIED  0x2
 141 #define ZVOL_EXCL       0x4
 142 #define ZVOL_WCE        0x8
 143 
 144 /*
 145  * zvol maximum transfer in one DMU tx.
 146  */
 147 int zvol_maxphys = DMU_MAX_ACCESS/2;
 148 
 149 /*
 150  * Toggle unmap functionality.
 151  */
 152 boolean_t zvol_unmap_enabled = B_TRUE;
 153 
 154 /*
 155  * If true, unmaps requested as synchronous are executed synchronously,
 156  * otherwise all unmaps are asynchronous.
 157  */
 158 boolean_t zvol_unmap_sync_enabled = B_FALSE;
 159 
 160 extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
 161     nvlist_t *, nvlist_t *);
 162 static int zvol_remove_zv(zvol_state_t *);
 163 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf,
 164     struct lwb *lwb, zio_t *zio);
 165 static int zvol_dumpify(zvol_state_t *zv);
 166 static int zvol_dump_fini(zvol_state_t *zv);
 167 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 168 
 169 static void
 170 zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 171 {
 172         dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
 173 
 174         zv->zv_volsize = volsize;
 175         VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 176             "Size", volsize) == DDI_SUCCESS);
 177         VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 178             "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
 179 
 180         /* Notify specfs to invalidate the cached size */
 181         spec_size_invalidate(dev, VBLK);
 182         spec_size_invalidate(dev, VCHR);
 183 }
 184 
 185 int
 186 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 187 {
 188         if (volsize == 0)
 189                 return (SET_ERROR(EINVAL));
 190 
 191         if (volsize % blocksize != 0)
 192                 return (SET_ERROR(EINVAL));
 193 
 194 #ifdef _ILP32
 195         if (volsize - 1 > SPEC_MAXOFFSET_T)
 196                 return (SET_ERROR(EOVERFLOW));
 197 #endif
 198         return (0);
 199 }
 200 
 201 int
 202 zvol_check_volblocksize(uint64_t volblocksize)
 203 {
 204         if (volblocksize < SPA_MINBLOCKSIZE ||
 205             volblocksize > SPA_OLD_MAXBLOCKSIZE ||
 206             !ISP2(volblocksize))
 207                 return (SET_ERROR(EDOM));
 208 
 209         return (0);
 210 }
 211 
 212 int
 213 zvol_get_stats(objset_t *os, nvlist_t *nv)
 214 {
 215         int error;
 216         dmu_object_info_t doi;
 217         uint64_t val;
 218 
 219         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
 220         if (error)
 221                 return (error);
 222 
 223         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
 224 
 225         error = dmu_object_info(os, ZVOL_OBJ, &doi);
 226 
 227         if (error == 0) {
 228                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 229                     doi.doi_data_block_size);
 230         }
 231 
 232         return (error);
 233 }
 234 
 235 static zvol_state_t *
 236 zvol_minor_lookup(const char *name)
 237 {
 238         minor_t minor;
 239         zvol_state_t *zv;
 240 
 241         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 242 
 243         for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 244                 zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 245                 if (zv == NULL)
 246                         continue;
 247                 if (strcmp(zv->zv_name, name) == 0)
 248                         return (zv);
 249         }
 250 
 251         return (NULL);
 252 }
 253 
 254 /* extent mapping arg */
 255 struct maparg {
 256         zvol_state_t    *ma_zv;
 257         uint64_t        ma_blks;
 258 };
 259 
 260 /*ARGSUSED*/
 261 static int
 262 zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 263     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 264 {
 265         struct maparg *ma = arg;
 266         zvol_extent_t *ze;
 267         int bs = ma->ma_zv->zv_volblocksize;
 268 
 269         if (bp == NULL || BP_IS_HOLE(bp) ||
 270             zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 271                 return (0);
 272 
 273         VERIFY(!BP_IS_EMBEDDED(bp));
 274 
 275         VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
 276         ma->ma_blks++;
 277 
 278         /* Abort immediately if we have encountered gang blocks */
 279         if (BP_IS_GANG(bp))
 280                 return (SET_ERROR(EFRAGS));
 281 
 282         /*
 283          * See if the block is at the end of the previous extent.
 284          */
 285         ze = list_tail(&ma->ma_zv->zv_extents);
 286         if (ze &&
 287             DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
 288             DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
 289             DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
 290                 ze->ze_nblks++;
 291                 return (0);
 292         }
 293 
 294         dprintf_bp(bp, "%s", "next blkptr:");
 295 
 296         /* start a new extent */
 297         ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
 298         ze->ze_dva = bp->blk_dva[0];      /* structure assignment */
 299         ze->ze_nblks = 1;
 300         list_insert_tail(&ma->ma_zv->zv_extents, ze);
 301         return (0);
 302 }
 303 
 304 static void
 305 zvol_free_extents(zvol_state_t *zv)
 306 {
 307         zvol_extent_t *ze;
 308 
 309         while (ze = list_head(&zv->zv_extents)) {
 310                 list_remove(&zv->zv_extents, ze);
 311                 kmem_free(ze, sizeof (zvol_extent_t));
 312         }
 313 }
 314 
 315 static int
 316 zvol_get_lbas(zvol_state_t *zv)
 317 {
 318         objset_t *os = zv->zv_objset;
 319         struct maparg   ma;
 320         int             err;
 321 
 322         ma.ma_zv = zv;
 323         ma.ma_blks = 0;
 324         zvol_free_extents(zv);
 325 
 326         /* commit any in-flight changes before traversing the dataset */
 327         txg_wait_synced(dmu_objset_pool(os), 0);
 328         err = traverse_dataset(dmu_objset_ds(os), 0,
 329             TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
 330         if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
 331                 zvol_free_extents(zv);
 332                 return (err ? err : EIO);
 333         }
 334 
 335         return (0);
 336 }
 337 
 338 /* ARGSUSED */
 339 void
 340 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 341 {
 342         zfs_creat_t *zct = arg;
 343         nvlist_t *nvprops = zct->zct_props;
 344         int error;
 345         uint64_t volblocksize, volsize;
 346 
 347         VERIFY(nvlist_lookup_uint64(nvprops,
 348             zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
 349         if (nvlist_lookup_uint64(nvprops,
 350             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 351                 volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 352 
 353         /*
 354          * These properties must be removed from the list so the generic
 355          * property setting step won't apply to them.
 356          */
 357         VERIFY(nvlist_remove_all(nvprops,
 358             zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 359         (void) nvlist_remove_all(nvprops,
 360             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 361 
 362         error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 363             DMU_OT_NONE, 0, tx);
 364         ASSERT(error == 0);
 365 
 366         error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 367             DMU_OT_NONE, 0, tx);
 368         ASSERT(error == 0);
 369 
 370         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 371         ASSERT(error == 0);
 372 }
 373 
 374 /*
 375  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 376  * implement DKIOCFREE/free-long-range.
 377  */
 378 static int
 379 zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 380 {
 381         zvol_state_t *zv = arg1;
 382         lr_truncate_t *lr = arg2;
 383         uint64_t offset, length;
 384 
 385         if (byteswap)
 386                 byteswap_uint64_array(lr, sizeof (*lr));
 387 
 388         offset = lr->lr_offset;
 389         length = lr->lr_length;
 390 
 391         return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
 392 }
 393 
 394 /*
 395  * Replay a TX_WRITE ZIL transaction that didn't get committed
 396  * after a system failure
 397  */
 398 static int
 399 zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 400 {
 401         zvol_state_t *zv = arg1;
 402         lr_write_t *lr = arg2;
 403         objset_t *os = zv->zv_objset;
 404         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 405         uint64_t offset, length;
 406         dmu_tx_t *tx;
 407         int error;
 408 
 409         if (byteswap)
 410                 byteswap_uint64_array(lr, sizeof (*lr));
 411 
 412         offset = lr->lr_offset;
 413         length = lr->lr_length;
 414 
 415         /* If it's a dmu_sync() block, write the whole block */
 416         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 417                 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 418                 if (length < blocksize) {
 419                         offset -= offset % blocksize;
 420                         length = blocksize;
 421                 }
 422         }
 423 
 424         tx = dmu_tx_create(os);
 425         dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 426         error = dmu_tx_assign(tx, TXG_WAIT);
 427         if (error) {
 428                 dmu_tx_abort(tx);
 429         } else {
 430                 dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 431                 dmu_tx_commit(tx);
 432         }
 433 
 434         return (error);
 435 }
 436 
 437 /* ARGSUSED */
 438 static int
 439 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
 440 {
 441         return (SET_ERROR(ENOTSUP));
 442 }
 443 
 444 /*
 445  * Callback vectors for replaying records.
 446  * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 447  */
 448 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 449         zvol_replay_err,        /* 0 no such transaction type */
 450         zvol_replay_err,        /* TX_CREATE */
 451         zvol_replay_err,        /* TX_MKDIR */
 452         zvol_replay_err,        /* TX_MKXATTR */
 453         zvol_replay_err,        /* TX_SYMLINK */
 454         zvol_replay_err,        /* TX_REMOVE */
 455         zvol_replay_err,        /* TX_RMDIR */
 456         zvol_replay_err,        /* TX_LINK */
 457         zvol_replay_err,        /* TX_RENAME */
 458         zvol_replay_write,      /* TX_WRITE */
 459         zvol_replay_truncate,   /* TX_TRUNCATE */
 460         zvol_replay_err,        /* TX_SETATTR */
 461         zvol_replay_err,        /* TX_ACL */
 462         zvol_replay_err,        /* TX_CREATE_ACL */
 463         zvol_replay_err,        /* TX_CREATE_ATTR */
 464         zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
 465         zvol_replay_err,        /* TX_MKDIR_ACL */
 466         zvol_replay_err,        /* TX_MKDIR_ATTR */
 467         zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
 468         zvol_replay_err,        /* TX_WRITE2 */
 469 };
 470 
 471 int
 472 zvol_name2minor(const char *name, minor_t *minor)
 473 {
 474         zvol_state_t *zv;
 475 
 476         mutex_enter(&zfsdev_state_lock);
 477         zv = zvol_minor_lookup(name);
 478         if (minor && zv)
 479                 *minor = zv->zv_minor;
 480         mutex_exit(&zfsdev_state_lock);
 481         return (zv ? 0 : -1);
 482 }
 483 
 484 /*
 485  * Create a minor node (plus a whole lot more) for the specified volume.
 486  */
 487 int
 488 zvol_create_minor(const char *name)
 489 {
 490         zfs_soft_state_t *zs;
 491         zvol_state_t *zv;
 492         objset_t *os;
 493         dmu_object_info_t doi;
 494         minor_t minor = 0;
 495         char chrbuf[30], blkbuf[30];
 496         int error;
 497 
 498         mutex_enter(&zfsdev_state_lock);
 499 
 500         if (zvol_minor_lookup(name) != NULL) {
 501                 mutex_exit(&zfsdev_state_lock);
 502                 return (SET_ERROR(EEXIST));
 503         }
 504 
 505         /* lie and say we're read-only */
 506         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 507 
 508         if (error) {
 509                 mutex_exit(&zfsdev_state_lock);
 510                 return (error);
 511         }
 512 
 513         if ((minor = zfsdev_minor_alloc()) == 0) {
 514                 dmu_objset_disown(os, FTAG);
 515                 mutex_exit(&zfsdev_state_lock);
 516                 return (SET_ERROR(ENXIO));
 517         }
 518 
 519         if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
 520                 dmu_objset_disown(os, FTAG);
 521                 mutex_exit(&zfsdev_state_lock);
 522                 return (SET_ERROR(EAGAIN));
 523         }
 524         (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
 525             (char *)name);
 526 
 527         (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
 528 
 529         if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
 530             minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 531                 ddi_soft_state_free(zfsdev_state, minor);
 532                 dmu_objset_disown(os, FTAG);
 533                 mutex_exit(&zfsdev_state_lock);
 534                 return (SET_ERROR(EAGAIN));
 535         }
 536 
 537         (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
 538 
 539         if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
 540             minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 541                 ddi_remove_minor_node(zfs_dip, chrbuf);
 542                 ddi_soft_state_free(zfsdev_state, minor);
 543                 dmu_objset_disown(os, FTAG);
 544                 mutex_exit(&zfsdev_state_lock);
 545                 return (SET_ERROR(EAGAIN));
 546         }
 547 
 548         zs = ddi_get_soft_state(zfsdev_state, minor);
 549         zs->zss_type = ZSST_ZVOL;
 550         zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 551         (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 552         zv->zv_min_bs = DEV_BSHIFT;
 553         zv->zv_minor = minor;
 554         zv->zv_objset = os;
 555         if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 556                 zv->zv_flags |= ZVOL_RDONLY;
 557         rangelock_init(&zv->zv_rangelock, NULL, NULL);
 558         list_create(&zv->zv_extents, sizeof (zvol_extent_t),
 559             offsetof(zvol_extent_t, ze_node));
 560         /* get and cache the blocksize */
 561         error = dmu_object_info(os, ZVOL_OBJ, &doi);
 562         ASSERT(error == 0);
 563         zv->zv_volblocksize = doi.doi_data_block_size;
 564 
 565         if (spa_writeable(dmu_objset_spa(os))) {
 566                 if (zil_replay_disable)
 567                         zil_destroy(dmu_objset_zil(os), B_FALSE);
 568                 else
 569                         zil_replay(os, zv, zvol_replay_vector);
 570         }
 571         dmu_objset_disown(os, FTAG);
 572         zv->zv_objset = NULL;
 573 
 574         zvol_minors++;
 575 
 576         mutex_exit(&zfsdev_state_lock);
 577 
 578         return (0);
 579 }
 580 
 581 /*
 582  * Remove minor node for the specified volume.
 583  */
 584 static int
 585 zvol_remove_zv(zvol_state_t *zv)
 586 {
 587         char nmbuf[20];
 588         minor_t minor = zv->zv_minor;
 589 
 590         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 591         if (zv->zv_total_opens != 0)
 592                 return (SET_ERROR(EBUSY));
 593 
 594         (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
 595         ddi_remove_minor_node(zfs_dip, nmbuf);
 596 
 597         (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
 598         ddi_remove_minor_node(zfs_dip, nmbuf);
 599 
 600         rangelock_fini(&zv->zv_rangelock);
 601 
 602         kmem_free(zv, sizeof (zvol_state_t));
 603 
 604         ddi_soft_state_free(zfsdev_state, minor);
 605 
 606         zvol_minors--;
 607         return (0);
 608 }
 609 
 610 int
 611 zvol_remove_minor(const char *name)
 612 {
 613         zvol_state_t *zv;
 614         int rc;
 615 
 616         mutex_enter(&zfsdev_state_lock);
 617         if ((zv = zvol_minor_lookup(name)) == NULL) {
 618                 mutex_exit(&zfsdev_state_lock);
 619                 return (SET_ERROR(ENXIO));
 620         }
 621         rc = zvol_remove_zv(zv);
 622         mutex_exit(&zfsdev_state_lock);
 623         return (rc);
 624 }
 625 
 626 int
 627 zvol_first_open(zvol_state_t *zv)
 628 {
 629         objset_t *os;
 630         uint64_t volsize;
 631         int error;
 632         uint64_t readonly;
 633 
 634         /* lie and say we're read-only */
 635         error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
 636             zvol_tag, &os);
 637         if (error)
 638                 return (error);
 639 
 640         zv->zv_objset = os;
 641         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 642         if (error) {
 643                 ASSERT(error == 0);
 644                 dmu_objset_disown(os, zvol_tag);
 645                 return (error);
 646         }
 647 
 648         error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn);
 649         if (error) {
 650                 dmu_objset_disown(os, zvol_tag);
 651                 return (error);
 652         }
 653 
 654         zvol_size_changed(zv, volsize);
 655         zv->zv_zilog = zil_open(os, zvol_get_data);
 656 
 657         VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
 658             NULL) == 0);
 659         if (readonly || dmu_objset_is_snapshot(os) ||
 660             !spa_writeable(dmu_objset_spa(os)))
 661                 zv->zv_flags |= ZVOL_RDONLY;
 662         else
 663                 zv->zv_flags &= ~ZVOL_RDONLY;
 664         return (error);
 665 }
 666 
 667 void
 668 zvol_last_close(zvol_state_t *zv)
 669 {
 670         zil_close(zv->zv_zilog);
 671         zv->zv_zilog = NULL;
 672 
 673         dnode_rele(zv->zv_dn, zvol_tag);
 674         zv->zv_dn = NULL;
 675 
 676         /*
 677          * Evict cached data
 678          */
 679         if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
 680             !(zv->zv_flags & ZVOL_RDONLY))
 681                 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 682         dmu_objset_evict_dbufs(zv->zv_objset);
 683 
 684         dmu_objset_disown(zv->zv_objset, zvol_tag);
 685         zv->zv_objset = NULL;
 686 }
 687 
 688 int
 689 zvol_prealloc(zvol_state_t *zv)
 690 {
 691         objset_t *os = zv->zv_objset;
 692         dmu_tx_t *tx;
 693         uint64_t refd, avail, usedobjs, availobjs;
 694         uint64_t resid = zv->zv_volsize;
 695         uint64_t off = 0;
 696 
 697         /* Check the space usage before attempting to allocate the space */
 698         dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
 699         if (avail < zv->zv_volsize)
 700                 return (SET_ERROR(ENOSPC));
 701 
 702         /* Free old extents if they exist */
 703         zvol_free_extents(zv);
 704 
 705         while (resid != 0) {
 706                 int error;
 707                 uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
 708 
 709                 tx = dmu_tx_create(os);
 710                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 711                 error = dmu_tx_assign(tx, TXG_WAIT);
 712                 if (error) {
 713                         dmu_tx_abort(tx);
 714                         (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
 715                         return (error);
 716                 }
 717                 dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
 718                 dmu_tx_commit(tx);
 719                 off += bytes;
 720                 resid -= bytes;
 721         }
 722         txg_wait_synced(dmu_objset_pool(os), 0);
 723 
 724         return (0);
 725 }
 726 
 727 static int
 728 zvol_update_volsize(objset_t *os, uint64_t volsize)
 729 {
 730         dmu_tx_t *tx;
 731         int error;
 732 
 733         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 734 
 735         tx = dmu_tx_create(os);
 736         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 737         dmu_tx_mark_netfree(tx);
 738         error = dmu_tx_assign(tx, TXG_WAIT);
 739         if (error) {
 740                 dmu_tx_abort(tx);
 741                 return (error);
 742         }
 743 
 744         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 745             &volsize, tx);
 746         dmu_tx_commit(tx);
 747 
 748         if (error == 0)
 749                 error = dmu_free_long_range(os,
 750                     ZVOL_OBJ, volsize, DMU_OBJECT_END);
 751         return (error);
 752 }
 753 
 754 void
 755 zvol_remove_minors(const char *name)
 756 {
 757         zvol_state_t *zv;
 758         char *namebuf;
 759         minor_t minor;
 760 
 761         namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
 762         (void) strncpy(namebuf, name, strlen(name));
 763         (void) strcat(namebuf, "/");
 764         mutex_enter(&zfsdev_state_lock);
 765         for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 766 
 767                 zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 768                 if (zv == NULL)
 769                         continue;
 770                 if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
 771                         (void) zvol_remove_zv(zv);
 772         }
 773         kmem_free(namebuf, strlen(name) + 2);
 774 
 775         mutex_exit(&zfsdev_state_lock);
 776 }
 777 
 778 static int
 779 zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
 780 {
 781         uint64_t old_volsize = 0ULL;
 782         int error = 0;
 783 
 784         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 785 
 786         /*
 787          * Reinitialize the dump area to the new size. If we
 788          * failed to resize the dump area then restore it back to
 789          * its original size.  We must set the new volsize prior
 790          * to calling dumpvp_resize() to ensure that the devices'
 791          * size(9P) is not visible by the dump subsystem.
 792          */
 793         old_volsize = zv->zv_volsize;
 794         zvol_size_changed(zv, volsize);
 795 
 796         if (zv->zv_flags & ZVOL_DUMPIFIED) {
 797                 if ((error = zvol_dumpify(zv)) != 0 ||
 798                     (error = dumpvp_resize()) != 0) {
 799                         int dumpify_error;
 800 
 801                         (void) zvol_update_volsize(zv->zv_objset, old_volsize);
 802                         zvol_size_changed(zv, old_volsize);
 803                         dumpify_error = zvol_dumpify(zv);
 804                         error = dumpify_error ? dumpify_error : error;
 805                 }
 806         }
 807 
 808         /*
 809          * Generate a LUN expansion event.
 810          */
 811         if (error == 0) {
 812                 sysevent_id_t eid;
 813                 nvlist_t *attr;
 814                 char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 815 
 816                 (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
 817                     zv->zv_minor);
 818 
 819                 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 820                 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
 821 
 822                 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 823                     ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
 824 
 825                 nvlist_free(attr);
 826                 kmem_free(physpath, MAXPATHLEN);
 827         }
 828         return (error);
 829 }
 830 
 831 int
 832 zvol_set_volsize(const char *name, uint64_t volsize)
 833 {
 834         zvol_state_t *zv = NULL;
 835         objset_t *os;
 836         int error;
 837         dmu_object_info_t doi;
 838         uint64_t readonly;
 839         boolean_t owned = B_FALSE;
 840 
 841         error = dsl_prop_get_integer(name,
 842             zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
 843         if (error != 0)
 844                 return (error);
 845         if (readonly)
 846                 return (SET_ERROR(EROFS));
 847 
 848         mutex_enter(&zfsdev_state_lock);
 849         zv = zvol_minor_lookup(name);
 850 
 851         if (zv == NULL || zv->zv_objset == NULL) {
 852                 if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
 853                     FTAG, &os)) != 0) {
 854                         mutex_exit(&zfsdev_state_lock);
 855                         return (error);
 856                 }
 857                 owned = B_TRUE;
 858                 if (zv != NULL)
 859                         zv->zv_objset = os;
 860         } else {
 861                 os = zv->zv_objset;
 862         }
 863 
 864         if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
 865             (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
 866                 goto out;
 867 
 868         error = zvol_update_volsize(os, volsize);
 869 
 870         if (error == 0 && zv != NULL)
 871                 error = zvol_update_live_volsize(zv, volsize);
 872 out:
 873         if (owned) {
 874                 dmu_objset_disown(os, FTAG);
 875                 if (zv != NULL)
 876                         zv->zv_objset = NULL;
 877         }
 878         mutex_exit(&zfsdev_state_lock);
 879         return (error);
 880 }
 881 
 882 /*ARGSUSED*/
 883 int
 884 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
 885 {
 886         zvol_state_t *zv;
 887         int err = 0;
 888 
 889         mutex_enter(&zfsdev_state_lock);
 890 
 891         zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
 892         if (zv == NULL) {
 893                 mutex_exit(&zfsdev_state_lock);
 894                 return (SET_ERROR(ENXIO));
 895         }
 896 
 897         if (zv->zv_total_opens == 0)
 898                 err = zvol_first_open(zv);
 899         if (err) {
 900                 mutex_exit(&zfsdev_state_lock);
 901                 return (err);
 902         }
 903         if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 904                 err = SET_ERROR(EROFS);
 905                 goto out;
 906         }
 907         if (zv->zv_flags & ZVOL_EXCL) {
 908                 err = SET_ERROR(EBUSY);
 909                 goto out;
 910         }
 911         if (flag & FEXCL) {
 912                 if (zv->zv_total_opens != 0) {
 913                         err = SET_ERROR(EBUSY);
 914                         goto out;
 915                 }
 916                 zv->zv_flags |= ZVOL_EXCL;
 917         }
 918 
 919         if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
 920                 zv->zv_open_count[otyp]++;
 921                 zv->zv_total_opens++;
 922         }
 923         mutex_exit(&zfsdev_state_lock);
 924 
 925         return (err);
 926 out:
 927         if (zv->zv_total_opens == 0)
 928                 zvol_last_close(zv);
 929         mutex_exit(&zfsdev_state_lock);
 930         return (err);
 931 }
 932 
 933 /*ARGSUSED*/
 934 int
 935 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
 936 {
 937         minor_t minor = getminor(dev);
 938         zvol_state_t *zv;
 939         int error = 0;
 940 
 941         mutex_enter(&zfsdev_state_lock);
 942 
 943         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 944         if (zv == NULL) {
 945                 mutex_exit(&zfsdev_state_lock);
 946                 return (SET_ERROR(ENXIO));
 947         }
 948 
 949         if (zv->zv_flags & ZVOL_EXCL) {
 950                 ASSERT(zv->zv_total_opens == 1);
 951                 zv->zv_flags &= ~ZVOL_EXCL;
 952         }
 953 
 954         /*
 955          * If the open count is zero, this is a spurious close.
 956          * That indicates a bug in the kernel / DDI framework.
 957          */
 958         ASSERT(zv->zv_open_count[otyp] != 0);
 959         ASSERT(zv->zv_total_opens != 0);
 960 
 961         /*
 962          * You may get multiple opens, but only one close.
 963          */
 964         zv->zv_open_count[otyp]--;
 965         zv->zv_total_opens--;
 966 
 967         if (zv->zv_total_opens == 0)
 968                 zvol_last_close(zv);
 969 
 970         mutex_exit(&zfsdev_state_lock);
 971         return (error);
 972 }
 973 
 974 /* ARGSUSED */
 975 static void
 976 zvol_get_done(zgd_t *zgd, int error)
 977 {
 978         if (zgd->zgd_db)
 979                 dmu_buf_rele(zgd->zgd_db, zgd);
 980 
 981         rangelock_exit(zgd->zgd_lr);
 982 
 983         kmem_free(zgd, sizeof (zgd_t));
 984 }
 985 
 986 /*
 987  * Get data to generate a TX_WRITE intent log record.
 988  */
 989 static int
 990 zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 991 {
 992         zvol_state_t *zv = arg;
 993         uint64_t offset = lr->lr_offset;
 994         uint64_t size = lr->lr_length;       /* length of user data */
 995         dmu_buf_t *db;
 996         zgd_t *zgd;
 997         int error;
 998 
 999         ASSERT3P(lwb, !=, NULL);
1000         ASSERT3P(zio, !=, NULL);
1001         ASSERT3U(size, !=, 0);
1002 
1003         zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1004         zgd->zgd_lwb = lwb;
1005 
1006         /*
1007          * Write records come in two flavors: immediate and indirect.
1008          * For small writes it's cheaper to store the data with the
1009          * log record (immediate); for large writes it's cheaper to
1010          * sync the data and get a pointer to it (indirect) so that
1011          * we don't have to write the data twice.
1012          */
1013         if (buf != NULL) { /* immediate write */
1014                 zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
1015                     RL_READER);
1016                 error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
1017                     DMU_READ_NO_PREFETCH);
1018         } else { /* indirect write */
1019                 /*
1020                  * Have to lock the whole block to ensure when it's written out
1021                  * and its checksum is being calculated that no one can change
1022                  * the data. Contrarily to zfs_get_data we need not re-check
1023                  * blocksize after we get the lock because it cannot be changed.
1024                  */
1025                 size = zv->zv_volblocksize;
1026                 offset = P2ALIGN(offset, size);
1027                 zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
1028                     RL_READER);
1029                 error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
1030                     DMU_READ_NO_PREFETCH);
1031                 if (error == 0) {
1032                         blkptr_t *bp = &lr->lr_blkptr;
1033 
1034                         zgd->zgd_db = db;
1035                         zgd->zgd_bp = bp;
1036 
1037                         ASSERT(db->db_offset == offset);
1038                         ASSERT(db->db_size == size);
1039 
1040                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
1041                             zvol_get_done, zgd);
1042 
1043                         if (error == 0)
1044                                 return (0);
1045                 }
1046         }
1047 
1048         zvol_get_done(zgd, error);
1049 
1050         return (error);
1051 }
1052 
1053 /*
1054  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1055  *
1056  * We store data in the log buffers if it's small enough.
1057  * Otherwise we will later flush the data out via dmu_sync().
1058  */
1059 ssize_t zvol_immediate_write_sz = 32768;
1060 
1061 static void
1062 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1063     boolean_t sync)
1064 {
1065         uint32_t blocksize = zv->zv_volblocksize;
1066         zilog_t *zilog = zv->zv_zilog;
1067         itx_wr_state_t write_state;
1068 
1069         if (zil_replaying(zilog, tx))
1070                 return;
1071 
1072         if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1073                 write_state = WR_INDIRECT;
1074         else if (!spa_has_slogs(zilog->zl_spa) &&
1075             resid >= blocksize && blocksize > zvol_immediate_write_sz)
1076                 write_state = WR_INDIRECT;
1077         else if (sync)
1078                 write_state = WR_COPIED;
1079         else
1080                 write_state = WR_NEED_COPY;
1081 
1082         while (resid) {
1083                 itx_t *itx;
1084                 lr_write_t *lr;
1085                 itx_wr_state_t wr_state = write_state;
1086                 ssize_t len = resid;
1087 
1088                 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
1089                         wr_state = WR_NEED_COPY;
1090                 else if (wr_state == WR_INDIRECT)
1091                         len = MIN(blocksize - P2PHASE(off, blocksize), resid);
1092 
1093                 itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1094                     (wr_state == WR_COPIED ? len : 0));
1095                 lr = (lr_write_t *)&itx->itx_lr;
1096                 if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
1097                     off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1098                         zil_itx_destroy(itx);
1099                         itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1100                         lr = (lr_write_t *)&itx->itx_lr;
1101                         wr_state = WR_NEED_COPY;
1102                 }
1103 
1104                 itx->itx_wr_state = wr_state;
1105                 lr->lr_foid = ZVOL_OBJ;
1106                 lr->lr_offset = off;
1107                 lr->lr_length = len;
1108                 lr->lr_blkoff = 0;
1109                 BP_ZERO(&lr->lr_blkptr);
1110 
1111                 itx->itx_private = zv;
1112                 itx->itx_sync = sync;
1113 
1114                 zil_itx_assign(zilog, itx, tx);
1115 
1116                 off += len;
1117                 resid -= len;
1118         }
1119 }
1120 
1121 static int
1122 zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1123     uint64_t size, boolean_t doread, boolean_t isdump)
1124 {
1125         vdev_disk_t *dvd;
1126         int c;
1127         int numerrors = 0;
1128 
1129         if (vd->vdev_ops == &vdev_mirror_ops ||
1130             vd->vdev_ops == &vdev_replacing_ops ||
1131             vd->vdev_ops == &vdev_spare_ops) {
1132                 for (c = 0; c < vd->vdev_children; c++) {
1133                         int err = zvol_dumpio_vdev(vd->vdev_child[c],
1134                             addr, offset, origoffset, size, doread, isdump);
1135                         if (err != 0) {
1136                                 numerrors++;
1137                         } else if (doread) {
1138                                 break;
1139                         }
1140                 }
1141         }
1142 
1143         if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1144                 return (numerrors < vd->vdev_children ? 0 : EIO);
1145 
1146         if (doread && !vdev_readable(vd))
1147                 return (SET_ERROR(EIO));
1148         else if (!doread && !vdev_writeable(vd))
1149                 return (SET_ERROR(EIO));
1150 
1151         if (vd->vdev_ops == &vdev_raidz_ops) {
1152                 return (vdev_raidz_physio(vd,
1153                     addr, size, offset, origoffset, doread, isdump));
1154         }
1155 
1156         offset += VDEV_LABEL_START_SIZE;
1157 
1158         if (ddi_in_panic() || isdump) {
1159                 ASSERT(!doread);
1160                 if (doread)
1161                         return (SET_ERROR(EIO));
1162                 dvd = vd->vdev_tsd;
1163                 ASSERT3P(dvd, !=, NULL);
1164                 return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1165                     lbtodb(size)));
1166         } else {
1167                 dvd = vd->vdev_tsd;
1168                 ASSERT3P(dvd, !=, NULL);
1169                 return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1170                     offset, doread ? B_READ : B_WRITE));
1171         }
1172 }
1173 
1174 static int
1175 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1176     boolean_t doread, boolean_t isdump)
1177 {
1178         vdev_t *vd;
1179         int error;
1180         zvol_extent_t *ze;
1181         spa_t *spa = dmu_objset_spa(zv->zv_objset);
1182 
1183         /* Must be sector aligned, and not stradle a block boundary. */
1184         if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1185             P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1186                 return (SET_ERROR(EINVAL));
1187         }
1188         ASSERT(size <= zv->zv_volblocksize);
1189 
1190         /* Locate the extent this belongs to */
1191         ze = list_head(&zv->zv_extents);
1192         while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1193                 offset -= ze->ze_nblks * zv->zv_volblocksize;
1194                 ze = list_next(&zv->zv_extents, ze);
1195         }
1196 
1197         if (ze == NULL)
1198                 return (SET_ERROR(EINVAL));
1199 
1200         if (!ddi_in_panic())
1201                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1202 
1203         vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1204         offset += DVA_GET_OFFSET(&ze->ze_dva);
1205         error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1206             size, doread, isdump);
1207 
1208         if (!ddi_in_panic())
1209                 spa_config_exit(spa, SCL_STATE, FTAG);
1210 
1211         return (error);
1212 }
1213 
1214 int
1215 zvol_strategy(buf_t *bp)
1216 {
1217         zfs_soft_state_t *zs = NULL;
1218         zvol_state_t *zv;
1219         uint64_t off, volsize;
1220         size_t resid;
1221         char *addr;
1222         objset_t *os;
1223         int error = 0;
1224         boolean_t doread = bp->b_flags & B_READ;
1225         boolean_t is_dumpified;
1226         boolean_t sync;
1227 
1228         if (getminor(bp->b_edev) == 0) {
1229                 error = SET_ERROR(EINVAL);
1230         } else {
1231                 zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
1232                 if (zs == NULL)
1233                         error = SET_ERROR(ENXIO);
1234                 else if (zs->zss_type != ZSST_ZVOL)
1235                         error = SET_ERROR(EINVAL);
1236         }
1237 
1238         if (error) {
1239                 bioerror(bp, error);
1240                 biodone(bp);
1241                 return (0);
1242         }
1243 
1244         zv = zs->zss_data;
1245 
1246         if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
1247                 bioerror(bp, EROFS);
1248                 biodone(bp);
1249                 return (0);
1250         }
1251 
1252         off = ldbtob(bp->b_blkno);
1253         volsize = zv->zv_volsize;
1254 
1255         os = zv->zv_objset;
1256         ASSERT(os != NULL);
1257 
1258         bp_mapin(bp);
1259         addr = bp->b_un.b_addr;
1260         resid = bp->b_bcount;
1261 
1262         if (resid > 0 && (off < 0 || off >= volsize)) {
1263                 bioerror(bp, EIO);
1264                 biodone(bp);
1265                 return (0);
1266         }
1267 
1268         is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1269         sync = ((!(bp->b_flags & B_ASYNC) &&
1270             !(zv->zv_flags & ZVOL_WCE)) ||
1271             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1272             !doread && !is_dumpified;
1273 
1274         /*
1275          * There must be no buffer changes when doing a dmu_sync() because
1276          * we can't change the data whilst calculating the checksum.
1277          */
1278         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid,
1279             doread ? RL_READER : RL_WRITER);
1280 
1281         while (resid != 0 && off < volsize) {
1282                 size_t size = MIN(resid, zvol_maxphys);
1283                 if (is_dumpified) {
1284                         size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1285                         error = zvol_dumpio(zv, addr, off, size,
1286                             doread, B_FALSE);
1287                 } else if (doread) {
1288                         error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1289                             DMU_READ_PREFETCH);
1290                 } else {
1291                         dmu_tx_t *tx = dmu_tx_create(os);
1292                         dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1293                         error = dmu_tx_assign(tx, TXG_WAIT);
1294                         if (error) {
1295                                 dmu_tx_abort(tx);
1296                         } else {
1297                                 dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1298                                 zvol_log_write(zv, tx, off, size, sync);
1299                                 dmu_tx_commit(tx);
1300                         }
1301                 }
1302                 if (error) {
1303                         /* convert checksum errors into IO errors */
1304                         if (error == ECKSUM)
1305                                 error = SET_ERROR(EIO);
1306                         break;
1307                 }
1308                 off += size;
1309                 addr += size;
1310                 resid -= size;
1311         }
1312         rangelock_exit(lr);
1313 
1314         if ((bp->b_resid = resid) == bp->b_bcount)
1315                 bioerror(bp, off > volsize ? EINVAL : error);
1316 
1317         if (sync)
1318                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1319         biodone(bp);
1320 
1321         return (0);
1322 }
1323 
1324 /*
1325  * Set the buffer count to the zvol maximum transfer.
1326  * Using our own routine instead of the default minphys()
1327  * means that for larger writes we write bigger buffers on X86
1328  * (128K instead of 56K) and flush the disk write cache less often
1329  * (every zvol_maxphys - currently 1MB) instead of minphys (currently
1330  * 56K on X86 and 128K on sparc).
1331  */
1332 void
1333 zvol_minphys(struct buf *bp)
1334 {
1335         if (bp->b_bcount > zvol_maxphys)
1336                 bp->b_bcount = zvol_maxphys;
1337 }
1338 
1339 int
1340 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1341 {
1342         minor_t minor = getminor(dev);
1343         zvol_state_t *zv;
1344         int error = 0;
1345         uint64_t size;
1346         uint64_t boff;
1347         uint64_t resid;
1348 
1349         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1350         if (zv == NULL)
1351                 return (SET_ERROR(ENXIO));
1352 
1353         if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
1354                 return (SET_ERROR(EINVAL));
1355 
1356         boff = ldbtob(blkno);
1357         resid = ldbtob(nblocks);
1358 
1359         VERIFY3U(boff + resid, <=, zv->zv_volsize);
1360 
1361         while (resid) {
1362                 size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1363                 error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1364                 if (error)
1365                         break;
1366                 boff += size;
1367                 addr += size;
1368                 resid -= size;
1369         }
1370 
1371         return (error);
1372 }
1373 
1374 /*ARGSUSED*/
1375 int
1376 zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1377 {
1378         minor_t minor = getminor(dev);
1379         zvol_state_t *zv;
1380         uint64_t volsize;
1381         int error = 0;
1382 
1383         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1384         if (zv == NULL)
1385                 return (SET_ERROR(ENXIO));
1386 
1387         volsize = zv->zv_volsize;
1388         if (uio->uio_resid > 0 &&
1389             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1390                 return (SET_ERROR(EIO));
1391 
1392         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1393                 error = physio(zvol_strategy, NULL, dev, B_READ,
1394                     zvol_minphys, uio);
1395                 return (error);
1396         }
1397 
1398         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
1399             uio->uio_loffset, uio->uio_resid, RL_READER);
1400         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1401                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1402 
1403                 /* don't read past the end */
1404                 if (bytes > volsize - uio->uio_loffset)
1405                         bytes = volsize - uio->uio_loffset;
1406 
1407                 error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1408                 if (error) {
1409                         /* convert checksum errors into IO errors */
1410                         if (error == ECKSUM)
1411                                 error = SET_ERROR(EIO);
1412                         break;
1413                 }
1414         }
1415         rangelock_exit(lr);
1416 
1417         return (error);
1418 }
1419 
1420 /*ARGSUSED*/
1421 int
1422 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1423 {
1424         minor_t minor = getminor(dev);
1425         zvol_state_t *zv;
1426         uint64_t volsize;
1427         int error = 0;
1428         boolean_t sync;
1429 
1430         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1431         if (zv == NULL)
1432                 return (SET_ERROR(ENXIO));
1433 
1434         volsize = zv->zv_volsize;
1435         if (uio->uio_resid > 0 &&
1436             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1437                 return (SET_ERROR(EIO));
1438 
1439         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1440                 error = physio(zvol_strategy, NULL, dev, B_WRITE,
1441                     zvol_minphys, uio);
1442                 return (error);
1443         }
1444 
1445         sync = !(zv->zv_flags & ZVOL_WCE) ||
1446             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1447 
1448         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
1449             uio->uio_loffset, uio->uio_resid, RL_WRITER);
1450         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1451                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1452                 uint64_t off = uio->uio_loffset;
1453                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1454 
1455                 if (bytes > volsize - off)   /* don't write past the end */
1456                         bytes = volsize - off;
1457 
1458                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1459                 error = dmu_tx_assign(tx, TXG_WAIT);
1460                 if (error) {
1461                         dmu_tx_abort(tx);
1462                         break;
1463                 }
1464                 error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
1465                 if (error == 0)
1466                         zvol_log_write(zv, tx, off, bytes, sync);
1467                 dmu_tx_commit(tx);
1468 
1469                 if (error)
1470                         break;
1471         }
1472         rangelock_exit(lr);
1473 
1474         if (sync)
1475                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1476         return (error);
1477 }
1478 
1479 int
1480 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1481 {
1482         struct uuid uuid = EFI_RESERVED;
1483         efi_gpe_t gpe = { 0 };
1484         uint32_t crc;
1485         dk_efi_t efi;
1486         int length;
1487         char *ptr;
1488 
1489         if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1490                 return (SET_ERROR(EFAULT));
1491         ptr = (char *)(uintptr_t)efi.dki_data_64;
1492         length = efi.dki_length;
1493         /*
1494          * Some clients may attempt to request a PMBR for the
1495          * zvol.  Currently this interface will return EINVAL to
1496          * such requests.  These requests could be supported by
1497          * adding a check for lba == 0 and consing up an appropriate
1498          * PMBR.
1499          */
1500         if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1501                 return (SET_ERROR(EINVAL));
1502 
1503         gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1504         gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1505         UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1506 
1507         if (efi.dki_lba == 1) {
1508                 efi_gpt_t gpt = { 0 };
1509 
1510                 gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1511                 gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1512                 gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1513                 gpt.efi_gpt_MyLBA = LE_64(1ULL);
1514                 gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1515                 gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1516                 gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1517                 gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1518                 gpt.efi_gpt_SizeOfPartitionEntry =
1519                     LE_32(sizeof (efi_gpe_t));
1520                 CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1521                 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1522                 CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1523                 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1524                 if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1525                     flag))
1526                         return (SET_ERROR(EFAULT));
1527                 ptr += sizeof (gpt);
1528                 length -= sizeof (gpt);
1529         }
1530         if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1531             length), flag))
1532                 return (SET_ERROR(EFAULT));
1533         return (0);
1534 }
1535 
1536 /*
1537  * BEGIN entry points to allow external callers access to the volume.
1538  */
1539 /*
1540  * Return the volume parameters needed for access from an external caller.
1541  * These values are invariant as long as the volume is held open.
1542  */
1543 int
1544 zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1545     uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1546     void **rl_hdl, void **dnode_hdl)
1547 {
1548         zvol_state_t *zv;
1549 
1550         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1551         if (zv == NULL)
1552                 return (SET_ERROR(ENXIO));
1553         if (zv->zv_flags & ZVOL_DUMPIFIED)
1554                 return (SET_ERROR(ENXIO));
1555 
1556         ASSERT(blksize && max_xfer_len && minor_hdl &&
1557             objset_hdl && zil_hdl && rl_hdl && dnode_hdl);
1558 
1559         *blksize = zv->zv_volblocksize;
1560         *max_xfer_len = (uint64_t)zvol_maxphys;
1561         *minor_hdl = zv;
1562         *objset_hdl = zv->zv_objset;
1563         *zil_hdl = zv->zv_zilog;
1564         *rl_hdl = &zv->zv_rangelock;
1565         *dnode_hdl = zv->zv_dn;
1566         return (0);
1567 }
1568 
1569 /*
1570  * Return the current volume size to an external caller.
1571  * The size can change while the volume is open.
1572  */
1573 uint64_t
1574 zvol_get_volume_size(void *minor_hdl)
1575 {
1576         zvol_state_t *zv = minor_hdl;
1577 
1578         return (zv->zv_volsize);
1579 }
1580 
1581 /*
1582  * Return the current WCE setting to an external caller.
1583  * The WCE setting can change while the volume is open.
1584  */
1585 int
1586 zvol_get_volume_wce(void *minor_hdl)
1587 {
1588         zvol_state_t *zv = minor_hdl;
1589 
1590         return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1591 }
1592 
1593 /*
1594  * Entry point for external callers to zvol_log_write
1595  */
1596 void
1597 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1598     boolean_t sync)
1599 {
1600         zvol_state_t *zv = minor_hdl;
1601 
1602         zvol_log_write(zv, tx, off, resid, sync);
1603 }
1604 /*
1605  * END entry points to allow external callers access to the volume.
1606  */
1607 
1608 /*
1609  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1610  */
1611 static void
1612 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1613     boolean_t sync)
1614 {
1615         itx_t *itx;
1616         lr_truncate_t *lr;
1617         zilog_t *zilog = zv->zv_zilog;
1618 
1619         if (zil_replaying(zilog, tx))
1620                 return;
1621 
1622         itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1623         lr = (lr_truncate_t *)&itx->itx_lr;
1624         lr->lr_foid = ZVOL_OBJ;
1625         lr->lr_offset = off;
1626         lr->lr_length = len;
1627 
1628         itx->itx_sync = sync;
1629         zil_itx_assign(zilog, itx, tx);
1630 }
1631 
1632 /*
1633  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1634  * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1635  */
1636 /*ARGSUSED*/
1637 int
1638 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1639 {
1640         zvol_state_t *zv;
1641         struct dk_callback *dkc;
1642         int error = 0;
1643         locked_range_t *lr;
1644 
1645         mutex_enter(&zfsdev_state_lock);
1646 
1647         zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1648 
1649         if (zv == NULL) {
1650                 mutex_exit(&zfsdev_state_lock);
1651                 return (SET_ERROR(ENXIO));
1652         }
1653         ASSERT(zv->zv_total_opens > 0);
1654 
1655         switch (cmd) {
1656 
1657         case DKIOCINFO:
1658         {
1659                 struct dk_cinfo dki;
1660 
1661                 bzero(&dki, sizeof (dki));
1662                 (void) strcpy(dki.dki_cname, "zvol");
1663                 (void) strcpy(dki.dki_dname, "zvol");
1664                 dki.dki_ctype = DKC_UNKNOWN;
1665                 dki.dki_unit = getminor(dev);
1666                 dki.dki_maxtransfer =
1667                     1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
1668                 mutex_exit(&zfsdev_state_lock);
1669                 if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1670                         error = SET_ERROR(EFAULT);
1671                 return (error);
1672         }
1673 
1674         case DKIOCGMEDIAINFO:
1675         {
1676                 struct dk_minfo dkm;
1677 
1678                 bzero(&dkm, sizeof (dkm));
1679                 dkm.dki_lbsize = 1U << zv->zv_min_bs;
1680                 dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1681                 dkm.dki_media_type = DK_UNKNOWN;
1682                 mutex_exit(&zfsdev_state_lock);
1683                 if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1684                         error = SET_ERROR(EFAULT);
1685                 return (error);
1686         }
1687 
1688         case DKIOCGMEDIAINFOEXT:
1689         {
1690                 struct dk_minfo_ext dkmext;
1691 
1692                 bzero(&dkmext, sizeof (dkmext));
1693                 dkmext.dki_lbsize = 1U << zv->zv_min_bs;
1694                 dkmext.dki_pbsize = zv->zv_volblocksize;
1695                 dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1696                 dkmext.dki_media_type = DK_UNKNOWN;
1697                 mutex_exit(&zfsdev_state_lock);
1698                 if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1699                         error = SET_ERROR(EFAULT);
1700                 return (error);
1701         }
1702 
1703         case DKIOCGETEFI:
1704         {
1705                 uint64_t vs = zv->zv_volsize;
1706                 uint8_t bs = zv->zv_min_bs;
1707 
1708                 mutex_exit(&zfsdev_state_lock);
1709                 error = zvol_getefi((void *)arg, flag, vs, bs);
1710                 return (error);
1711         }
1712 
1713         case DKIOCFLUSHWRITECACHE:
1714                 dkc = (struct dk_callback *)arg;
1715                 mutex_exit(&zfsdev_state_lock);
1716                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1717                 if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1718                         (*dkc->dkc_callback)(dkc->dkc_cookie, error);
1719                         error = 0;
1720                 }
1721                 return (error);
1722 
1723         case DKIOCGETWCE:
1724         {
1725                 int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1726                 if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1727                     flag))
1728                         error = SET_ERROR(EFAULT);
1729                 break;
1730         }
1731         case DKIOCSETWCE:
1732         {
1733                 int wce;
1734                 if (ddi_copyin((void *)arg, &wce, sizeof (int),
1735                     flag)) {
1736                         error = SET_ERROR(EFAULT);
1737                         break;
1738                 }
1739                 if (wce) {
1740                         zv->zv_flags |= ZVOL_WCE;
1741                         mutex_exit(&zfsdev_state_lock);
1742                 } else {
1743                         zv->zv_flags &= ~ZVOL_WCE;
1744                         mutex_exit(&zfsdev_state_lock);
1745                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1746                 }
1747                 return (0);
1748         }
1749 
1750         case DKIOCGGEOM:
1751         case DKIOCGVTOC:
1752                 /*
1753                  * commands using these (like prtvtoc) expect ENOTSUP
1754                  * since we're emulating an EFI label
1755                  */
1756                 error = SET_ERROR(ENOTSUP);
1757                 break;
1758 
1759         case DKIOCDUMPINIT:
1760                 lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
1761                     RL_WRITER);
1762                 error = zvol_dumpify(zv);
1763                 rangelock_exit(lr);
1764                 break;
1765 
1766         case DKIOCDUMPFINI:
1767                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1768                         break;
1769                 lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
1770                     RL_WRITER);
1771                 error = zvol_dump_fini(zv);
1772                 rangelock_exit(lr);
1773                 break;
1774 
1775         case DKIOCFREE:
1776         {
1777                 dkioc_free_list_t *dfl;
1778                 dmu_tx_t *tx;
1779 
1780                 if (!zvol_unmap_enabled)
1781                         break;
1782 
1783                 if (!(flag & FKIOCTL)) {
1784                         error = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
1785                         if (error != 0)
1786                                 break;
1787                 } else {
1788                         dfl = (dkioc_free_list_t *)arg;
1789                         ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS);
1790                         if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) {
1791                                 error = SET_ERROR(EINVAL);
1792                                 break;
1793                         }
1794                 }
1795 
1796                 mutex_exit(&zfsdev_state_lock);
1797 
1798                 for (int i = 0; i < dfl->dfl_num_exts; i++) {
1799                         uint64_t start = dfl->dfl_exts[i].dfle_start,
1800                             length = dfl->dfl_exts[i].dfle_length,
1801                             end = start + length;
1802 
1803                         /*
1804                          * Apply Postel's Law to length-checking.  If they
1805                          * overshoot, just blank out until the end, if there's
1806                          * a need to blank out anything.
1807                          */
1808                         if (start >= zv->zv_volsize)
1809                                 continue;       /* No need to do anything... */
1810                         if (end > zv->zv_volsize) {
1811                                 end = DMU_OBJECT_END;
1812                                 length = end - start;
1813                         }
1814 
1815                         lr = rangelock_enter(&zv->zv_rangelock, start, length,
1816                             RL_WRITER);
1817                         tx = dmu_tx_create(zv->zv_objset);
1818                         error = dmu_tx_assign(tx, TXG_WAIT);
1819                         if (error != 0) {
1820                                 dmu_tx_abort(tx);
1821                         } else {
1822                                 zvol_log_truncate(zv, tx, start, length,
1823                                     B_TRUE);
1824                                 dmu_tx_commit(tx);
1825                                 error = dmu_free_long_range(zv->zv_objset,
1826                                     ZVOL_OBJ, start, length);
1827                         }
1828 
1829                         rangelock_exit(lr);
1830 
1831                         if (error != 0)
1832                                 break;
1833                 }
1834 
1835                 /*
1836                  * If the write-cache is disabled, 'sync' property
1837                  * is set to 'always', or if the caller is asking for
1838                  * a synchronous free, commit this operation to the zil.
1839                  * This will sync any previous uncommitted writes to the
1840                  * zvol object.
1841                  * Can be overridden by the zvol_unmap_sync_enabled tunable.
1842                  */
1843                 if ((error == 0) && zvol_unmap_sync_enabled &&
1844                     (!(zv->zv_flags & ZVOL_WCE) ||
1845                     (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1846                     (dfl->dfl_flags & DF_WAIT_SYNC))) {
1847                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1848                 }
1849 
1850                 if (!(flag & FKIOCTL))
1851                         dfl_free(dfl);
1852 
1853                 return (error);
1854         }
1855 
1856         default:
1857                 error = SET_ERROR(ENOTTY);
1858                 break;
1859 
1860         }
1861         mutex_exit(&zfsdev_state_lock);
1862         return (error);
1863 }
1864 
1865 int
1866 zvol_busy(void)
1867 {
1868         return (zvol_minors != 0);
1869 }
1870 
1871 void
1872 zvol_init(void)
1873 {
1874         VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1875             1) == 0);
1876         mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
1877 }
1878 
1879 void
1880 zvol_fini(void)
1881 {
1882         mutex_destroy(&zfsdev_state_lock);
1883         ddi_soft_state_fini(&zfsdev_state);
1884 }
1885 
1886 /*ARGSUSED*/
1887 static int
1888 zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1889 {
1890         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1891 
1892         if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1893                 return (1);
1894         return (0);
1895 }
1896 
1897 /*ARGSUSED*/
1898 static void
1899 zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1900 {
1901         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1902 
1903         spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
1904 }
1905 
1906 static int
1907 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1908 {
1909         dmu_tx_t *tx;
1910         int error;
1911         objset_t *os = zv->zv_objset;
1912         spa_t *spa = dmu_objset_spa(os);
1913         vdev_t *vd = spa->spa_root_vdev;
1914         nvlist_t *nv = NULL;
1915         uint64_t version = spa_version(spa);
1916         uint64_t checksum, compress, refresrv, vbs, dedup;
1917 
1918         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
1919         ASSERT(vd->vdev_ops == &vdev_root_ops);
1920 
1921         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1922             DMU_OBJECT_END);
1923         if (error != 0)
1924                 return (error);
1925         /* wait for dmu_free_long_range to actually free the blocks */
1926         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1927 
1928         /*
1929          * If the pool on which the dump device is being initialized has more
1930          * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
1931          * enabled.  If so, bump that feature's counter to indicate that the
1932          * feature is active. We also check the vdev type to handle the
1933          * following case:
1934          *   # zpool create test raidz disk1 disk2 disk3
1935          *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
1936          *   the raidz vdev itself has 3 children.
1937          */
1938         if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
1939                 if (!spa_feature_is_enabled(spa,
1940                     SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1941                         return (SET_ERROR(ENOTSUP));
1942                 (void) dsl_sync_task(spa_name(spa),
1943                     zfs_mvdev_dump_feature_check,
1944                     zfs_mvdev_dump_activate_feature_sync, NULL,
1945                     2, ZFS_SPACE_CHECK_RESERVED);
1946         }
1947 
1948         if (!resize) {
1949                 error = dsl_prop_get_integer(zv->zv_name,
1950                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
1951                 if (error == 0) {
1952                         error = dsl_prop_get_integer(zv->zv_name,
1953                             zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
1954                             NULL);
1955                 }
1956                 if (error == 0) {
1957                         error = dsl_prop_get_integer(zv->zv_name,
1958                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
1959                             &refresrv, NULL);
1960                 }
1961                 if (error == 0) {
1962                         error = dsl_prop_get_integer(zv->zv_name,
1963                             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
1964                             NULL);
1965                 }
1966                 if (version >= SPA_VERSION_DEDUP && error == 0) {
1967                         error = dsl_prop_get_integer(zv->zv_name,
1968                             zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
1969                 }
1970         }
1971         if (error != 0)
1972                 return (error);
1973 
1974         tx = dmu_tx_create(os);
1975         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1976         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
1977         error = dmu_tx_assign(tx, TXG_WAIT);
1978         if (error != 0) {
1979                 dmu_tx_abort(tx);
1980                 return (error);
1981         }
1982 
1983         /*
1984          * If we are resizing the dump device then we only need to
1985          * update the refreservation to match the newly updated
1986          * zvolsize. Otherwise, we save off the original state of the
1987          * zvol so that we can restore them if the zvol is ever undumpified.
1988          */
1989         if (resize) {
1990                 error = zap_update(os, ZVOL_ZAP_OBJ,
1991                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1992                     &zv->zv_volsize, tx);
1993         } else {
1994                 error = zap_update(os, ZVOL_ZAP_OBJ,
1995                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
1996                     &compress, tx);
1997                 if (error == 0) {
1998                         error = zap_update(os, ZVOL_ZAP_OBJ,
1999                             zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
2000                             &checksum, tx);
2001                 }
2002                 if (error == 0) {
2003                         error = zap_update(os, ZVOL_ZAP_OBJ,
2004                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2005                             &refresrv, tx);
2006                 }
2007                 if (error == 0) {
2008                         error = zap_update(os, ZVOL_ZAP_OBJ,
2009                             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
2010                             &vbs, tx);
2011                 }
2012                 if (error == 0) {
2013                         error = dmu_object_set_blocksize(
2014                             os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
2015                 }
2016                 if (version >= SPA_VERSION_DEDUP && error == 0) {
2017                         error = zap_update(os, ZVOL_ZAP_OBJ,
2018                             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
2019                             &dedup, tx);
2020                 }
2021                 if (error == 0)
2022                         zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
2023         }
2024         dmu_tx_commit(tx);
2025 
2026         /*
2027          * We only need update the zvol's property if we are initializing
2028          * the dump area for the first time.
2029          */
2030         if (error == 0 && !resize) {
2031                 /*
2032                  * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
2033                  * function.  Otherwise, use the old default -- OFF.
2034                  */
2035                 checksum = spa_feature_is_active(spa,
2036                     SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2037                     ZIO_CHECKSUM_OFF;
2038 
2039                 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2040                 VERIFY(nvlist_add_uint64(nv,
2041                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
2042                 VERIFY(nvlist_add_uint64(nv,
2043                     zfs_prop_to_name(ZFS_PROP_COMPRESSION),
2044                     ZIO_COMPRESS_OFF) == 0);
2045                 VERIFY(nvlist_add_uint64(nv,
2046                     zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2047                     checksum) == 0);
2048                 if (version >= SPA_VERSION_DEDUP) {
2049                         VERIFY(nvlist_add_uint64(nv,
2050                             zfs_prop_to_name(ZFS_PROP_DEDUP),
2051                             ZIO_CHECKSUM_OFF) == 0);
2052                 }
2053 
2054                 error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2055                     nv, NULL);
2056                 nvlist_free(nv);
2057         }
2058 
2059         /* Allocate the space for the dump */
2060         if (error == 0)
2061                 error = zvol_prealloc(zv);
2062         return (error);
2063 }
2064 
2065 static int
2066 zvol_dumpify(zvol_state_t *zv)
2067 {
2068         int error = 0;
2069         uint64_t dumpsize = 0;
2070         dmu_tx_t *tx;
2071         objset_t *os = zv->zv_objset;
2072 
2073         if (zv->zv_flags & ZVOL_RDONLY)
2074                 return (SET_ERROR(EROFS));
2075 
2076         if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2077             8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2078                 boolean_t resize = (dumpsize > 0);
2079 
2080                 if ((error = zvol_dump_init(zv, resize)) != 0) {
2081                         (void) zvol_dump_fini(zv);
2082                         return (error);
2083                 }
2084         }
2085 
2086         /*
2087          * Build up our lba mapping.
2088          */
2089         error = zvol_get_lbas(zv);
2090         if (error) {
2091                 (void) zvol_dump_fini(zv);
2092                 return (error);
2093         }
2094 
2095         tx = dmu_tx_create(os);
2096         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2097         error = dmu_tx_assign(tx, TXG_WAIT);
2098         if (error) {
2099                 dmu_tx_abort(tx);
2100                 (void) zvol_dump_fini(zv);
2101                 return (error);
2102         }
2103 
2104         zv->zv_flags |= ZVOL_DUMPIFIED;
2105         error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2106             &zv->zv_volsize, tx);
2107         dmu_tx_commit(tx);
2108 
2109         if (error) {
2110                 (void) zvol_dump_fini(zv);
2111                 return (error);
2112         }
2113 
2114         txg_wait_synced(dmu_objset_pool(os), 0);
2115         return (0);
2116 }
2117 
2118 static int
2119 zvol_dump_fini(zvol_state_t *zv)
2120 {
2121         dmu_tx_t *tx;
2122         objset_t *os = zv->zv_objset;
2123         nvlist_t *nv;
2124         int error = 0;
2125         uint64_t checksum, compress, refresrv, vbs, dedup;
2126         uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2127 
2128         /*
2129          * Attempt to restore the zvol back to its pre-dumpified state.
2130          * This is a best-effort attempt as it's possible that not all
2131          * of these properties were initialized during the dumpify process
2132          * (i.e. error during zvol_dump_init).
2133          */
2134 
2135         tx = dmu_tx_create(os);
2136         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2137         error = dmu_tx_assign(tx, TXG_WAIT);
2138         if (error) {
2139                 dmu_tx_abort(tx);
2140                 return (error);
2141         }
2142         (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2143         dmu_tx_commit(tx);
2144 
2145         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2146             zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2147         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2148             zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2149         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2150             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2151         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2152             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2153 
2154         VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2155         (void) nvlist_add_uint64(nv,
2156             zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2157         (void) nvlist_add_uint64(nv,
2158             zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2159         (void) nvlist_add_uint64(nv,
2160             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2161         if (version >= SPA_VERSION_DEDUP &&
2162             zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2163             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2164                 (void) nvlist_add_uint64(nv,
2165                     zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2166         }
2167         (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2168             nv, NULL);
2169         nvlist_free(nv);
2170 
2171         zvol_free_extents(zv);
2172         zv->zv_flags &= ~ZVOL_DUMPIFIED;
2173         (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2174         /* wait for dmu_free_long_range to actually free the blocks */
2175         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2176         tx = dmu_tx_create(os);
2177         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2178         error = dmu_tx_assign(tx, TXG_WAIT);
2179         if (error) {
2180                 dmu_tx_abort(tx);
2181                 return (error);
2182         }
2183         if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2184                 zv->zv_volblocksize = vbs;
2185         dmu_tx_commit(tx);
2186 
2187         return (0);
2188 }