1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  28  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  29  * Copyright (c) 2014 Integros [integros.com]
  30  */
  31 
  32 /*
  33  * ZFS volume emulation driver.
  34  *
  35  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  36  * Volumes are accessed through the symbolic links named:
  37  *
  38  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  39  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  40  *
  41  * These links are created by the /dev filesystem (sdev_zvolops.c).
  42  * Volumes are persistent through reboot.  No user command needs to be
  43  * run before opening and using a device.
  44  */
  45 
  46 #include <sys/types.h>
  47 #include <sys/param.h>
  48 #include <sys/errno.h>
  49 #include <sys/uio.h>
  50 #include <sys/buf.h>
  51 #include <sys/modctl.h>
  52 #include <sys/open.h>
  53 #include <sys/kmem.h>
  54 #include <sys/conf.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/stat.h>
  57 #include <sys/zap.h>
  58 #include <sys/spa.h>
  59 #include <sys/spa_impl.h>
  60 #include <sys/zio.h>
  61 #include <sys/dmu_traverse.h>
  62 #include <sys/dnode.h>
  63 #include <sys/dsl_dataset.h>
  64 #include <sys/dsl_prop.h>
  65 #include <sys/dkio.h>
  66 #include <sys/efi_partition.h>
  67 #include <sys/byteorder.h>
  68 #include <sys/pathname.h>
  69 #include <sys/ddi.h>
  70 #include <sys/sunddi.h>
  71 #include <sys/crc32.h>
  72 #include <sys/dirent.h>
  73 #include <sys/policy.h>
  74 #include <sys/fs/zfs.h>
  75 #include <sys/zfs_ioctl.h>
  76 #include <sys/mkdev.h>
  77 #include <sys/zil.h>
  78 #include <sys/refcount.h>
  79 #include <sys/zfs_znode.h>
  80 #include <sys/zfs_rlock.h>
  81 #include <sys/vdev_disk.h>
  82 #include <sys/vdev_impl.h>
  83 #include <sys/vdev_raidz.h>
  84 #include <sys/zvol.h>
  85 #include <sys/dumphdr.h>
  86 #include <sys/zil_impl.h>
  87 #include <sys/dbuf.h>
  88 #include <sys/dmu_tx.h>
  89 #include <sys/zfeature.h>
  90 #include <sys/zio_checksum.h>
  91 
  92 #include "zfs_namecheck.h"
  93 
/*
 * Soft-state anchor for the driver; zvol_create_minor() allocates one
 * entry per minor from it with ddi_soft_state_zalloc().
 */
void *zfsdev_state;
/* Tag used for the objset own and bonus hold taken in zvol_first_open(). */
static char *zvol_tag = "zvol_tag";

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * name under which the dump size is stored -- confirm against the dump code.
 */
#define ZVOL_DUMPSIZE           "dumpsize"

/*
 * This lock protects the zfsdev_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
kmutex_t zfsdev_state_lock;
/*
 * Number of zvol minors that currently exist: incremented by
 * zvol_create_minor() and decremented by zvol_remove_zv().
 */
static uint32_t zvol_minors;
 107 
/*
 * One element of a volume's zv_extents list.  zvol_map_block() creates an
 * extent for a level-0 block and grows it (ze_nblks++) while subsequent
 * blocks land on the same vdev directly after the previous one.
 */
typedef struct zvol_extent {
        list_node_t     ze_node;        /* linkage on zv_extents */
        dva_t           ze_dva;         /* dva associated with this extent */
        uint64_t        ze_nblks;       /* number of blocks in extent */
} zvol_extent_t;
 113 
/*
 * The in-core state of each volume.
 *
 * One of these is allocated per minor by zvol_create_minor() and freed by
 * zvol_remove_zv().  zv_objset, zv_zilog and zv_dbuf are populated by
 * zvol_first_open() and cleared again by zvol_last_close().
 */
typedef struct zvol_state {
        char            zv_name[MAXPATHLEN]; /* pool/dd name */
        uint64_t        zv_volsize;     /* amount of space we advertise */
        uint64_t        zv_volblocksize; /* volume block size */
        minor_t         zv_minor;       /* minor number */
        uint8_t         zv_min_bs;      /* minimum addressable block shift */
        uint8_t         zv_flags;       /* readonly, dumpified, etc. */
        objset_t        *zv_objset;     /* objset handle */
        uint32_t        zv_open_count[OTYPCNT]; /* open counts */
        uint32_t        zv_total_opens; /* total open count */
        zilog_t         *zv_zilog;      /* ZIL handle */
        list_t          zv_extents;     /* List of extents for dump */
        znode_t         zv_znode;       /* for range locking */
        dmu_buf_t       *zv_dbuf;       /* bonus handle */
} zvol_state_t;
 132 
/*
 * zvol specific flags (bits stored in zv_flags)
 *
 * ZVOL_RDONLY    - set for snapshots, non-writeable pools and datasets with
 *                  the "readonly" property; zvol_open() refuses FWRITE.
 * ZVOL_DUMPIFIED - tested by zvol_update_live_volsize(), which re-runs
 *                  zvol_dumpify() after a resize when it is set.
 * ZVOL_EXCL      - set by an FEXCL open; other opens get EBUSY until close.
 * ZVOL_WCE       - NOTE(review): not used in this part of the file;
 *                  presumably "write cache enabled" -- confirm.
 */
#define ZVOL_RDONLY     0x1
#define ZVOL_DUMPIFIED  0x2
#define ZVOL_EXCL       0x4
#define ZVOL_WCE        0x8
 140 
/*
 * zvol maximum transfer in one DMU tx.
 * Defaults to half of DMU_MAX_ACCESS.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

/*
 * Toggle unmap functionality.
 * NOTE(review): the code that consults this switch is not in this part of
 * the file.
 */
boolean_t zvol_unmap_enabled = B_TRUE;
 150 
/*
 * Forward declarations: one routine that lives outside this file plus the
 * static helpers that are referenced before their definitions.
 */
extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 158 
 159 static void
 160 zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 161 {
 162         dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
 163 
 164         zv->zv_volsize = volsize;
 165         VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 166             "Size", volsize) == DDI_SUCCESS);
 167         VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 168             "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
 169 
 170         /* Notify specfs to invalidate the cached size */
 171         spec_size_invalidate(dev, VBLK);
 172         spec_size_invalidate(dev, VCHR);
 173 }
 174 
 175 int
 176 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 177 {
 178         if (volsize == 0)
 179                 return (SET_ERROR(EINVAL));
 180 
 181         if (volsize % blocksize != 0)
 182                 return (SET_ERROR(EINVAL));
 183 
 184 #ifdef _ILP32
 185         if (volsize - 1 > SPEC_MAXOFFSET_T)
 186                 return (SET_ERROR(EOVERFLOW));
 187 #endif
 188         return (0);
 189 }
 190 
 191 int
 192 zvol_check_volblocksize(uint64_t volblocksize)
 193 {
 194         if (volblocksize < SPA_MINBLOCKSIZE ||
 195             volblocksize > SPA_OLD_MAXBLOCKSIZE ||
 196             !ISP2(volblocksize))
 197                 return (SET_ERROR(EDOM));
 198 
 199         return (0);
 200 }
 201 
 202 int
 203 zvol_get_stats(objset_t *os, nvlist_t *nv)
 204 {
 205         int error;
 206         dmu_object_info_t doi;
 207         uint64_t val;
 208 
 209         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
 210         if (error)
 211                 return (error);
 212 
 213         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
 214 
 215         error = dmu_object_info(os, ZVOL_OBJ, &doi);
 216 
 217         if (error == 0) {
 218                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 219                     doi.doi_data_block_size);
 220         }
 221 
 222         return (error);
 223 }
 224 
 225 static zvol_state_t *
 226 zvol_minor_lookup(const char *name)
 227 {
 228         minor_t minor;
 229         zvol_state_t *zv;
 230 
 231         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 232 
 233         for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 234                 zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 235                 if (zv == NULL)
 236                         continue;
 237                 if (strcmp(zv->zv_name, name) == 0)
 238                         return (zv);
 239         }
 240 
 241         return (NULL);
 242 }
 243 
/*
 * extent mapping arg
 *
 * Passed by zvol_get_lbas() to zvol_map_block() through traverse_dataset().
 */
struct maparg {
        zvol_state_t    *ma_zv;         /* volume whose extents are built */
        uint64_t        ma_blks;        /* level-0 blocks mapped so far */
};
 249 
 250 /*ARGSUSED*/
 251 static int
 252 zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 253     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 254 {
 255         struct maparg *ma = arg;
 256         zvol_extent_t *ze;
 257         int bs = ma->ma_zv->zv_volblocksize;
 258 
 259         if (bp == NULL || BP_IS_HOLE(bp) ||
 260             zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 261                 return (0);
 262 
 263         VERIFY(!BP_IS_EMBEDDED(bp));
 264 
 265         VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
 266         ma->ma_blks++;
 267 
 268         /* Abort immediately if we have encountered gang blocks */
 269         if (BP_IS_GANG(bp))
 270                 return (SET_ERROR(EFRAGS));
 271 
 272         /*
 273          * See if the block is at the end of the previous extent.
 274          */
 275         ze = list_tail(&ma->ma_zv->zv_extents);
 276         if (ze &&
 277             DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
 278             DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
 279             DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
 280                 ze->ze_nblks++;
 281                 return (0);
 282         }
 283 
 284         dprintf_bp(bp, "%s", "next blkptr:");
 285 
 286         /* start a new extent */
 287         ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
 288         ze->ze_dva = bp->blk_dva[0];      /* structure assignment */
 289         ze->ze_nblks = 1;
 290         list_insert_tail(&ma->ma_zv->zv_extents, ze);
 291         return (0);
 292 }
 293 
 294 static void
 295 zvol_free_extents(zvol_state_t *zv)
 296 {
 297         zvol_extent_t *ze;
 298 
 299         while (ze = list_head(&zv->zv_extents)) {
 300                 list_remove(&zv->zv_extents, ze);
 301                 kmem_free(ze, sizeof (zvol_extent_t));
 302         }
 303 }
 304 
 305 static int
 306 zvol_get_lbas(zvol_state_t *zv)
 307 {
 308         objset_t *os = zv->zv_objset;
 309         struct maparg   ma;
 310         int             err;
 311 
 312         ma.ma_zv = zv;
 313         ma.ma_blks = 0;
 314         zvol_free_extents(zv);
 315 
 316         /* commit any in-flight changes before traversing the dataset */
 317         txg_wait_synced(dmu_objset_pool(os), 0);
 318         err = traverse_dataset(dmu_objset_ds(os), 0,
 319             TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
 320         if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
 321                 zvol_free_extents(zv);
 322                 return (err ? err : EIO);
 323         }
 324 
 325         return (0);
 326 }
 327 
/*
 * Creation callback for a new volume's objset.
 *
 * 'arg' is a zfs_creat_t whose zct_props must contain the volsize property
 * (its absence is fatal); volblocksize falls back to the property default.
 * Both properties are then removed from the list, the data object
 * (ZVOL_OBJ) and the property ZAP (ZVOL_ZAP_OBJ) are claimed in 'tx', and
 * the size is stored in the ZAP under "size".  The function returns void,
 * so failures of those three steps are only caught by ASSERT.
 */
/* ARGSUSED */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
        zfs_creat_t *zct = arg;
        nvlist_t *nvprops = zct->zct_props;
        int error;
        uint64_t volblocksize, volsize;

        VERIFY(nvlist_lookup_uint64(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
        if (nvlist_lookup_uint64(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
                volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

        /*
         * These properties must be removed from the list so the generic
         * property setting step won't apply to them.
         */
        VERIFY(nvlist_remove_all(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
        /* volblocksize may not have been supplied, so a miss is fine */
        (void) nvlist_remove_all(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

        /* the data object, using the chosen block size */
        error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        /* the ZAP object that holds the volume's "size" entry */
        error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
        ASSERT(error == 0);
}
 363 
 364 /*
 365  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 366  * implement DKIOCFREE/free-long-range.
 367  */
 368 static int
 369 zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
 370 {
 371         uint64_t offset, length;
 372 
 373         if (byteswap)
 374                 byteswap_uint64_array(lr, sizeof (*lr));
 375 
 376         offset = lr->lr_offset;
 377         length = lr->lr_length;
 378 
 379         return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
 380 }
 381 
 382 /*
 383  * Replay a TX_WRITE ZIL transaction that didn't get committed
 384  * after a system failure
 385  */
 386 static int
 387 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 388 {
 389         objset_t *os = zv->zv_objset;
 390         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 391         uint64_t offset, length;
 392         dmu_tx_t *tx;
 393         int error;
 394 
 395         if (byteswap)
 396                 byteswap_uint64_array(lr, sizeof (*lr));
 397 
 398         offset = lr->lr_offset;
 399         length = lr->lr_length;
 400 
 401         /* If it's a dmu_sync() block, write the whole block */
 402         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 403                 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 404                 if (length < blocksize) {
 405                         offset -= offset % blocksize;
 406                         length = blocksize;
 407                 }
 408         }
 409 
 410         tx = dmu_tx_create(os);
 411         dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 412         error = dmu_tx_assign(tx, TXG_WAIT);
 413         if (error) {
 414                 dmu_tx_abort(tx);
 415         } else {
 416                 dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 417                 dmu_tx_commit(tx);
 418         }
 419 
 420         return (error);
 421 }
 422 
/*
 * Replay handler for every record type a zvol does not support: ignores
 * its arguments and always fails with ENOTSUP.
 */
/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
        return (SET_ERROR(ENOTSUP));
}
 429 
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 *
 * The table is indexed by transaction type and handed to zil_replay() by
 * zvol_create_minor(); every other slot points at zvol_replay_err().
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
        zvol_replay_err,        /* 0 no such transaction type */
        zvol_replay_err,        /* TX_CREATE */
        zvol_replay_err,        /* TX_MKDIR */
        zvol_replay_err,        /* TX_MKXATTR */
        zvol_replay_err,        /* TX_SYMLINK */
        zvol_replay_err,        /* TX_REMOVE */
        zvol_replay_err,        /* TX_RMDIR */
        zvol_replay_err,        /* TX_LINK */
        zvol_replay_err,        /* TX_RENAME */
        zvol_replay_write,      /* TX_WRITE */
        zvol_replay_truncate,   /* TX_TRUNCATE */
        zvol_replay_err,        /* TX_SETATTR */
        zvol_replay_err,        /* TX_ACL */
        zvol_replay_err,        /* TX_CREATE_ACL */
        zvol_replay_err,        /* TX_CREATE_ATTR */
        zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL */
        zvol_replay_err,        /* TX_MKDIR_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
        zvol_replay_err,        /* TX_WRITE2 */
};
 456 
 457 int
 458 zvol_name2minor(const char *name, minor_t *minor)
 459 {
 460         zvol_state_t *zv;
 461 
 462         mutex_enter(&zfsdev_state_lock);
 463         zv = zvol_minor_lookup(name);
 464         if (minor && zv)
 465                 *minor = zv->zv_minor;
 466         mutex_exit(&zfsdev_state_lock);
 467         return (zv ? 0 : -1);
 468 }
 469 
 470 /*
 471  * Create a minor node (plus a whole lot more) for the specified volume.
 472  */
 473 int
 474 zvol_create_minor(const char *name)
 475 {
 476         zfs_soft_state_t *zs;
 477         zvol_state_t *zv;
 478         objset_t *os;
 479         dmu_object_info_t doi;
 480         minor_t minor = 0;
 481         char chrbuf[30], blkbuf[30];
 482         int error;
 483 
 484         mutex_enter(&zfsdev_state_lock);
 485 
 486         if (zvol_minor_lookup(name) != NULL) {
 487                 mutex_exit(&zfsdev_state_lock);
 488                 return (SET_ERROR(EEXIST));
 489         }
 490 
 491         /* lie and say we're read-only */
 492         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 493 
 494         if (error) {
 495                 mutex_exit(&zfsdev_state_lock);
 496                 return (error);
 497         }
 498 
 499         if ((minor = zfsdev_minor_alloc()) == 0) {
 500                 dmu_objset_disown(os, FTAG);
 501                 mutex_exit(&zfsdev_state_lock);
 502                 return (SET_ERROR(ENXIO));
 503         }
 504 
 505         if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
 506                 dmu_objset_disown(os, FTAG);
 507                 mutex_exit(&zfsdev_state_lock);
 508                 return (SET_ERROR(EAGAIN));
 509         }
 510         (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
 511             (char *)name);
 512 
 513         (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
 514 
 515         if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
 516             minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 517                 ddi_soft_state_free(zfsdev_state, minor);
 518                 dmu_objset_disown(os, FTAG);
 519                 mutex_exit(&zfsdev_state_lock);
 520                 return (SET_ERROR(EAGAIN));
 521         }
 522 
 523         (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
 524 
 525         if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
 526             minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 527                 ddi_remove_minor_node(zfs_dip, chrbuf);
 528                 ddi_soft_state_free(zfsdev_state, minor);
 529                 dmu_objset_disown(os, FTAG);
 530                 mutex_exit(&zfsdev_state_lock);
 531                 return (SET_ERROR(EAGAIN));
 532         }
 533 
 534         zs = ddi_get_soft_state(zfsdev_state, minor);
 535         zs->zss_type = ZSST_ZVOL;
 536         zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 537         (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 538         zv->zv_min_bs = DEV_BSHIFT;
 539         zv->zv_minor = minor;
 540         zv->zv_objset = os;
 541         if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 542                 zv->zv_flags |= ZVOL_RDONLY;
 543         mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 544         avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 545             sizeof (rl_t), offsetof(rl_t, r_node));
 546         list_create(&zv->zv_extents, sizeof (zvol_extent_t),
 547             offsetof(zvol_extent_t, ze_node));
 548         /* get and cache the blocksize */
 549         error = dmu_object_info(os, ZVOL_OBJ, &doi);
 550         ASSERT(error == 0);
 551         zv->zv_volblocksize = doi.doi_data_block_size;
 552 
 553         if (spa_writeable(dmu_objset_spa(os))) {
 554                 if (zil_replay_disable)
 555                         zil_destroy(dmu_objset_zil(os), B_FALSE);
 556                 else
 557                         zil_replay(os, zv, zvol_replay_vector);
 558         }
 559         dmu_objset_disown(os, FTAG);
 560         zv->zv_objset = NULL;
 561 
 562         zvol_minors++;
 563 
 564         mutex_exit(&zfsdev_state_lock);
 565 
 566         return (0);
 567 }
 568 
 569 /*
 570  * Remove minor node for the specified volume.
 571  */
 572 static int
 573 zvol_remove_zv(zvol_state_t *zv)
 574 {
 575         char nmbuf[20];
 576         minor_t minor = zv->zv_minor;
 577 
 578         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 579         if (zv->zv_total_opens != 0)
 580                 return (SET_ERROR(EBUSY));
 581 
 582         (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
 583         ddi_remove_minor_node(zfs_dip, nmbuf);
 584 
 585         (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
 586         ddi_remove_minor_node(zfs_dip, nmbuf);
 587 
 588         avl_destroy(&zv->zv_znode.z_range_avl);
 589         mutex_destroy(&zv->zv_znode.z_range_lock);
 590 
 591         kmem_free(zv, sizeof (zvol_state_t));
 592 
 593         ddi_soft_state_free(zfsdev_state, minor);
 594 
 595         zvol_minors--;
 596         return (0);
 597 }
 598 
 599 int
 600 zvol_remove_minor(const char *name)
 601 {
 602         zvol_state_t *zv;
 603         int rc;
 604 
 605         mutex_enter(&zfsdev_state_lock);
 606         if ((zv = zvol_minor_lookup(name)) == NULL) {
 607                 mutex_exit(&zfsdev_state_lock);
 608                 return (SET_ERROR(ENXIO));
 609         }
 610         rc = zvol_remove_zv(zv);
 611         mutex_exit(&zfsdev_state_lock);
 612         return (rc);
 613 }
 614 
 615 int
 616 zvol_first_open(zvol_state_t *zv)
 617 {
 618         objset_t *os;
 619         uint64_t volsize;
 620         int error;
 621         uint64_t readonly;
 622 
 623         /* lie and say we're read-only */
 624         error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
 625             zvol_tag, &os);
 626         if (error)
 627                 return (error);
 628 
 629         zv->zv_objset = os;
 630         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 631         if (error) {
 632                 ASSERT(error == 0);
 633                 dmu_objset_disown(os, zvol_tag);
 634                 return (error);
 635         }
 636 
 637         error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 638         if (error) {
 639                 dmu_objset_disown(os, zvol_tag);
 640                 return (error);
 641         }
 642 
 643         zvol_size_changed(zv, volsize);
 644         zv->zv_zilog = zil_open(os, zvol_get_data);
 645 
 646         VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
 647             NULL) == 0);
 648         if (readonly || dmu_objset_is_snapshot(os) ||
 649             !spa_writeable(dmu_objset_spa(os)))
 650                 zv->zv_flags |= ZVOL_RDONLY;
 651         else
 652                 zv->zv_flags &= ~ZVOL_RDONLY;
 653         return (error);
 654 }
 655 
 656 void
 657 zvol_last_close(zvol_state_t *zv)
 658 {
 659         zil_close(zv->zv_zilog);
 660         zv->zv_zilog = NULL;
 661 
 662         dmu_buf_rele(zv->zv_dbuf, zvol_tag);
 663         zv->zv_dbuf = NULL;
 664 
 665         /*
 666          * Evict cached data
 667          */
 668         if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
 669             !(zv->zv_flags & ZVOL_RDONLY))
 670                 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 671         dmu_objset_evict_dbufs(zv->zv_objset);
 672 
 673         dmu_objset_disown(zv->zv_objset, zvol_tag);
 674         zv->zv_objset = NULL;
 675 }
 676 
 677 int
 678 zvol_prealloc(zvol_state_t *zv)
 679 {
 680         objset_t *os = zv->zv_objset;
 681         dmu_tx_t *tx;
 682         uint64_t refd, avail, usedobjs, availobjs;
 683         uint64_t resid = zv->zv_volsize;
 684         uint64_t off = 0;
 685 
 686         /* Check the space usage before attempting to allocate the space */
 687         dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
 688         if (avail < zv->zv_volsize)
 689                 return (SET_ERROR(ENOSPC));
 690 
 691         /* Free old extents if they exist */
 692         zvol_free_extents(zv);
 693 
 694         while (resid != 0) {
 695                 int error;
 696                 uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
 697 
 698                 tx = dmu_tx_create(os);
 699                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 700                 error = dmu_tx_assign(tx, TXG_WAIT);
 701                 if (error) {
 702                         dmu_tx_abort(tx);
 703                         (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
 704                         return (error);
 705                 }
 706                 dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
 707                 dmu_tx_commit(tx);
 708                 off += bytes;
 709                 resid -= bytes;
 710         }
 711         txg_wait_synced(dmu_objset_pool(os), 0);
 712 
 713         return (0);
 714 }
 715 
 716 static int
 717 zvol_update_volsize(objset_t *os, uint64_t volsize)
 718 {
 719         dmu_tx_t *tx;
 720         int error;
 721 
 722         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 723 
 724         tx = dmu_tx_create(os);
 725         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 726         dmu_tx_mark_netfree(tx);
 727         error = dmu_tx_assign(tx, TXG_WAIT);
 728         if (error) {
 729                 dmu_tx_abort(tx);
 730                 return (error);
 731         }
 732 
 733         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 734             &volsize, tx);
 735         dmu_tx_commit(tx);
 736 
 737         if (error == 0)
 738                 error = dmu_free_long_range(os,
 739                     ZVOL_OBJ, volsize, DMU_OBJECT_END);
 740         return (error);
 741 }
 742 
 743 void
 744 zvol_remove_minors(const char *name)
 745 {
 746         zvol_state_t *zv;
 747         char *namebuf;
 748         minor_t minor;
 749 
 750         namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
 751         (void) strncpy(namebuf, name, strlen(name));
 752         (void) strcat(namebuf, "/");
 753         mutex_enter(&zfsdev_state_lock);
 754         for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 755 
 756                 zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 757                 if (zv == NULL)
 758                         continue;
 759                 if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
 760                         (void) zvol_remove_zv(zv);
 761         }
 762         kmem_free(namebuf, strlen(name) + 2);
 763 
 764         mutex_exit(&zfsdev_state_lock);
 765 }
 766 
 767 static int
 768 zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
 769 {
 770         uint64_t old_volsize = 0ULL;
 771         int error = 0;
 772 
 773         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 774 
 775         /*
 776          * Reinitialize the dump area to the new size. If we
 777          * failed to resize the dump area then restore it back to
 778          * its original size.  We must set the new volsize prior
 779          * to calling dumpvp_resize() to ensure that the devices'
 780          * size(9P) is not visible by the dump subsystem.
 781          */
 782         old_volsize = zv->zv_volsize;
 783         zvol_size_changed(zv, volsize);
 784 
 785         if (zv->zv_flags & ZVOL_DUMPIFIED) {
 786                 if ((error = zvol_dumpify(zv)) != 0 ||
 787                     (error = dumpvp_resize()) != 0) {
 788                         int dumpify_error;
 789 
 790                         (void) zvol_update_volsize(zv->zv_objset, old_volsize);
 791                         zvol_size_changed(zv, old_volsize);
 792                         dumpify_error = zvol_dumpify(zv);
 793                         error = dumpify_error ? dumpify_error : error;
 794                 }
 795         }
 796 
 797         /*
 798          * Generate a LUN expansion event.
 799          */
 800         if (error == 0) {
 801                 sysevent_id_t eid;
 802                 nvlist_t *attr;
 803                 char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 804 
 805                 (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
 806                     zv->zv_minor);
 807 
 808                 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 809                 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
 810 
 811                 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 812                     ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
 813 
 814                 nvlist_free(attr);
 815                 kmem_free(physpath, MAXPATHLEN);
 816         }
 817         return (error);
 818 }
 819 
 820 int
 821 zvol_set_volsize(const char *name, uint64_t volsize)
 822 {
 823         zvol_state_t *zv = NULL;
 824         objset_t *os;
 825         int error;
 826         dmu_object_info_t doi;
 827         uint64_t readonly;
 828         boolean_t owned = B_FALSE;
 829 
 830         error = dsl_prop_get_integer(name,
 831             zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
 832         if (error != 0)
 833                 return (error);
 834         if (readonly)
 835                 return (SET_ERROR(EROFS));
 836 
 837         mutex_enter(&zfsdev_state_lock);
 838         zv = zvol_minor_lookup(name);
 839 
 840         if (zv == NULL || zv->zv_objset == NULL) {
 841                 if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
 842                     FTAG, &os)) != 0) {
 843                         mutex_exit(&zfsdev_state_lock);
 844                         return (error);
 845                 }
 846                 owned = B_TRUE;
 847                 if (zv != NULL)
 848                         zv->zv_objset = os;
 849         } else {
 850                 os = zv->zv_objset;
 851         }
 852 
 853         if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
 854             (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
 855                 goto out;
 856 
 857         error = zvol_update_volsize(os, volsize);
 858 
 859         if (error == 0 && zv != NULL)
 860                 error = zvol_update_live_volsize(zv, volsize);
 861 out:
 862         if (owned) {
 863                 dmu_objset_disown(os, FTAG);
 864                 if (zv != NULL)
 865                         zv->zv_objset = NULL;
 866         }
 867         mutex_exit(&zfsdev_state_lock);
 868         return (error);
 869 }
 870 
 871 /*ARGSUSED*/
 872 int
 873 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
 874 {
 875         zvol_state_t *zv;
 876         int err = 0;
 877 
 878         mutex_enter(&zfsdev_state_lock);
 879 
 880         zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
 881         if (zv == NULL) {
 882                 mutex_exit(&zfsdev_state_lock);
 883                 return (SET_ERROR(ENXIO));
 884         }
 885 
 886         if (zv->zv_total_opens == 0)
 887                 err = zvol_first_open(zv);
 888         if (err) {
 889                 mutex_exit(&zfsdev_state_lock);
 890                 return (err);
 891         }
 892         if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 893                 err = SET_ERROR(EROFS);
 894                 goto out;
 895         }
 896         if (zv->zv_flags & ZVOL_EXCL) {
 897                 err = SET_ERROR(EBUSY);
 898                 goto out;
 899         }
 900         if (flag & FEXCL) {
 901                 if (zv->zv_total_opens != 0) {
 902                         err = SET_ERROR(EBUSY);
 903                         goto out;
 904                 }
 905                 zv->zv_flags |= ZVOL_EXCL;
 906         }
 907 
 908         if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
 909                 zv->zv_open_count[otyp]++;
 910                 zv->zv_total_opens++;
 911         }
 912         mutex_exit(&zfsdev_state_lock);
 913 
 914         return (err);
 915 out:
 916         if (zv->zv_total_opens == 0)
 917                 zvol_last_close(zv);
 918         mutex_exit(&zfsdev_state_lock);
 919         return (err);
 920 }
 921 
 922 /*ARGSUSED*/
 923 int
 924 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
 925 {
 926         minor_t minor = getminor(dev);
 927         zvol_state_t *zv;
 928         int error = 0;
 929 
 930         mutex_enter(&zfsdev_state_lock);
 931 
 932         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 933         if (zv == NULL) {
 934                 mutex_exit(&zfsdev_state_lock);
 935                 return (SET_ERROR(ENXIO));
 936         }
 937 
 938         if (zv->zv_flags & ZVOL_EXCL) {
 939                 ASSERT(zv->zv_total_opens == 1);
 940                 zv->zv_flags &= ~ZVOL_EXCL;
 941         }
 942 
 943         /*
 944          * If the open count is zero, this is a spurious close.
 945          * That indicates a bug in the kernel / DDI framework.
 946          */
 947         ASSERT(zv->zv_open_count[otyp] != 0);
 948         ASSERT(zv->zv_total_opens != 0);
 949 
 950         /*
 951          * You may get multiple opens, but only one close.
 952          */
 953         zv->zv_open_count[otyp]--;
 954         zv->zv_total_opens--;
 955 
 956         if (zv->zv_total_opens == 0)
 957                 zvol_last_close(zv);
 958 
 959         mutex_exit(&zfsdev_state_lock);
 960         return (error);
 961 }
 962 
 963 static void
 964 zvol_get_done(zgd_t *zgd, int error)
 965 {
 966         if (zgd->zgd_db)
 967                 dmu_buf_rele(zgd->zgd_db, zgd);
 968 
 969         zfs_range_unlock(zgd->zgd_rl);
 970 
 971         if (error == 0 && zgd->zgd_bp)
 972                 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 973 
 974         kmem_free(zgd, sizeof (zgd_t));
 975 }
 976 
 977 /*
 978  * Get data to generate a TX_WRITE intent log record.
 979  */
 980 static int
 981 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 982 {
 983         zvol_state_t *zv = arg;
 984         objset_t *os = zv->zv_objset;
 985         uint64_t object = ZVOL_OBJ;
 986         uint64_t offset = lr->lr_offset;
 987         uint64_t size = lr->lr_length;       /* length of user data */
 988         blkptr_t *bp = &lr->lr_blkptr;
 989         dmu_buf_t *db;
 990         zgd_t *zgd;
 991         int error;
 992 
 993         ASSERT(zio != NULL);
 994         ASSERT(size != 0);
 995 
 996         zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 997         zgd->zgd_zilog = zv->zv_zilog;
 998         zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 999 
1000         /*
1001          * Write records come in two flavors: immediate and indirect.
1002          * For small writes it's cheaper to store the data with the
1003          * log record (immediate); for large writes it's cheaper to
1004          * sync the data and get a pointer to it (indirect) so that
1005          * we don't have to write the data twice.
1006          */
1007         if (buf != NULL) {      /* immediate write */
1008                 error = dmu_read(os, object, offset, size, buf,
1009                     DMU_READ_NO_PREFETCH);
1010         } else {
1011                 size = zv->zv_volblocksize;
1012                 offset = P2ALIGN(offset, size);
1013                 error = dmu_buf_hold(os, object, offset, zgd, &db,
1014                     DMU_READ_NO_PREFETCH);
1015                 if (error == 0) {
1016                         blkptr_t *obp = dmu_buf_get_blkptr(db);
1017                         if (obp) {
1018                                 ASSERT(BP_IS_HOLE(bp));
1019                                 *bp = *obp;
1020                         }
1021 
1022                         zgd->zgd_db = db;
1023                         zgd->zgd_bp = bp;
1024 
1025                         ASSERT(db->db_offset == offset);
1026                         ASSERT(db->db_size == size);
1027 
1028                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
1029                             zvol_get_done, zgd);
1030 
1031                         if (error == 0)
1032                                 return (0);
1033                 }
1034         }
1035 
1036         zvol_get_done(zgd, error);
1037 
1038         return (error);
1039 }
1040 
1041 /*
1042  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1043  *
1044  * We store data in the log buffers if it's small enough.
1045  * Otherwise we will later flush the data out via dmu_sync().
1046  */
1047 ssize_t zvol_immediate_write_sz = 32768;
1048 
1049 static void
1050 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1051     boolean_t sync)
1052 {
1053         uint32_t blocksize = zv->zv_volblocksize;
1054         zilog_t *zilog = zv->zv_zilog;
1055         boolean_t slogging;
1056         ssize_t immediate_write_sz;
1057 
1058         if (zil_replaying(zilog, tx))
1059                 return;
1060 
1061         immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1062             ? 0 : zvol_immediate_write_sz;
1063 
1064         slogging = spa_has_slogs(zilog->zl_spa) &&
1065             (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
1066 
1067         while (resid) {
1068                 itx_t *itx;
1069                 lr_write_t *lr;
1070                 ssize_t len;
1071                 itx_wr_state_t write_state;
1072 
1073                 /*
1074                  * Unlike zfs_log_write() we can be called with
1075                  * upto DMU_MAX_ACCESS/2 (5MB) writes.
1076                  */
1077                 if (blocksize > immediate_write_sz && !slogging &&
1078                     resid >= blocksize && off % blocksize == 0) {
1079                         write_state = WR_INDIRECT; /* uses dmu_sync */
1080                         len = blocksize;
1081                 } else if (sync) {
1082                         write_state = WR_COPIED;
1083                         len = MIN(ZIL_MAX_LOG_DATA, resid);
1084                 } else {
1085                         write_state = WR_NEED_COPY;
1086                         len = MIN(ZIL_MAX_LOG_DATA, resid);
1087                 }
1088 
1089                 itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1090                     (write_state == WR_COPIED ? len : 0));
1091                 lr = (lr_write_t *)&itx->itx_lr;
1092                 if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
1093                     ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1094                         zil_itx_destroy(itx);
1095                         itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1096                         lr = (lr_write_t *)&itx->itx_lr;
1097                         write_state = WR_NEED_COPY;
1098                 }
1099 
1100                 itx->itx_wr_state = write_state;
1101                 if (write_state == WR_NEED_COPY)
1102                         itx->itx_sod += len;
1103                 lr->lr_foid = ZVOL_OBJ;
1104                 lr->lr_offset = off;
1105                 lr->lr_length = len;
1106                 lr->lr_blkoff = 0;
1107                 BP_ZERO(&lr->lr_blkptr);
1108 
1109                 itx->itx_private = zv;
1110                 itx->itx_sync = sync;
1111 
1112                 zil_itx_assign(zilog, itx, tx);
1113 
1114                 off += len;
1115                 resid -= len;
1116         }
1117 }
1118 
1119 static int
1120 zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1121     uint64_t size, boolean_t doread, boolean_t isdump)
1122 {
1123         vdev_disk_t *dvd;
1124         int c;
1125         int numerrors = 0;
1126 
1127         if (vd->vdev_ops == &vdev_mirror_ops ||
1128             vd->vdev_ops == &vdev_replacing_ops ||
1129             vd->vdev_ops == &vdev_spare_ops) {
1130                 for (c = 0; c < vd->vdev_children; c++) {
1131                         int err = zvol_dumpio_vdev(vd->vdev_child[c],
1132                             addr, offset, origoffset, size, doread, isdump);
1133                         if (err != 0) {
1134                                 numerrors++;
1135                         } else if (doread) {
1136                                 break;
1137                         }
1138                 }
1139         }
1140 
1141         if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1142                 return (numerrors < vd->vdev_children ? 0 : EIO);
1143 
1144         if (doread && !vdev_readable(vd))
1145                 return (SET_ERROR(EIO));
1146         else if (!doread && !vdev_writeable(vd))
1147                 return (SET_ERROR(EIO));
1148 
1149         if (vd->vdev_ops == &vdev_raidz_ops) {
1150                 return (vdev_raidz_physio(vd,
1151                     addr, size, offset, origoffset, doread, isdump));
1152         }
1153 
1154         offset += VDEV_LABEL_START_SIZE;
1155 
1156         if (ddi_in_panic() || isdump) {
1157                 ASSERT(!doread);
1158                 if (doread)
1159                         return (SET_ERROR(EIO));
1160                 dvd = vd->vdev_tsd;
1161                 ASSERT3P(dvd, !=, NULL);
1162                 return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1163                     lbtodb(size)));
1164         } else {
1165                 dvd = vd->vdev_tsd;
1166                 ASSERT3P(dvd, !=, NULL);
1167                 return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1168                     offset, doread ? B_READ : B_WRITE));
1169         }
1170 }
1171 
1172 static int
1173 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1174     boolean_t doread, boolean_t isdump)
1175 {
1176         vdev_t *vd;
1177         int error;
1178         zvol_extent_t *ze;
1179         spa_t *spa = dmu_objset_spa(zv->zv_objset);
1180 
1181         /* Must be sector aligned, and not stradle a block boundary. */
1182         if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1183             P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1184                 return (SET_ERROR(EINVAL));
1185         }
1186         ASSERT(size <= zv->zv_volblocksize);
1187 
1188         /* Locate the extent this belongs to */
1189         ze = list_head(&zv->zv_extents);
1190         while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1191                 offset -= ze->ze_nblks * zv->zv_volblocksize;
1192                 ze = list_next(&zv->zv_extents, ze);
1193         }
1194 
1195         if (ze == NULL)
1196                 return (SET_ERROR(EINVAL));
1197 
1198         if (!ddi_in_panic())
1199                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1200 
1201         vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1202         offset += DVA_GET_OFFSET(&ze->ze_dva);
1203         error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1204             size, doread, isdump);
1205 
1206         if (!ddi_in_panic())
1207                 spa_config_exit(spa, SCL_STATE, FTAG);
1208 
1209         return (error);
1210 }
1211 
1212 int
1213 zvol_strategy(buf_t *bp)
1214 {
1215         zfs_soft_state_t *zs = NULL;
1216         zvol_state_t *zv;
1217         uint64_t off, volsize;
1218         size_t resid;
1219         char *addr;
1220         objset_t *os;
1221         rl_t *rl;
1222         int error = 0;
1223         boolean_t doread = bp->b_flags & B_READ;
1224         boolean_t is_dumpified;
1225         boolean_t sync;
1226 
1227         if (getminor(bp->b_edev) == 0) {
1228                 error = SET_ERROR(EINVAL);
1229         } else {
1230                 zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
1231                 if (zs == NULL)
1232                         error = SET_ERROR(ENXIO);
1233                 else if (zs->zss_type != ZSST_ZVOL)
1234                         error = SET_ERROR(EINVAL);
1235         }
1236 
1237         if (error) {
1238                 bioerror(bp, error);
1239                 biodone(bp);
1240                 return (0);
1241         }
1242 
1243         zv = zs->zss_data;
1244 
1245         if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
1246                 bioerror(bp, EROFS);
1247                 biodone(bp);
1248                 return (0);
1249         }
1250 
1251         off = ldbtob(bp->b_blkno);
1252         volsize = zv->zv_volsize;
1253 
1254         os = zv->zv_objset;
1255         ASSERT(os != NULL);
1256 
1257         bp_mapin(bp);
1258         addr = bp->b_un.b_addr;
1259         resid = bp->b_bcount;
1260 
1261         if (resid > 0 && (off < 0 || off >= volsize)) {
1262                 bioerror(bp, EIO);
1263                 biodone(bp);
1264                 return (0);
1265         }
1266 
1267         is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1268         sync = ((!(bp->b_flags & B_ASYNC) &&
1269             !(zv->zv_flags & ZVOL_WCE)) ||
1270             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1271             !doread && !is_dumpified;
1272 
1273         /*
1274          * There must be no buffer changes when doing a dmu_sync() because
1275          * we can't change the data whilst calculating the checksum.
1276          */
1277         rl = zfs_range_lock(&zv->zv_znode, off, resid,
1278             doread ? RL_READER : RL_WRITER);
1279 
1280         while (resid != 0 && off < volsize) {
1281                 size_t size = MIN(resid, zvol_maxphys);
1282                 if (is_dumpified) {
1283                         size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1284                         error = zvol_dumpio(zv, addr, off, size,
1285                             doread, B_FALSE);
1286                 } else if (doread) {
1287                         error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1288                             DMU_READ_PREFETCH);
1289                 } else {
1290                         dmu_tx_t *tx = dmu_tx_create(os);
1291                         dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1292                         error = dmu_tx_assign(tx, TXG_WAIT);
1293                         if (error) {
1294                                 dmu_tx_abort(tx);
1295                         } else {
1296                                 dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1297                                 zvol_log_write(zv, tx, off, size, sync);
1298                                 dmu_tx_commit(tx);
1299                         }
1300                 }
1301                 if (error) {
1302                         /* convert checksum errors into IO errors */
1303                         if (error == ECKSUM)
1304                                 error = SET_ERROR(EIO);
1305                         break;
1306                 }
1307                 off += size;
1308                 addr += size;
1309                 resid -= size;
1310         }
1311         zfs_range_unlock(rl);
1312 
1313         if ((bp->b_resid = resid) == bp->b_bcount)
1314                 bioerror(bp, off > volsize ? EINVAL : error);
1315 
1316         if (sync)
1317                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1318         biodone(bp);
1319 
1320         return (0);
1321 }
1322 
1323 /*
1324  * Set the buffer count to the zvol maximum transfer.
1325  * Using our own routine instead of the default minphys()
1326  * means that for larger writes we write bigger buffers on X86
1327  * (128K instead of 56K) and flush the disk write cache less often
1328  * (every zvol_maxphys - currently 1MB) instead of minphys (currently
1329  * 56K on X86 and 128K on sparc).
1330  */
1331 void
1332 zvol_minphys(struct buf *bp)
1333 {
1334         if (bp->b_bcount > zvol_maxphys)
1335                 bp->b_bcount = zvol_maxphys;
1336 }
1337 
1338 int
1339 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1340 {
1341         minor_t minor = getminor(dev);
1342         zvol_state_t *zv;
1343         int error = 0;
1344         uint64_t size;
1345         uint64_t boff;
1346         uint64_t resid;
1347 
1348         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1349         if (zv == NULL)
1350                 return (SET_ERROR(ENXIO));
1351 
1352         if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
1353                 return (SET_ERROR(EINVAL));
1354 
1355         boff = ldbtob(blkno);
1356         resid = ldbtob(nblocks);
1357 
1358         VERIFY3U(boff + resid, <=, zv->zv_volsize);
1359 
1360         while (resid) {
1361                 size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1362                 error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1363                 if (error)
1364                         break;
1365                 boff += size;
1366                 addr += size;
1367                 resid -= size;
1368         }
1369 
1370         return (error);
1371 }
1372 
1373 /*ARGSUSED*/
1374 int
1375 zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1376 {
1377         minor_t minor = getminor(dev);
1378         zvol_state_t *zv;
1379         uint64_t volsize;
1380         rl_t *rl;
1381         int error = 0;
1382 
1383         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1384         if (zv == NULL)
1385                 return (SET_ERROR(ENXIO));
1386 
1387         volsize = zv->zv_volsize;
1388         if (uio->uio_resid > 0 &&
1389             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1390                 return (SET_ERROR(EIO));
1391 
1392         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1393                 error = physio(zvol_strategy, NULL, dev, B_READ,
1394                     zvol_minphys, uio);
1395                 return (error);
1396         }
1397 
1398         rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1399             RL_READER);
1400         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1401                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1402 
1403                 /* don't read past the end */
1404                 if (bytes > volsize - uio->uio_loffset)
1405                         bytes = volsize - uio->uio_loffset;
1406 
1407                 error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1408                 if (error) {
1409                         /* convert checksum errors into IO errors */
1410                         if (error == ECKSUM)
1411                                 error = SET_ERROR(EIO);
1412                         break;
1413                 }
1414         }
1415         zfs_range_unlock(rl);
1416         return (error);
1417 }
1418 
1419 /*ARGSUSED*/
1420 int
1421 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1422 {
1423         minor_t minor = getminor(dev);
1424         zvol_state_t *zv;
1425         uint64_t volsize;
1426         rl_t *rl;
1427         int error = 0;
1428         boolean_t sync;
1429 
1430         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1431         if (zv == NULL)
1432                 return (SET_ERROR(ENXIO));
1433 
1434         volsize = zv->zv_volsize;
1435         if (uio->uio_resid > 0 &&
1436             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1437                 return (SET_ERROR(EIO));
1438 
1439         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1440                 error = physio(zvol_strategy, NULL, dev, B_WRITE,
1441                     zvol_minphys, uio);
1442                 return (error);
1443         }
1444 
1445         sync = !(zv->zv_flags & ZVOL_WCE) ||
1446             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1447 
1448         rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1449             RL_WRITER);
1450         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1451                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1452                 uint64_t off = uio->uio_loffset;
1453                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1454 
1455                 if (bytes > volsize - off)   /* don't write past the end */
1456                         bytes = volsize - off;
1457 
1458                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1459                 error = dmu_tx_assign(tx, TXG_WAIT);
1460                 if (error) {
1461                         dmu_tx_abort(tx);
1462                         break;
1463                 }
1464                 error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1465                 if (error == 0)
1466                         zvol_log_write(zv, tx, off, bytes, sync);
1467                 dmu_tx_commit(tx);
1468 
1469                 if (error)
1470                         break;
1471         }
1472         zfs_range_unlock(rl);
1473         if (sync)
1474                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1475         return (error);
1476 }
1477 
1478 int
1479 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1480 {
1481         struct uuid uuid = EFI_RESERVED;
1482         efi_gpe_t gpe = { 0 };
1483         uint32_t crc;
1484         dk_efi_t efi;
1485         int length;
1486         char *ptr;
1487 
1488         if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1489                 return (SET_ERROR(EFAULT));
1490         ptr = (char *)(uintptr_t)efi.dki_data_64;
1491         length = efi.dki_length;
1492         /*
1493          * Some clients may attempt to request a PMBR for the
1494          * zvol.  Currently this interface will return EINVAL to
1495          * such requests.  These requests could be supported by
1496          * adding a check for lba == 0 and consing up an appropriate
1497          * PMBR.
1498          */
1499         if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1500                 return (SET_ERROR(EINVAL));
1501 
1502         gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1503         gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1504         UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1505 
1506         if (efi.dki_lba == 1) {
1507                 efi_gpt_t gpt = { 0 };
1508 
1509                 gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1510                 gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1511                 gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1512                 gpt.efi_gpt_MyLBA = LE_64(1ULL);
1513                 gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1514                 gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1515                 gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1516                 gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1517                 gpt.efi_gpt_SizeOfPartitionEntry =
1518                     LE_32(sizeof (efi_gpe_t));
1519                 CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1520                 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1521                 CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1522                 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1523                 if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1524                     flag))
1525                         return (SET_ERROR(EFAULT));
1526                 ptr += sizeof (gpt);
1527                 length -= sizeof (gpt);
1528         }
1529         if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1530             length), flag))
1531                 return (SET_ERROR(EFAULT));
1532         return (0);
1533 }
1534 
1535 /*
1536  * BEGIN entry points to allow external callers access to the volume.
1537  */
1538 /*
1539  * Return the volume parameters needed for access from an external caller.
1540  * These values are invariant as long as the volume is held open.
1541  */
1542 int
1543 zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1544     uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1545     void **rl_hdl, void **bonus_hdl)
1546 {
1547         zvol_state_t *zv;
1548 
1549         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1550         if (zv == NULL)
1551                 return (SET_ERROR(ENXIO));
1552         if (zv->zv_flags & ZVOL_DUMPIFIED)
1553                 return (SET_ERROR(ENXIO));
1554 
1555         ASSERT(blksize && max_xfer_len && minor_hdl &&
1556             objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1557 
1558         *blksize = zv->zv_volblocksize;
1559         *max_xfer_len = (uint64_t)zvol_maxphys;
1560         *minor_hdl = zv;
1561         *objset_hdl = zv->zv_objset;
1562         *zil_hdl = zv->zv_zilog;
1563         *rl_hdl = &zv->zv_znode;
1564         *bonus_hdl = zv->zv_dbuf;
1565         return (0);
1566 }
1567 
1568 /*
1569  * Return the current volume size to an external caller.
1570  * The size can change while the volume is open.
1571  */
1572 uint64_t
1573 zvol_get_volume_size(void *minor_hdl)
1574 {
1575         zvol_state_t *zv = minor_hdl;
1576 
1577         return (zv->zv_volsize);
1578 }
1579 
1580 /*
1581  * Return the current WCE setting to an external caller.
1582  * The WCE setting can change while the volume is open.
1583  */
1584 int
1585 zvol_get_volume_wce(void *minor_hdl)
1586 {
1587         zvol_state_t *zv = minor_hdl;
1588 
1589         return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1590 }
1591 
1592 /*
1593  * Entry point for external callers to zvol_log_write
1594  */
1595 void
1596 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1597     boolean_t sync)
1598 {
1599         zvol_state_t *zv = minor_hdl;
1600 
1601         zvol_log_write(zv, tx, off, resid, sync);
1602 }
1603 /*
1604  * END entry points to allow external callers access to the volume.
1605  */
1606 
1607 /*
1608  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1609  */
1610 static void
1611 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1612     boolean_t sync)
1613 {
1614         itx_t *itx;
1615         lr_truncate_t *lr;
1616         zilog_t *zilog = zv->zv_zilog;
1617 
1618         if (zil_replaying(zilog, tx))
1619                 return;
1620 
1621         itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1622         lr = (lr_truncate_t *)&itx->itx_lr;
1623         lr->lr_foid = ZVOL_OBJ;
1624         lr->lr_offset = off;
1625         lr->lr_length = len;
1626 
1627         itx->itx_sync = sync;
1628         zil_itx_assign(zilog, itx, tx);
1629 }
1630 
1631 /*
1632  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1633  * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1634  */
1635 /*ARGSUSED*/
1636 int
1637 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1638 {
1639         zvol_state_t *zv;
1640         struct dk_callback *dkc;
1641         int error = 0;
1642         rl_t *rl;
1643 
1644         mutex_enter(&zfsdev_state_lock);
1645 
1646         zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1647 
1648         if (zv == NULL) {
1649                 mutex_exit(&zfsdev_state_lock);
1650                 return (SET_ERROR(ENXIO));
1651         }
1652         ASSERT(zv->zv_total_opens > 0);
1653 
1654         switch (cmd) {
1655 
1656         case DKIOCINFO:
1657         {
1658                 struct dk_cinfo dki;
1659 
1660                 bzero(&dki, sizeof (dki));
1661                 (void) strcpy(dki.dki_cname, "zvol");
1662                 (void) strcpy(dki.dki_dname, "zvol");
1663                 dki.dki_ctype = DKC_UNKNOWN;
1664                 dki.dki_unit = getminor(dev);
1665                 dki.dki_maxtransfer =
1666                     1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
1667                 mutex_exit(&zfsdev_state_lock);
1668                 if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1669                         error = SET_ERROR(EFAULT);
1670                 return (error);
1671         }
1672 
1673         case DKIOCGMEDIAINFO:
1674         {
1675                 struct dk_minfo dkm;
1676 
1677                 bzero(&dkm, sizeof (dkm));
1678                 dkm.dki_lbsize = 1U << zv->zv_min_bs;
1679                 dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1680                 dkm.dki_media_type = DK_UNKNOWN;
1681                 mutex_exit(&zfsdev_state_lock);
1682                 if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1683                         error = SET_ERROR(EFAULT);
1684                 return (error);
1685         }
1686 
1687         case DKIOCGMEDIAINFOEXT:
1688         {
1689                 struct dk_minfo_ext dkmext;
1690 
1691                 bzero(&dkmext, sizeof (dkmext));
1692                 dkmext.dki_lbsize = 1U << zv->zv_min_bs;
1693                 dkmext.dki_pbsize = zv->zv_volblocksize;
1694                 dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1695                 dkmext.dki_media_type = DK_UNKNOWN;
1696                 mutex_exit(&zfsdev_state_lock);
1697                 if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1698                         error = SET_ERROR(EFAULT);
1699                 return (error);
1700         }
1701 
1702         case DKIOCGETEFI:
1703         {
1704                 uint64_t vs = zv->zv_volsize;
1705                 uint8_t bs = zv->zv_min_bs;
1706 
1707                 mutex_exit(&zfsdev_state_lock);
1708                 error = zvol_getefi((void *)arg, flag, vs, bs);
1709                 return (error);
1710         }
1711 
1712         case DKIOCFLUSHWRITECACHE:
1713                 dkc = (struct dk_callback *)arg;
1714                 mutex_exit(&zfsdev_state_lock);
1715                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1716                 if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1717                         (*dkc->dkc_callback)(dkc->dkc_cookie, error);
1718                         error = 0;
1719                 }
1720                 return (error);
1721 
1722         case DKIOCGETWCE:
1723         {
1724                 int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1725                 if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1726                     flag))
1727                         error = SET_ERROR(EFAULT);
1728                 break;
1729         }
1730         case DKIOCSETWCE:
1731         {
1732                 int wce;
1733                 if (ddi_copyin((void *)arg, &wce, sizeof (int),
1734                     flag)) {
1735                         error = SET_ERROR(EFAULT);
1736                         break;
1737                 }
1738                 if (wce) {
1739                         zv->zv_flags |= ZVOL_WCE;
1740                         mutex_exit(&zfsdev_state_lock);
1741                 } else {
1742                         zv->zv_flags &= ~ZVOL_WCE;
1743                         mutex_exit(&zfsdev_state_lock);
1744                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1745                 }
1746                 return (0);
1747         }
1748 
1749         case DKIOCGGEOM:
1750         case DKIOCGVTOC:
1751                 /*
1752                  * commands using these (like prtvtoc) expect ENOTSUP
1753                  * since we're emulating an EFI label
1754                  */
1755                 error = SET_ERROR(ENOTSUP);
1756                 break;
1757 
1758         case DKIOCDUMPINIT:
1759                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1760                     RL_WRITER);
1761                 error = zvol_dumpify(zv);
1762                 zfs_range_unlock(rl);
1763                 break;
1764 
1765         case DKIOCDUMPFINI:
1766                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1767                         break;
1768                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1769                     RL_WRITER);
1770                 error = zvol_dump_fini(zv);
1771                 zfs_range_unlock(rl);
1772                 break;
1773 
1774         case DKIOCFREE:
1775         {
1776                 dkioc_free_t df;
1777                 dmu_tx_t *tx;
1778 
1779                 if (!zvol_unmap_enabled)
1780                         break;
1781 
1782                 if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1783                         error = SET_ERROR(EFAULT);
1784                         break;
1785                 }
1786 
1787                 /*
1788                  * Apply Postel's Law to length-checking.  If they overshoot,
1789                  * just blank out until the end, if there's a need to blank
1790                  * out anything.
1791                  */
1792                 if (df.df_start >= zv->zv_volsize)
1793                         break;  /* No need to do anything... */
1794 
1795                 mutex_exit(&zfsdev_state_lock);
1796 
1797                 rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1798                     RL_WRITER);
1799                 tx = dmu_tx_create(zv->zv_objset);
1800                 dmu_tx_mark_netfree(tx);
1801                 error = dmu_tx_assign(tx, TXG_WAIT);
1802                 if (error != 0) {
1803                         dmu_tx_abort(tx);
1804                 } else {
1805                         zvol_log_truncate(zv, tx, df.df_start,
1806                             df.df_length, B_TRUE);
1807                         dmu_tx_commit(tx);
1808                         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1809                             df.df_start, df.df_length);
1810                 }
1811 
1812                 zfs_range_unlock(rl);
1813 
1814                 if (error == 0) {
1815                         /*
1816                          * If the write-cache is disabled or 'sync' property
1817                          * is set to 'always' then treat this as a synchronous
1818                          * operation (i.e. commit to zil).
1819                          */
1820                         if (!(zv->zv_flags & ZVOL_WCE) ||
1821                             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
1822                                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1823 
1824                         /*
1825                          * If the caller really wants synchronous writes, and
1826                          * can't wait for them, don't return until the write
1827                          * is done.
1828                          */
1829                         if (df.df_flags & DF_WAIT_SYNC) {
1830                                 txg_wait_synced(
1831                                     dmu_objset_pool(zv->zv_objset), 0);
1832                         }
1833                 }
1834                 return (error);
1835         }
1836 
1837         default:
1838                 error = SET_ERROR(ENOTTY);
1839                 break;
1840 
1841         }
1842         mutex_exit(&zfsdev_state_lock);
1843         return (error);
1844 }
1845 
1846 int
1847 zvol_busy(void)
1848 {
1849         return (zvol_minors != 0);
1850 }
1851 
/*
 * Set up the global state shared by the zvol code: initialize the
 * zfsdev_state soft-state anchor for items of size zfs_soft_state_t and
 * create the zfsdev_state_lock mutex.
 *
 * A failure of ddi_soft_state_init() trips the VERIFY, so this function
 * has no error return.
 */
void
zvol_init(void)
{
        VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
            1) == 0);
        mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
}
1859 
/*
 * Tear down the global state created by zvol_init(): destroy the
 * zfsdev_state_lock mutex and then release the zfsdev_state soft-state
 * anchor.
 */
void
zvol_fini(void)
{
        mutex_destroy(&zfsdev_state_lock);
        ddi_soft_state_fini(&zfsdev_state);
}
1866 
1867 /*ARGSUSED*/
1868 static int
1869 zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1870 {
1871         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1872 
1873         if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1874                 return (1);
1875         return (0);
1876 }
1877 
1878 /*ARGSUSED*/
1879 static void
1880 zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1881 {
1882         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1883 
1884         spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
1885 }
1886 
1887 static int
1888 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1889 {
1890         dmu_tx_t *tx;
1891         int error;
1892         objset_t *os = zv->zv_objset;
1893         spa_t *spa = dmu_objset_spa(os);
1894         vdev_t *vd = spa->spa_root_vdev;
1895         nvlist_t *nv = NULL;
1896         uint64_t version = spa_version(spa);
1897         uint64_t checksum, compress, refresrv, vbs, dedup;
1898 
1899         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
1900         ASSERT(vd->vdev_ops == &vdev_root_ops);
1901 
1902         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1903             DMU_OBJECT_END);
1904         if (error != 0)
1905                 return (error);
1906         /* wait for dmu_free_long_range to actually free the blocks */
1907         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1908 
1909         /*
1910          * If the pool on which the dump device is being initialized has more
1911          * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
1912          * enabled.  If so, bump that feature's counter to indicate that the
1913          * feature is active. We also check the vdev type to handle the
1914          * following case:
1915          *   # zpool create test raidz disk1 disk2 disk3
1916          *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
1917          *   the raidz vdev itself has 3 children.
1918          */
1919         if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
1920                 if (!spa_feature_is_enabled(spa,
1921                     SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1922                         return (SET_ERROR(ENOTSUP));
1923                 (void) dsl_sync_task(spa_name(spa),
1924                     zfs_mvdev_dump_feature_check,
1925                     zfs_mvdev_dump_activate_feature_sync, NULL,
1926                     2, ZFS_SPACE_CHECK_RESERVED);
1927         }
1928 
1929         if (!resize) {
1930                 error = dsl_prop_get_integer(zv->zv_name,
1931                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
1932                 if (error == 0) {
1933                         error = dsl_prop_get_integer(zv->zv_name,
1934                             zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
1935                             NULL);
1936                 }
1937                 if (error == 0) {
1938                         error = dsl_prop_get_integer(zv->zv_name,
1939                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
1940                             &refresrv, NULL);
1941                 }
1942                 if (error == 0) {
1943                         error = dsl_prop_get_integer(zv->zv_name,
1944                             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
1945                             NULL);
1946                 }
1947                 if (version >= SPA_VERSION_DEDUP && error == 0) {
1948                         error = dsl_prop_get_integer(zv->zv_name,
1949                             zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
1950                 }
1951         }
1952         if (error != 0)
1953                 return (error);
1954 
1955         tx = dmu_tx_create(os);
1956         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1957         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
1958         error = dmu_tx_assign(tx, TXG_WAIT);
1959         if (error != 0) {
1960                 dmu_tx_abort(tx);
1961                 return (error);
1962         }
1963 
1964         /*
1965          * If we are resizing the dump device then we only need to
1966          * update the refreservation to match the newly updated
1967          * zvolsize. Otherwise, we save off the original state of the
1968          * zvol so that we can restore them if the zvol is ever undumpified.
1969          */
1970         if (resize) {
1971                 error = zap_update(os, ZVOL_ZAP_OBJ,
1972                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1973                     &zv->zv_volsize, tx);
1974         } else {
1975                 error = zap_update(os, ZVOL_ZAP_OBJ,
1976                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
1977                     &compress, tx);
1978                 if (error == 0) {
1979                         error = zap_update(os, ZVOL_ZAP_OBJ,
1980                             zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
1981                             &checksum, tx);
1982                 }
1983                 if (error == 0) {
1984                         error = zap_update(os, ZVOL_ZAP_OBJ,
1985                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1986                             &refresrv, tx);
1987                 }
1988                 if (error == 0) {
1989                         error = zap_update(os, ZVOL_ZAP_OBJ,
1990                             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
1991                             &vbs, tx);
1992                 }
1993                 if (error == 0) {
1994                         error = dmu_object_set_blocksize(
1995                             os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
1996                 }
1997                 if (version >= SPA_VERSION_DEDUP && error == 0) {
1998                         error = zap_update(os, ZVOL_ZAP_OBJ,
1999                             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
2000                             &dedup, tx);
2001                 }
2002                 if (error == 0)
2003                         zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
2004         }
2005         dmu_tx_commit(tx);
2006 
2007         /*
2008          * We only need update the zvol's property if we are initializing
2009          * the dump area for the first time.
2010          */
2011         if (error == 0 && !resize) {
2012                 /*
2013                  * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
2014                  * function.  Otherwise, use the old default -- OFF.
2015                  */
2016                 checksum = spa_feature_is_active(spa,
2017                     SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2018                     ZIO_CHECKSUM_OFF;
2019 
2020                 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2021                 VERIFY(nvlist_add_uint64(nv,
2022                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
2023                 VERIFY(nvlist_add_uint64(nv,
2024                     zfs_prop_to_name(ZFS_PROP_COMPRESSION),
2025                     ZIO_COMPRESS_OFF) == 0);
2026                 VERIFY(nvlist_add_uint64(nv,
2027                     zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2028                     checksum) == 0);
2029                 if (version >= SPA_VERSION_DEDUP) {
2030                         VERIFY(nvlist_add_uint64(nv,
2031                             zfs_prop_to_name(ZFS_PROP_DEDUP),
2032                             ZIO_CHECKSUM_OFF) == 0);
2033                 }
2034 
2035                 error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2036                     nv, NULL);
2037                 nvlist_free(nv);
2038         }
2039 
2040         /* Allocate the space for the dump */
2041         if (error == 0)
2042                 error = zvol_prealloc(zv);
2043         return (error);
2044 }
2045 
2046 static int
2047 zvol_dumpify(zvol_state_t *zv)
2048 {
2049         int error = 0;
2050         uint64_t dumpsize = 0;
2051         dmu_tx_t *tx;
2052         objset_t *os = zv->zv_objset;
2053 
2054         if (zv->zv_flags & ZVOL_RDONLY)
2055                 return (SET_ERROR(EROFS));
2056 
2057         if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2058             8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2059                 boolean_t resize = (dumpsize > 0);
2060 
2061                 if ((error = zvol_dump_init(zv, resize)) != 0) {
2062                         (void) zvol_dump_fini(zv);
2063                         return (error);
2064                 }
2065         }
2066 
2067         /*
2068          * Build up our lba mapping.
2069          */
2070         error = zvol_get_lbas(zv);
2071         if (error) {
2072                 (void) zvol_dump_fini(zv);
2073                 return (error);
2074         }
2075 
2076         tx = dmu_tx_create(os);
2077         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2078         error = dmu_tx_assign(tx, TXG_WAIT);
2079         if (error) {
2080                 dmu_tx_abort(tx);
2081                 (void) zvol_dump_fini(zv);
2082                 return (error);
2083         }
2084 
2085         zv->zv_flags |= ZVOL_DUMPIFIED;
2086         error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2087             &zv->zv_volsize, tx);
2088         dmu_tx_commit(tx);
2089 
2090         if (error) {
2091                 (void) zvol_dump_fini(zv);
2092                 return (error);
2093         }
2094 
2095         txg_wait_synced(dmu_objset_pool(os), 0);
2096         return (0);
2097 }
2098 
2099 static int
2100 zvol_dump_fini(zvol_state_t *zv)
2101 {
2102         dmu_tx_t *tx;
2103         objset_t *os = zv->zv_objset;
2104         nvlist_t *nv;
2105         int error = 0;
2106         uint64_t checksum, compress, refresrv, vbs, dedup;
2107         uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2108 
2109         /*
2110          * Attempt to restore the zvol back to its pre-dumpified state.
2111          * This is a best-effort attempt as it's possible that not all
2112          * of these properties were initialized during the dumpify process
2113          * (i.e. error during zvol_dump_init).
2114          */
2115 
2116         tx = dmu_tx_create(os);
2117         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2118         error = dmu_tx_assign(tx, TXG_WAIT);
2119         if (error) {
2120                 dmu_tx_abort(tx);
2121                 return (error);
2122         }
2123         (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2124         dmu_tx_commit(tx);
2125 
2126         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2127             zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2128         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2129             zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2130         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2131             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2132         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2133             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2134 
2135         VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2136         (void) nvlist_add_uint64(nv,
2137             zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2138         (void) nvlist_add_uint64(nv,
2139             zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2140         (void) nvlist_add_uint64(nv,
2141             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2142         if (version >= SPA_VERSION_DEDUP &&
2143             zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2144             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2145                 (void) nvlist_add_uint64(nv,
2146                     zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2147         }
2148         (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2149             nv, NULL);
2150         nvlist_free(nv);
2151 
2152         zvol_free_extents(zv);
2153         zv->zv_flags &= ~ZVOL_DUMPIFIED;
2154         (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2155         /* wait for dmu_free_long_range to actually free the blocks */
2156         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2157         tx = dmu_tx_create(os);
2158         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2159         error = dmu_tx_assign(tx, TXG_WAIT);
2160         if (error) {
2161                 dmu_tx_abort(tx);
2162                 return (error);
2163         }
2164         if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2165                 zv->zv_volblocksize = vbs;
2166         dmu_tx_commit(tx);
2167 
2168         return (0);
2169 }