1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2013 by Delphix. All rights reserved.
  28  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  29  */
  30 
  31 /*
  32  * ZFS volume emulation driver.
  33  *
  34  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  35  * Volumes are accessed through the symbolic links named:
  36  *
  37  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  38  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  39  *
  40  * These links are created by the /dev filesystem (sdev_zvolops.c).
  41  * Volumes are persistent through reboot.  No user command needs to be
  42  * run before opening and using a device.
  43  */
  44 
  45 #include <sys/types.h>
  46 #include <sys/param.h>
  47 #include <sys/errno.h>
  48 #include <sys/uio.h>
  49 #include <sys/buf.h>
  50 #include <sys/modctl.h>
  51 #include <sys/open.h>
  52 #include <sys/kmem.h>
  53 #include <sys/conf.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/stat.h>
  56 #include <sys/zap.h>
  57 #include <sys/spa.h>
  58 #include <sys/spa_impl.h>
  59 #include <sys/zio.h>
  60 #include <sys/dmu_traverse.h>
  61 #include <sys/dnode.h>
  62 #include <sys/dsl_dataset.h>
  63 #include <sys/dsl_prop.h>
  64 #include <sys/dkio.h>
  65 #include <sys/efi_partition.h>
  66 #include <sys/byteorder.h>
  67 #include <sys/pathname.h>
  68 #include <sys/ddi.h>
  69 #include <sys/sunddi.h>
  70 #include <sys/crc32.h>
  71 #include <sys/dirent.h>
  72 #include <sys/policy.h>
  73 #include <sys/fs/zfs.h>
  74 #include <sys/zfs_ioctl.h>
  75 #include <sys/mkdev.h>
  76 #include <sys/zil.h>
  77 #include <sys/refcount.h>
  78 #include <sys/zfs_znode.h>
  79 #include <sys/zfs_rlock.h>
  80 #include <sys/vdev_disk.h>
  81 #include <sys/vdev_impl.h>
  82 #include <sys/vdev_raidz.h>
  83 #include <sys/zvol.h>
  84 #include <sys/dumphdr.h>
  85 #include <sys/zil_impl.h>
  86 #include <sys/dbuf.h>
  87 #include <sys/dmu_tx.h>
  88 #include <sys/zfeature.h>
  89 #include <sys/zio_checksum.h>
  90 
  91 #include "zfs_namecheck.h"
  92 
/* Soft-state anchor passed to the ddi_soft_state_*() calls in this file. */
void *zfsdev_state;
/* Tag used when owning the objset and holding the bonus buffer on open. */
static char *zvol_tag = "zvol_tag";

/* NOTE(review): presumably the ZAP key for the dump size; not used here. */
#define ZVOL_DUMPSIZE           "dumpsize"

/*
 * This lock protects the zfsdev_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
kmutex_t zfsdev_state_lock;
/* Incremented by zvol_create_minor(), decremented by zvol_remove_zv(). */
static uint32_t zvol_minors;
 106 
/*
 * One entry on a volume's zv_extents list.  zvol_map_block() grows an
 * extent while successive blocks land on the same vdev at consecutive
 * offsets, and starts a new one otherwise.
 */
typedef struct zvol_extent {
        list_node_t     ze_node;        /* linkage on zv_extents */
        dva_t           ze_dva;         /* dva associated with this extent */
        uint64_t        ze_nblks;       /* number of blocks in extent */
} zvol_extent_t;
 112 
/*
 * The in-core state of each volume.
 *
 * Allocated by zvol_create_minor() and freed by zvol_remove_zv().
 * zv_objset, zv_zilog and zv_dbuf are filled in by zvol_first_open() and
 * cleared again by zvol_last_close().
 */
typedef struct zvol_state {
        char            zv_name[MAXPATHLEN]; /* pool/dd name */
        uint64_t        zv_volsize;     /* amount of space we advertise */
        uint64_t        zv_volblocksize; /* volume block size */
        minor_t         zv_minor;       /* minor number */
        uint8_t         zv_min_bs;      /* minimum addressable block shift */
        uint8_t         zv_flags;       /* readonly, dumpified, etc. */
        objset_t        *zv_objset;     /* objset handle */
        uint32_t        zv_open_count[OTYPCNT]; /* open counts */
        uint32_t        zv_total_opens; /* total open count */
        zilog_t         *zv_zilog;      /* ZIL handle */
        list_t          zv_extents;     /* List of extents for dump */
        znode_t         zv_znode;       /* for range locking */
        dmu_buf_t       *zv_dbuf;       /* bonus handle */
} zvol_state_t;
 131 
/*
 * zvol specific flags (bits of zv_flags)
 *
 * ZVOL_RDONLY    - writes are refused; opens with FWRITE fail with EROFS
 * ZVOL_DUMPIFIED - checked on resize to decide whether to redo the dump
 *                  setup
 * ZVOL_EXCL      - set by an FEXCL open, cleared on close; further opens
 *                  fail with EBUSY while it is set
 * ZVOL_WCE       - NOTE(review): presumably "write cache enabled"; not
 *                  referenced in this part of the file
 */
#define ZVOL_RDONLY     0x1
#define ZVOL_DUMPIFIED  0x2
#define ZVOL_EXCL       0x4
#define ZVOL_WCE        0x8

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;
 144 
/* Defined outside this file. */
extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
/* Forward declarations for static functions used before their definition. */
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 152 
 153 static void
 154 zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 155 {
 156         dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
 157 
 158         zv->zv_volsize = volsize;
 159         VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 160             "Size", volsize) == DDI_SUCCESS);
 161         VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 162             "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
 163 
 164         /* Notify specfs to invalidate the cached size */
 165         spec_size_invalidate(dev, VBLK);
 166         spec_size_invalidate(dev, VCHR);
 167 }
 168 
 169 int
 170 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 171 {
 172         if (volsize == 0)
 173                 return (SET_ERROR(EINVAL));
 174 
 175         if (volsize % blocksize != 0)
 176                 return (SET_ERROR(EINVAL));
 177 
 178 #ifdef _ILP32
 179         if (volsize - 1 > SPEC_MAXOFFSET_T)
 180                 return (SET_ERROR(EOVERFLOW));
 181 #endif
 182         return (0);
 183 }
 184 
 185 int
 186 zvol_check_volblocksize(uint64_t volblocksize)
 187 {
 188         if (volblocksize < SPA_MINBLOCKSIZE ||
 189             volblocksize > SPA_MAXBLOCKSIZE ||
 190             !ISP2(volblocksize))
 191                 return (SET_ERROR(EDOM));
 192 
 193         return (0);
 194 }
 195 
 196 int
 197 zvol_get_stats(objset_t *os, nvlist_t *nv)
 198 {
 199         int error;
 200         dmu_object_info_t doi;
 201         uint64_t val;
 202 
 203         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
 204         if (error)
 205                 return (error);
 206 
 207         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
 208 
 209         error = dmu_object_info(os, ZVOL_OBJ, &doi);
 210 
 211         if (error == 0) {
 212                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 213                     doi.doi_data_block_size);
 214         }
 215 
 216         return (error);
 217 }
 218 
 219 static zvol_state_t *
 220 zvol_minor_lookup(const char *name)
 221 {
 222         minor_t minor;
 223         zvol_state_t *zv;
 224 
 225         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 226 
 227         for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 228                 zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 229                 if (zv == NULL)
 230                         continue;
 231                 if (strcmp(zv->zv_name, name) == 0)
 232                         return (zv);
 233         }
 234 
 235         return (NULL);
 236 }
 237 
/*
 * extent mapping arg
 *
 * Passed through traverse_dataset() to zvol_map_block() by zvol_get_lbas().
 */
struct maparg {
        zvol_state_t    *ma_zv;         /* volume whose extents are built */
        uint64_t        ma_blks;        /* level-0 blocks visited so far */
};
 243 
 244 /*ARGSUSED*/
 245 static int
 246 zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 247     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 248 {
 249         struct maparg *ma = arg;
 250         zvol_extent_t *ze;
 251         int bs = ma->ma_zv->zv_volblocksize;
 252 
 253         if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 254                 return (0);
 255 
 256         VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
 257         ma->ma_blks++;
 258 
 259         /* Abort immediately if we have encountered gang blocks */
 260         if (BP_IS_GANG(bp))
 261                 return (SET_ERROR(EFRAGS));
 262 
 263         /*
 264          * See if the block is at the end of the previous extent.
 265          */
 266         ze = list_tail(&ma->ma_zv->zv_extents);
 267         if (ze &&
 268             DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
 269             DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
 270             DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
 271                 ze->ze_nblks++;
 272                 return (0);
 273         }
 274 
 275         dprintf_bp(bp, "%s", "next blkptr:");
 276 
 277         /* start a new extent */
 278         ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
 279         ze->ze_dva = bp->blk_dva[0];      /* structure assignment */
 280         ze->ze_nblks = 1;
 281         list_insert_tail(&ma->ma_zv->zv_extents, ze);
 282         return (0);
 283 }
 284 
 285 static void
 286 zvol_free_extents(zvol_state_t *zv)
 287 {
 288         zvol_extent_t *ze;
 289 
 290         while (ze = list_head(&zv->zv_extents)) {
 291                 list_remove(&zv->zv_extents, ze);
 292                 kmem_free(ze, sizeof (zvol_extent_t));
 293         }
 294 }
 295 
 296 static int
 297 zvol_get_lbas(zvol_state_t *zv)
 298 {
 299         objset_t *os = zv->zv_objset;
 300         struct maparg   ma;
 301         int             err;
 302 
 303         ma.ma_zv = zv;
 304         ma.ma_blks = 0;
 305         zvol_free_extents(zv);
 306 
 307         /* commit any in-flight changes before traversing the dataset */
 308         txg_wait_synced(dmu_objset_pool(os), 0);
 309         err = traverse_dataset(dmu_objset_ds(os), 0,
 310             TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
 311         if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
 312                 zvol_free_extents(zv);
 313                 return (err ? err : EIO);
 314         }
 315 
 316         return (0);
 317 }
 318 
 319 /* ARGSUSED */
 320 void
 321 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 322 {
 323         zfs_creat_t *zct = arg;
 324         nvlist_t *nvprops = zct->zct_props;
 325         int error;
 326         uint64_t volblocksize, volsize;
 327 
 328         VERIFY(nvlist_lookup_uint64(nvprops,
 329             zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
 330         if (nvlist_lookup_uint64(nvprops,
 331             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 332                 volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 333 
 334         /*
 335          * These properties must be removed from the list so the generic
 336          * property setting step won't apply to them.
 337          */
 338         VERIFY(nvlist_remove_all(nvprops,
 339             zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 340         (void) nvlist_remove_all(nvprops,
 341             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 342 
 343         error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 344             DMU_OT_NONE, 0, tx);
 345         ASSERT(error == 0);
 346 
 347         error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 348             DMU_OT_NONE, 0, tx);
 349         ASSERT(error == 0);
 350 
 351         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 352         ASSERT(error == 0);
 353 }
 354 
 355 /*
 356  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 357  * implement DKIOCFREE/free-long-range.
 358  */
 359 static int
 360 zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
 361 {
 362         uint64_t offset, length;
 363 
 364         if (byteswap)
 365                 byteswap_uint64_array(lr, sizeof (*lr));
 366 
 367         offset = lr->lr_offset;
 368         length = lr->lr_length;
 369 
 370         return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
 371 }
 372 
 373 /*
 374  * Replay a TX_WRITE ZIL transaction that didn't get committed
 375  * after a system failure
 376  */
 377 static int
 378 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 379 {
 380         objset_t *os = zv->zv_objset;
 381         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 382         uint64_t offset, length;
 383         dmu_tx_t *tx;
 384         int error;
 385 
 386         if (byteswap)
 387                 byteswap_uint64_array(lr, sizeof (*lr));
 388 
 389         offset = lr->lr_offset;
 390         length = lr->lr_length;
 391 
 392         /* If it's a dmu_sync() block, write the whole block */
 393         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 394                 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 395                 if (length < blocksize) {
 396                         offset -= offset % blocksize;
 397                         length = blocksize;
 398                 }
 399         }
 400 
 401         tx = dmu_tx_create(os);
 402         dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 403         error = dmu_tx_assign(tx, TXG_WAIT);
 404         if (error) {
 405                 dmu_tx_abort(tx);
 406         } else {
 407                 dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 408                 dmu_tx_commit(tx);
 409         }
 410 
 411         return (error);
 412 }
 413 
 414 /* ARGSUSED */
 415 static int
 416 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
 417 {
 418         return (SET_ERROR(ENOTSUP));
 419 }
 420 
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 *
 * The table is initialized positionally, so an entry's slot (not the
 * comment beside it) decides which record type it serves.  Every listed
 * slot other than the write and truncate handlers rejects its record
 * through zvol_replay_err().
 * NOTE(review): the initializer lists 20 entries; confirm that their
 * order and count still line up with the TX_* numbering and TX_MAX_TYPE.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
        zvol_replay_err,        /* 0 no such transaction type */
        zvol_replay_err,        /* TX_CREATE */
        zvol_replay_err,        /* TX_MKDIR */
        zvol_replay_err,        /* TX_MKXATTR */
        zvol_replay_err,        /* TX_SYMLINK */
        zvol_replay_err,        /* TX_REMOVE */
        zvol_replay_err,        /* TX_RMDIR */
        zvol_replay_err,        /* TX_LINK */
        zvol_replay_err,        /* TX_RENAME */
        zvol_replay_write,      /* TX_WRITE */
        zvol_replay_truncate,   /* TX_TRUNCATE */
        zvol_replay_err,        /* TX_SETATTR */
        zvol_replay_err,        /* TX_ACL */
        zvol_replay_err,        /* TX_CREATE_ACL */
        zvol_replay_err,        /* TX_CREATE_ATTR */
        zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL */
        zvol_replay_err,        /* TX_MKDIR_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
        zvol_replay_err,        /* TX_WRITE2 */
};
 447 
 448 int
 449 zvol_name2minor(const char *name, minor_t *minor)
 450 {
 451         zvol_state_t *zv;
 452 
 453         mutex_enter(&zfsdev_state_lock);
 454         zv = zvol_minor_lookup(name);
 455         if (minor && zv)
 456                 *minor = zv->zv_minor;
 457         mutex_exit(&zfsdev_state_lock);
 458         return (zv ? 0 : -1);
 459 }
 460 
 461 /*
 462  * Create a minor node (plus a whole lot more) for the specified volume.
 463  */
 464 int
 465 zvol_create_minor(const char *name)
 466 {
 467         zfs_soft_state_t *zs;
 468         zvol_state_t *zv;
 469         objset_t *os;
 470         dmu_object_info_t doi;
 471         minor_t minor = 0;
 472         char chrbuf[30], blkbuf[30];
 473         int error;
 474 
 475         mutex_enter(&zfsdev_state_lock);
 476 
 477         if (zvol_minor_lookup(name) != NULL) {
 478                 mutex_exit(&zfsdev_state_lock);
 479                 return (SET_ERROR(EEXIST));
 480         }
 481 
 482         /* lie and say we're read-only */
 483         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 484 
 485         if (error) {
 486                 mutex_exit(&zfsdev_state_lock);
 487                 return (error);
 488         }
 489 
 490         if ((minor = zfsdev_minor_alloc()) == 0) {
 491                 dmu_objset_disown(os, FTAG);
 492                 mutex_exit(&zfsdev_state_lock);
 493                 return (SET_ERROR(ENXIO));
 494         }
 495 
 496         if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
 497                 dmu_objset_disown(os, FTAG);
 498                 mutex_exit(&zfsdev_state_lock);
 499                 return (SET_ERROR(EAGAIN));
 500         }
 501         (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
 502             (char *)name);
 503 
 504         (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
 505 
 506         if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
 507             minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 508                 ddi_soft_state_free(zfsdev_state, minor);
 509                 dmu_objset_disown(os, FTAG);
 510                 mutex_exit(&zfsdev_state_lock);
 511                 return (SET_ERROR(EAGAIN));
 512         }
 513 
 514         (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
 515 
 516         if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
 517             minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 518                 ddi_remove_minor_node(zfs_dip, chrbuf);
 519                 ddi_soft_state_free(zfsdev_state, minor);
 520                 dmu_objset_disown(os, FTAG);
 521                 mutex_exit(&zfsdev_state_lock);
 522                 return (SET_ERROR(EAGAIN));
 523         }
 524 
 525         zs = ddi_get_soft_state(zfsdev_state, minor);
 526         zs->zss_type = ZSST_ZVOL;
 527         zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 528         (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 529         zv->zv_min_bs = DEV_BSHIFT;
 530         zv->zv_minor = minor;
 531         zv->zv_objset = os;
 532         if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 533                 zv->zv_flags |= ZVOL_RDONLY;
 534         mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 535         avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 536             sizeof (rl_t), offsetof(rl_t, r_node));
 537         list_create(&zv->zv_extents, sizeof (zvol_extent_t),
 538             offsetof(zvol_extent_t, ze_node));
 539         /* get and cache the blocksize */
 540         error = dmu_object_info(os, ZVOL_OBJ, &doi);
 541         ASSERT(error == 0);
 542         zv->zv_volblocksize = doi.doi_data_block_size;
 543 
 544         if (spa_writeable(dmu_objset_spa(os))) {
 545                 if (zil_replay_disable)
 546                         zil_destroy(dmu_objset_zil(os), B_FALSE);
 547                 else
 548                         zil_replay(os, zv, zvol_replay_vector);
 549         }
 550         dmu_objset_disown(os, FTAG);
 551         zv->zv_objset = NULL;
 552 
 553         zvol_minors++;
 554 
 555         mutex_exit(&zfsdev_state_lock);
 556 
 557         return (0);
 558 }
 559 
 560 /*
 561  * Remove minor node for the specified volume.
 562  */
 563 static int
 564 zvol_remove_zv(zvol_state_t *zv)
 565 {
 566         char nmbuf[20];
 567         minor_t minor = zv->zv_minor;
 568 
 569         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 570         if (zv->zv_total_opens != 0)
 571                 return (SET_ERROR(EBUSY));
 572 
 573         (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
 574         ddi_remove_minor_node(zfs_dip, nmbuf);
 575 
 576         (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
 577         ddi_remove_minor_node(zfs_dip, nmbuf);
 578 
 579         avl_destroy(&zv->zv_znode.z_range_avl);
 580         mutex_destroy(&zv->zv_znode.z_range_lock);
 581 
 582         kmem_free(zv, sizeof (zvol_state_t));
 583 
 584         ddi_soft_state_free(zfsdev_state, minor);
 585 
 586         zvol_minors--;
 587         return (0);
 588 }
 589 
 590 int
 591 zvol_remove_minor(const char *name)
 592 {
 593         zvol_state_t *zv;
 594         int rc;
 595 
 596         mutex_enter(&zfsdev_state_lock);
 597         if ((zv = zvol_minor_lookup(name)) == NULL) {
 598                 mutex_exit(&zfsdev_state_lock);
 599                 return (SET_ERROR(ENXIO));
 600         }
 601         rc = zvol_remove_zv(zv);
 602         mutex_exit(&zfsdev_state_lock);
 603         return (rc);
 604 }
 605 
 606 int
 607 zvol_first_open(zvol_state_t *zv)
 608 {
 609         objset_t *os;
 610         uint64_t volsize;
 611         int error;
 612         uint64_t readonly;
 613 
 614         /* lie and say we're read-only */
 615         error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
 616             zvol_tag, &os);
 617         if (error)
 618                 return (error);
 619 
 620         zv->zv_objset = os;
 621         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 622         if (error) {
 623                 ASSERT(error == 0);
 624                 dmu_objset_disown(os, zvol_tag);
 625                 return (error);
 626         }
 627 
 628         error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 629         if (error) {
 630                 dmu_objset_disown(os, zvol_tag);
 631                 return (error);
 632         }
 633 
 634         zvol_size_changed(zv, volsize);
 635         zv->zv_zilog = zil_open(os, zvol_get_data);
 636 
 637         VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
 638             NULL) == 0);
 639         if (readonly || dmu_objset_is_snapshot(os) ||
 640             !spa_writeable(dmu_objset_spa(os)))
 641                 zv->zv_flags |= ZVOL_RDONLY;
 642         else
 643                 zv->zv_flags &= ~ZVOL_RDONLY;
 644         return (error);
 645 }
 646 
 647 void
 648 zvol_last_close(zvol_state_t *zv)
 649 {
 650         zil_close(zv->zv_zilog);
 651         zv->zv_zilog = NULL;
 652 
 653         dmu_buf_rele(zv->zv_dbuf, zvol_tag);
 654         zv->zv_dbuf = NULL;
 655 
 656         /*
 657          * Evict cached data
 658          */
 659         if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
 660             !(zv->zv_flags & ZVOL_RDONLY))
 661                 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 662         dmu_objset_evict_dbufs(zv->zv_objset);
 663 
 664         dmu_objset_disown(zv->zv_objset, zvol_tag);
 665         zv->zv_objset = NULL;
 666 }
 667 
 668 int
 669 zvol_prealloc(zvol_state_t *zv)
 670 {
 671         objset_t *os = zv->zv_objset;
 672         dmu_tx_t *tx;
 673         uint64_t refd, avail, usedobjs, availobjs;
 674         uint64_t resid = zv->zv_volsize;
 675         uint64_t off = 0;
 676 
 677         /* Check the space usage before attempting to allocate the space */
 678         dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
 679         if (avail < zv->zv_volsize)
 680                 return (SET_ERROR(ENOSPC));
 681 
 682         /* Free old extents if they exist */
 683         zvol_free_extents(zv);
 684 
 685         while (resid != 0) {
 686                 int error;
 687                 uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
 688 
 689                 tx = dmu_tx_create(os);
 690                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 691                 error = dmu_tx_assign(tx, TXG_WAIT);
 692                 if (error) {
 693                         dmu_tx_abort(tx);
 694                         (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
 695                         return (error);
 696                 }
 697                 dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
 698                 dmu_tx_commit(tx);
 699                 off += bytes;
 700                 resid -= bytes;
 701         }
 702         txg_wait_synced(dmu_objset_pool(os), 0);
 703 
 704         return (0);
 705 }
 706 
 707 static int
 708 zvol_update_volsize(objset_t *os, uint64_t volsize)
 709 {
 710         dmu_tx_t *tx;
 711         int error;
 712 
 713         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 714 
 715         tx = dmu_tx_create(os);
 716         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 717         error = dmu_tx_assign(tx, TXG_WAIT);
 718         if (error) {
 719                 dmu_tx_abort(tx);
 720                 return (error);
 721         }
 722 
 723         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 724             &volsize, tx);
 725         dmu_tx_commit(tx);
 726 
 727         if (error == 0)
 728                 error = dmu_free_long_range(os,
 729                     ZVOL_OBJ, volsize, DMU_OBJECT_END);
 730         return (error);
 731 }
 732 
 733 void
 734 zvol_remove_minors(const char *name)
 735 {
 736         zvol_state_t *zv;
 737         char *namebuf;
 738         minor_t minor;
 739 
 740         namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
 741         (void) strncpy(namebuf, name, strlen(name));
 742         (void) strcat(namebuf, "/");
 743         mutex_enter(&zfsdev_state_lock);
 744         for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 745 
 746                 zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 747                 if (zv == NULL)
 748                         continue;
 749                 if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
 750                         (void) zvol_remove_zv(zv);
 751         }
 752         kmem_free(namebuf, strlen(name) + 2);
 753 
 754         mutex_exit(&zfsdev_state_lock);
 755 }
 756 
 757 static int
 758 zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
 759 {
 760         uint64_t old_volsize = 0ULL;
 761         int error = 0;
 762 
 763         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 764 
 765         /*
 766          * Reinitialize the dump area to the new size. If we
 767          * failed to resize the dump area then restore it back to
 768          * its original size.  We must set the new volsize prior
 769          * to calling dumpvp_resize() to ensure that the devices'
 770          * size(9P) is not visible by the dump subsystem.
 771          */
 772         old_volsize = zv->zv_volsize;
 773         zvol_size_changed(zv, volsize);
 774 
 775         if (zv->zv_flags & ZVOL_DUMPIFIED) {
 776                 if ((error = zvol_dumpify(zv)) != 0 ||
 777                     (error = dumpvp_resize()) != 0) {
 778                         int dumpify_error;
 779 
 780                         (void) zvol_update_volsize(zv->zv_objset, old_volsize);
 781                         zvol_size_changed(zv, old_volsize);
 782                         dumpify_error = zvol_dumpify(zv);
 783                         error = dumpify_error ? dumpify_error : error;
 784                 }
 785         }
 786 
 787         /*
 788          * Generate a LUN expansion event.
 789          */
 790         if (error == 0) {
 791                 sysevent_id_t eid;
 792                 nvlist_t *attr;
 793                 char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 794 
 795                 (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
 796                     zv->zv_minor);
 797 
 798                 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 799                 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
 800 
 801                 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 802                     ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
 803 
 804                 nvlist_free(attr);
 805                 kmem_free(physpath, MAXPATHLEN);
 806         }
 807         return (error);
 808 }
 809 
 810 int
 811 zvol_set_volsize(const char *name, uint64_t volsize)
 812 {
 813         zvol_state_t *zv = NULL;
 814         objset_t *os;
 815         int error;
 816         dmu_object_info_t doi;
 817         uint64_t readonly;
 818         boolean_t owned = B_FALSE;
 819 
 820         error = dsl_prop_get_integer(name,
 821             zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
 822         if (error != 0)
 823                 return (error);
 824         if (readonly)
 825                 return (SET_ERROR(EROFS));
 826 
 827         mutex_enter(&zfsdev_state_lock);
 828         zv = zvol_minor_lookup(name);
 829 
 830         if (zv == NULL || zv->zv_objset == NULL) {
 831                 if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
 832                     FTAG, &os)) != 0) {
 833                         mutex_exit(&zfsdev_state_lock);
 834                         return (error);
 835                 }
 836                 owned = B_TRUE;
 837                 if (zv != NULL)
 838                         zv->zv_objset = os;
 839         } else {
 840                 os = zv->zv_objset;
 841         }
 842 
 843         if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
 844             (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
 845                 goto out;
 846 
 847         error = zvol_update_volsize(os, volsize);
 848 
 849         if (error == 0 && zv != NULL)
 850                 error = zvol_update_live_volsize(zv, volsize);
 851 out:
 852         if (owned) {
 853                 dmu_objset_disown(os, FTAG);
 854                 if (zv != NULL)
 855                         zv->zv_objset = NULL;
 856         }
 857         mutex_exit(&zfsdev_state_lock);
 858         return (error);
 859 }
 860 
 861 /*ARGSUSED*/
 862 int
 863 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
 864 {
 865         zvol_state_t *zv;
 866         int err = 0;
 867 
 868         mutex_enter(&zfsdev_state_lock);
 869 
 870         zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
 871         if (zv == NULL) {
 872                 mutex_exit(&zfsdev_state_lock);
 873                 return (SET_ERROR(ENXIO));
 874         }
 875 
 876         if (zv->zv_total_opens == 0)
 877                 err = zvol_first_open(zv);
 878         if (err) {
 879                 mutex_exit(&zfsdev_state_lock);
 880                 return (err);
 881         }
 882         if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 883                 err = SET_ERROR(EROFS);
 884                 goto out;
 885         }
 886         if (zv->zv_flags & ZVOL_EXCL) {
 887                 err = SET_ERROR(EBUSY);
 888                 goto out;
 889         }
 890         if (flag & FEXCL) {
 891                 if (zv->zv_total_opens != 0) {
 892                         err = SET_ERROR(EBUSY);
 893                         goto out;
 894                 }
 895                 zv->zv_flags |= ZVOL_EXCL;
 896         }
 897 
 898         if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
 899                 zv->zv_open_count[otyp]++;
 900                 zv->zv_total_opens++;
 901         }
 902         mutex_exit(&zfsdev_state_lock);
 903 
 904         return (err);
 905 out:
 906         if (zv->zv_total_opens == 0)
 907                 zvol_last_close(zv);
 908         mutex_exit(&zfsdev_state_lock);
 909         return (err);
 910 }
 911 
 912 /*ARGSUSED*/
 913 int
 914 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
 915 {
 916         minor_t minor = getminor(dev);
 917         zvol_state_t *zv;
 918         int error = 0;
 919 
 920         mutex_enter(&zfsdev_state_lock);
 921 
 922         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 923         if (zv == NULL) {
 924                 mutex_exit(&zfsdev_state_lock);
 925                 return (SET_ERROR(ENXIO));
 926         }
 927 
 928         if (zv->zv_flags & ZVOL_EXCL) {
 929                 ASSERT(zv->zv_total_opens == 1);
 930                 zv->zv_flags &= ~ZVOL_EXCL;
 931         }
 932 
 933         /*
 934          * If the open count is zero, this is a spurious close.
 935          * That indicates a bug in the kernel / DDI framework.
 936          */
 937         ASSERT(zv->zv_open_count[otyp] != 0);
 938         ASSERT(zv->zv_total_opens != 0);
 939 
 940         /*
 941          * You may get multiple opens, but only one close.
 942          */
 943         zv->zv_open_count[otyp]--;
 944         zv->zv_total_opens--;
 945 
 946         if (zv->zv_total_opens == 0)
 947                 zvol_last_close(zv);
 948 
 949         mutex_exit(&zfsdev_state_lock);
 950         return (error);
 951 }
 952 
 953 static void
 954 zvol_get_done(zgd_t *zgd, int error)
 955 {
 956         if (zgd->zgd_db)
 957                 dmu_buf_rele(zgd->zgd_db, zgd);
 958 
 959         zfs_range_unlock(zgd->zgd_rl);
 960 
 961         if (error == 0 && zgd->zgd_bp)
 962                 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 963 
 964         kmem_free(zgd, sizeof (zgd_t));
 965 }
 966 
 967 /*
 968  * Get data to generate a TX_WRITE intent log record.
 969  */
 970 static int
 971 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 972 {
 973         zvol_state_t *zv = arg;
 974         objset_t *os = zv->zv_objset;
 975         uint64_t object = ZVOL_OBJ;
 976         uint64_t offset = lr->lr_offset;
 977         uint64_t size = lr->lr_length;       /* length of user data */
 978         blkptr_t *bp = &lr->lr_blkptr;
 979         dmu_buf_t *db;
 980         zgd_t *zgd;
 981         int error;
 982 
 983         ASSERT(zio != NULL);
 984         ASSERT(size != 0);
 985 
 986         zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 987         zgd->zgd_zilog = zv->zv_zilog;
 988         zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 989 
 990         /*
 991          * Write records come in two flavors: immediate and indirect.
 992          * For small writes it's cheaper to store the data with the
 993          * log record (immediate); for large writes it's cheaper to
 994          * sync the data and get a pointer to it (indirect) so that
 995          * we don't have to write the data twice.
 996          */
 997         if (buf != NULL) {      /* immediate write */
 998                 error = dmu_read(os, object, offset, size, buf,
 999                     DMU_READ_NO_PREFETCH);
1000         } else {
1001                 size = zv->zv_volblocksize;
1002                 offset = P2ALIGN(offset, size);
1003                 error = dmu_buf_hold(os, object, offset, zgd, &db,
1004                     DMU_READ_NO_PREFETCH);
1005                 if (error == 0) {
1006                         blkptr_t *obp = dmu_buf_get_blkptr(db);
1007                         if (obp) {
1008                                 ASSERT(BP_IS_HOLE(bp));
1009                                 *bp = *obp;
1010                         }
1011 
1012                         zgd->zgd_db = db;
1013                         zgd->zgd_bp = bp;
1014 
1015                         ASSERT(db->db_offset == offset);
1016                         ASSERT(db->db_size == size);
1017 
1018                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
1019                             zvol_get_done, zgd);
1020 
1021                         if (error == 0)
1022                                 return (0);
1023                 }
1024         }
1025 
1026         zvol_get_done(zgd, error);
1027 
1028         return (error);
1029 }
1030 
1031 /*
1032  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1033  *
1034  * We store data in the log buffers if it's small enough.
1035  * Otherwise we will later flush the data out via dmu_sync().
1036  */
1037 ssize_t zvol_immediate_write_sz = 32768;
1038 
1039 static void
1040 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1041     boolean_t sync)
1042 {
1043         uint32_t blocksize = zv->zv_volblocksize;
1044         zilog_t *zilog = zv->zv_zilog;
1045         boolean_t slogging;
1046         ssize_t immediate_write_sz;
1047 
1048         if (zil_replaying(zilog, tx))
1049                 return;
1050 
1051         immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1052             ? 0 : zvol_immediate_write_sz;
1053 
1054         slogging = spa_has_slogs(zilog->zl_spa) &&
1055             (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
1056 
1057         while (resid) {
1058                 itx_t *itx;
1059                 lr_write_t *lr;
1060                 ssize_t len;
1061                 itx_wr_state_t write_state;
1062 
1063                 /*
1064                  * Unlike zfs_log_write() we can be called with
1065                  * upto DMU_MAX_ACCESS/2 (5MB) writes.
1066                  */
1067                 if (blocksize > immediate_write_sz && !slogging &&
1068                     resid >= blocksize && off % blocksize == 0) {
1069                         write_state = WR_INDIRECT; /* uses dmu_sync */
1070                         len = blocksize;
1071                 } else if (sync) {
1072                         write_state = WR_COPIED;
1073                         len = MIN(ZIL_MAX_LOG_DATA, resid);
1074                 } else {
1075                         write_state = WR_NEED_COPY;
1076                         len = MIN(ZIL_MAX_LOG_DATA, resid);
1077                 }
1078 
1079                 itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1080                     (write_state == WR_COPIED ? len : 0));
1081                 lr = (lr_write_t *)&itx->itx_lr;
1082                 if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
1083                     ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1084                         zil_itx_destroy(itx);
1085                         itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1086                         lr = (lr_write_t *)&itx->itx_lr;
1087                         write_state = WR_NEED_COPY;
1088                 }
1089 
1090                 itx->itx_wr_state = write_state;
1091                 if (write_state == WR_NEED_COPY)
1092                         itx->itx_sod += len;
1093                 lr->lr_foid = ZVOL_OBJ;
1094                 lr->lr_offset = off;
1095                 lr->lr_length = len;
1096                 lr->lr_blkoff = 0;
1097                 BP_ZERO(&lr->lr_blkptr);
1098 
1099                 itx->itx_private = zv;
1100                 itx->itx_sync = sync;
1101 
1102                 zil_itx_assign(zilog, itx, tx);
1103 
1104                 off += len;
1105                 resid -= len;
1106         }
1107 }
1108 
1109 static int
1110 zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1111     uint64_t size, boolean_t doread, boolean_t isdump)
1112 {
1113         vdev_disk_t *dvd;
1114         int c;
1115         int numerrors = 0;
1116 
1117         if (vd->vdev_ops == &vdev_mirror_ops ||
1118             vd->vdev_ops == &vdev_replacing_ops ||
1119             vd->vdev_ops == &vdev_spare_ops) {
1120                 for (c = 0; c < vd->vdev_children; c++) {
1121                         int err = zvol_dumpio_vdev(vd->vdev_child[c],
1122                             addr, offset, origoffset, size, doread, isdump);
1123                         if (err != 0) {
1124                                 numerrors++;
1125                         } else if (doread) {
1126                                 break;
1127                         }
1128                 }
1129         }
1130 
1131         if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1132                 return (numerrors < vd->vdev_children ? 0 : EIO);
1133 
1134         if (doread && !vdev_readable(vd))
1135                 return (SET_ERROR(EIO));
1136         else if (!doread && !vdev_writeable(vd))
1137                 return (SET_ERROR(EIO));
1138 
1139         if (vd->vdev_ops == &vdev_raidz_ops) {
1140                 return (vdev_raidz_physio(vd,
1141                     addr, size, offset, origoffset, doread, isdump));
1142         }
1143 
1144         offset += VDEV_LABEL_START_SIZE;
1145 
1146         if (ddi_in_panic() || isdump) {
1147                 ASSERT(!doread);
1148                 if (doread)
1149                         return (SET_ERROR(EIO));
1150                 dvd = vd->vdev_tsd;
1151                 ASSERT3P(dvd, !=, NULL);
1152                 return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1153                     lbtodb(size)));
1154         } else {
1155                 dvd = vd->vdev_tsd;
1156                 ASSERT3P(dvd, !=, NULL);
1157                 return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1158                     offset, doread ? B_READ : B_WRITE));
1159         }
1160 }
1161 
1162 static int
1163 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1164     boolean_t doread, boolean_t isdump)
1165 {
1166         vdev_t *vd;
1167         int error;
1168         zvol_extent_t *ze;
1169         spa_t *spa = dmu_objset_spa(zv->zv_objset);
1170 
1171         /* Must be sector aligned, and not stradle a block boundary. */
1172         if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1173             P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1174                 return (SET_ERROR(EINVAL));
1175         }
1176         ASSERT(size <= zv->zv_volblocksize);
1177 
1178         /* Locate the extent this belongs to */
1179         ze = list_head(&zv->zv_extents);
1180         while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1181                 offset -= ze->ze_nblks * zv->zv_volblocksize;
1182                 ze = list_next(&zv->zv_extents, ze);
1183         }
1184 
1185         if (ze == NULL)
1186                 return (SET_ERROR(EINVAL));
1187 
1188         if (!ddi_in_panic())
1189                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1190 
1191         vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1192         offset += DVA_GET_OFFSET(&ze->ze_dva);
1193         error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1194             size, doread, isdump);
1195 
1196         if (!ddi_in_panic())
1197                 spa_config_exit(spa, SCL_STATE, FTAG);
1198 
1199         return (error);
1200 }
1201 
1202 int
1203 zvol_strategy(buf_t *bp)
1204 {
1205         zfs_soft_state_t *zs = NULL;
1206         zvol_state_t *zv;
1207         uint64_t off, volsize;
1208         size_t resid;
1209         char *addr;
1210         objset_t *os;
1211         rl_t *rl;
1212         int error = 0;
1213         boolean_t doread = bp->b_flags & B_READ;
1214         boolean_t is_dumpified;
1215         boolean_t sync;
1216 
1217         if (getminor(bp->b_edev) == 0) {
1218                 error = SET_ERROR(EINVAL);
1219         } else {
1220                 zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
1221                 if (zs == NULL)
1222                         error = SET_ERROR(ENXIO);
1223                 else if (zs->zss_type != ZSST_ZVOL)
1224                         error = SET_ERROR(EINVAL);
1225         }
1226 
1227         if (error) {
1228                 bioerror(bp, error);
1229                 biodone(bp);
1230                 return (0);
1231         }
1232 
1233         zv = zs->zss_data;
1234 
1235         if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
1236                 bioerror(bp, EROFS);
1237                 biodone(bp);
1238                 return (0);
1239         }
1240 
1241         off = ldbtob(bp->b_blkno);
1242         volsize = zv->zv_volsize;
1243 
1244         os = zv->zv_objset;
1245         ASSERT(os != NULL);
1246 
1247         bp_mapin(bp);
1248         addr = bp->b_un.b_addr;
1249         resid = bp->b_bcount;
1250 
1251         if (resid > 0 && (off < 0 || off >= volsize)) {
1252                 bioerror(bp, EIO);
1253                 biodone(bp);
1254                 return (0);
1255         }
1256 
1257         is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1258         sync = ((!(bp->b_flags & B_ASYNC) &&
1259             !(zv->zv_flags & ZVOL_WCE)) ||
1260             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1261             !doread && !is_dumpified;
1262 
1263         /*
1264          * There must be no buffer changes when doing a dmu_sync() because
1265          * we can't change the data whilst calculating the checksum.
1266          */
1267         rl = zfs_range_lock(&zv->zv_znode, off, resid,
1268             doread ? RL_READER : RL_WRITER);
1269 
1270         while (resid != 0 && off < volsize) {
1271                 size_t size = MIN(resid, zvol_maxphys);
1272                 if (is_dumpified) {
1273                         size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1274                         error = zvol_dumpio(zv, addr, off, size,
1275                             doread, B_FALSE);
1276                 } else if (doread) {
1277                         error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1278                             DMU_READ_PREFETCH);
1279                 } else {
1280                         dmu_tx_t *tx = dmu_tx_create(os);
1281                         dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1282                         error = dmu_tx_assign(tx, TXG_WAIT);
1283                         if (error) {
1284                                 dmu_tx_abort(tx);
1285                         } else {
1286                                 dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1287                                 zvol_log_write(zv, tx, off, size, sync);
1288                                 dmu_tx_commit(tx);
1289                         }
1290                 }
1291                 if (error) {
1292                         /* convert checksum errors into IO errors */
1293                         if (error == ECKSUM)
1294                                 error = SET_ERROR(EIO);
1295                         break;
1296                 }
1297                 off += size;
1298                 addr += size;
1299                 resid -= size;
1300         }
1301         zfs_range_unlock(rl);
1302 
1303         if ((bp->b_resid = resid) == bp->b_bcount)
1304                 bioerror(bp, off > volsize ? EINVAL : error);
1305 
1306         if (sync)
1307                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1308         biodone(bp);
1309 
1310         return (0);
1311 }
1312 
1313 /*
1314  * Set the buffer count to the zvol maximum transfer.
1315  * Using our own routine instead of the default minphys()
1316  * means that for larger writes we write bigger buffers on X86
1317  * (128K instead of 56K) and flush the disk write cache less often
1318  * (every zvol_maxphys - currently 1MB) instead of minphys (currently
1319  * 56K on X86 and 128K on sparc).
1320  */
1321 void
1322 zvol_minphys(struct buf *bp)
1323 {
1324         if (bp->b_bcount > zvol_maxphys)
1325                 bp->b_bcount = zvol_maxphys;
1326 }
1327 
1328 int
1329 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1330 {
1331         minor_t minor = getminor(dev);
1332         zvol_state_t *zv;
1333         int error = 0;
1334         uint64_t size;
1335         uint64_t boff;
1336         uint64_t resid;
1337 
1338         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1339         if (zv == NULL)
1340                 return (SET_ERROR(ENXIO));
1341 
1342         if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
1343                 return (SET_ERROR(EINVAL));
1344 
1345         boff = ldbtob(blkno);
1346         resid = ldbtob(nblocks);
1347 
1348         VERIFY3U(boff + resid, <=, zv->zv_volsize);
1349 
1350         while (resid) {
1351                 size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1352                 error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1353                 if (error)
1354                         break;
1355                 boff += size;
1356                 addr += size;
1357                 resid -= size;
1358         }
1359 
1360         return (error);
1361 }
1362 
1363 /*ARGSUSED*/
1364 int
1365 zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1366 {
1367         minor_t minor = getminor(dev);
1368         zvol_state_t *zv;
1369         uint64_t volsize;
1370         rl_t *rl;
1371         int error = 0;
1372 
1373         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1374         if (zv == NULL)
1375                 return (SET_ERROR(ENXIO));
1376 
1377         volsize = zv->zv_volsize;
1378         if (uio->uio_resid > 0 &&
1379             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1380                 return (SET_ERROR(EIO));
1381 
1382         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1383                 error = physio(zvol_strategy, NULL, dev, B_READ,
1384                     zvol_minphys, uio);
1385                 return (error);
1386         }
1387 
1388         rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1389             RL_READER);
1390         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1391                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1392 
1393                 /* don't read past the end */
1394                 if (bytes > volsize - uio->uio_loffset)
1395                         bytes = volsize - uio->uio_loffset;
1396 
1397                 error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1398                 if (error) {
1399                         /* convert checksum errors into IO errors */
1400                         if (error == ECKSUM)
1401                                 error = SET_ERROR(EIO);
1402                         break;
1403                 }
1404         }
1405         zfs_range_unlock(rl);
1406         return (error);
1407 }
1408 
1409 /*ARGSUSED*/
1410 int
1411 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1412 {
1413         minor_t minor = getminor(dev);
1414         zvol_state_t *zv;
1415         uint64_t volsize;
1416         rl_t *rl;
1417         int error = 0;
1418         boolean_t sync;
1419 
1420         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1421         if (zv == NULL)
1422                 return (SET_ERROR(ENXIO));
1423 
1424         volsize = zv->zv_volsize;
1425         if (uio->uio_resid > 0 &&
1426             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1427                 return (SET_ERROR(EIO));
1428 
1429         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1430                 error = physio(zvol_strategy, NULL, dev, B_WRITE,
1431                     zvol_minphys, uio);
1432                 return (error);
1433         }
1434 
1435         sync = !(zv->zv_flags & ZVOL_WCE) ||
1436             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1437 
1438         rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1439             RL_WRITER);
1440         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1441                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1442                 uint64_t off = uio->uio_loffset;
1443                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1444 
1445                 if (bytes > volsize - off)   /* don't write past the end */
1446                         bytes = volsize - off;
1447 
1448                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1449                 error = dmu_tx_assign(tx, TXG_WAIT);
1450                 if (error) {
1451                         dmu_tx_abort(tx);
1452                         break;
1453                 }
1454                 error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1455                 if (error == 0)
1456                         zvol_log_write(zv, tx, off, bytes, sync);
1457                 dmu_tx_commit(tx);
1458 
1459                 if (error)
1460                         break;
1461         }
1462         zfs_range_unlock(rl);
1463         if (sync)
1464                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1465         return (error);
1466 }
1467 
1468 int
1469 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1470 {
1471         struct uuid uuid = EFI_RESERVED;
1472         efi_gpe_t gpe = { 0 };
1473         uint32_t crc;
1474         dk_efi_t efi;
1475         int length;
1476         char *ptr;
1477 
1478         if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1479                 return (SET_ERROR(EFAULT));
1480         ptr = (char *)(uintptr_t)efi.dki_data_64;
1481         length = efi.dki_length;
1482         /*
1483          * Some clients may attempt to request a PMBR for the
1484          * zvol.  Currently this interface will return EINVAL to
1485          * such requests.  These requests could be supported by
1486          * adding a check for lba == 0 and consing up an appropriate
1487          * PMBR.
1488          */
1489         if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1490                 return (SET_ERROR(EINVAL));
1491 
1492         gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1493         gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1494         UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1495 
1496         if (efi.dki_lba == 1) {
1497                 efi_gpt_t gpt = { 0 };
1498 
1499                 gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1500                 gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1501                 gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1502                 gpt.efi_gpt_MyLBA = LE_64(1ULL);
1503                 gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1504                 gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1505                 gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1506                 gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1507                 gpt.efi_gpt_SizeOfPartitionEntry =
1508                     LE_32(sizeof (efi_gpe_t));
1509                 CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1510                 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1511                 CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1512                 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1513                 if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1514                     flag))
1515                         return (SET_ERROR(EFAULT));
1516                 ptr += sizeof (gpt);
1517                 length -= sizeof (gpt);
1518         }
1519         if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1520             length), flag))
1521                 return (SET_ERROR(EFAULT));
1522         return (0);
1523 }
1524 
1525 /*
1526  * BEGIN entry points to allow external callers access to the volume.
1527  */
1528 /*
1529  * Return the volume parameters needed for access from an external caller.
1530  * These values are invariant as long as the volume is held open.
1531  */
1532 int
1533 zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1534     uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1535     void **rl_hdl, void **bonus_hdl)
1536 {
1537         zvol_state_t *zv;
1538 
1539         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1540         if (zv == NULL)
1541                 return (SET_ERROR(ENXIO));
1542         if (zv->zv_flags & ZVOL_DUMPIFIED)
1543                 return (SET_ERROR(ENXIO));
1544 
1545         ASSERT(blksize && max_xfer_len && minor_hdl &&
1546             objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1547 
1548         *blksize = zv->zv_volblocksize;
1549         *max_xfer_len = (uint64_t)zvol_maxphys;
1550         *minor_hdl = zv;
1551         *objset_hdl = zv->zv_objset;
1552         *zil_hdl = zv->zv_zilog;
1553         *rl_hdl = &zv->zv_znode;
1554         *bonus_hdl = zv->zv_dbuf;
1555         return (0);
1556 }
1557 
1558 /*
1559  * Return the current volume size to an external caller.
1560  * The size can change while the volume is open.
1561  */
1562 uint64_t
1563 zvol_get_volume_size(void *minor_hdl)
1564 {
1565         zvol_state_t *zv = minor_hdl;
1566 
1567         return (zv->zv_volsize);
1568 }
1569 
1570 /*
1571  * Return the current WCE setting to an external caller.
1572  * The WCE setting can change while the volume is open.
1573  */
1574 int
1575 zvol_get_volume_wce(void *minor_hdl)
1576 {
1577         zvol_state_t *zv = minor_hdl;
1578 
1579         return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1580 }
1581 
1582 /*
1583  * Entry point for external callers to zvol_log_write
1584  */
1585 void
1586 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1587     boolean_t sync)
1588 {
1589         zvol_state_t *zv = minor_hdl;
1590 
1591         zvol_log_write(zv, tx, off, resid, sync);
1592 }
1593 /*
1594  * END entry points to allow external callers access to the volume.
1595  */
1596 
1597 /*
1598  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1599  */
1600 static void
1601 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1602     boolean_t sync)
1603 {
1604         itx_t *itx;
1605         lr_truncate_t *lr;
1606         zilog_t *zilog = zv->zv_zilog;
1607 
1608         if (zil_replaying(zilog, tx))
1609                 return;
1610 
1611         itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1612         lr = (lr_truncate_t *)&itx->itx_lr;
1613         lr->lr_foid = ZVOL_OBJ;
1614         lr->lr_offset = off;
1615         lr->lr_length = len;
1616 
1617         itx->itx_sync = sync;
1618         zil_itx_assign(zilog, itx, tx);
1619 }
1620 
1621 /*
1622  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1623  * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1624  */
1625 /*ARGSUSED*/
1626 int
1627 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1628 {
1629         zvol_state_t *zv;
1630         struct dk_callback *dkc;
1631         int error = 0;
1632         rl_t *rl;
1633 
1634         mutex_enter(&zfsdev_state_lock);
1635 
1636         zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1637 
1638         if (zv == NULL) {
1639                 mutex_exit(&zfsdev_state_lock);
1640                 return (SET_ERROR(ENXIO));
1641         }
1642         ASSERT(zv->zv_total_opens > 0);
1643 
1644         switch (cmd) {
1645 
1646         case DKIOCINFO:
1647         {
1648                 struct dk_cinfo dki;
1649 
1650                 bzero(&dki, sizeof (dki));
1651                 (void) strcpy(dki.dki_cname, "zvol");
1652                 (void) strcpy(dki.dki_dname, "zvol");
1653                 dki.dki_ctype = DKC_UNKNOWN;
1654                 dki.dki_unit = getminor(dev);
1655                 dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
1656                 mutex_exit(&zfsdev_state_lock);
1657                 if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1658                         error = SET_ERROR(EFAULT);
1659                 return (error);
1660         }
1661 
1662         case DKIOCGMEDIAINFO:
1663         {
1664                 struct dk_minfo dkm;
1665 
1666                 bzero(&dkm, sizeof (dkm));
1667                 dkm.dki_lbsize = 1U << zv->zv_min_bs;
1668                 dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1669                 dkm.dki_media_type = DK_UNKNOWN;
1670                 mutex_exit(&zfsdev_state_lock);
1671                 if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1672                         error = SET_ERROR(EFAULT);
1673                 return (error);
1674         }
1675 
1676         case DKIOCGMEDIAINFOEXT:
1677         {
1678                 struct dk_minfo_ext dkmext;
1679 
1680                 bzero(&dkmext, sizeof (dkmext));
1681                 dkmext.dki_lbsize = 1U << zv->zv_min_bs;
1682                 dkmext.dki_pbsize = zv->zv_volblocksize;
1683                 dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1684                 dkmext.dki_media_type = DK_UNKNOWN;
1685                 mutex_exit(&zfsdev_state_lock);
1686                 if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1687                         error = SET_ERROR(EFAULT);
1688                 return (error);
1689         }
1690 
1691         case DKIOCGETEFI:
1692         {
1693                 uint64_t vs = zv->zv_volsize;
1694                 uint8_t bs = zv->zv_min_bs;
1695 
1696                 mutex_exit(&zfsdev_state_lock);
1697                 error = zvol_getefi((void *)arg, flag, vs, bs);
1698                 return (error);
1699         }
1700 
1701         case DKIOCFLUSHWRITECACHE:
1702                 dkc = (struct dk_callback *)arg;
1703                 mutex_exit(&zfsdev_state_lock);
1704                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1705                 if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1706                         (*dkc->dkc_callback)(dkc->dkc_cookie, error);
1707                         error = 0;
1708                 }
1709                 return (error);
1710 
1711         case DKIOCGETWCE:
1712         {
1713                 int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1714                 if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1715                     flag))
1716                         error = SET_ERROR(EFAULT);
1717                 break;
1718         }
1719         case DKIOCSETWCE:
1720         {
1721                 int wce;
1722                 if (ddi_copyin((void *)arg, &wce, sizeof (int),
1723                     flag)) {
1724                         error = SET_ERROR(EFAULT);
1725                         break;
1726                 }
1727                 if (wce) {
1728                         zv->zv_flags |= ZVOL_WCE;
1729                         mutex_exit(&zfsdev_state_lock);
1730                 } else {
1731                         zv->zv_flags &= ~ZVOL_WCE;
1732                         mutex_exit(&zfsdev_state_lock);
1733                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1734                 }
1735                 return (0);
1736         }
1737 
1738         case DKIOCGGEOM:
1739         case DKIOCGVTOC:
1740                 /*
1741                  * commands using these (like prtvtoc) expect ENOTSUP
1742                  * since we're emulating an EFI label
1743                  */
1744                 error = SET_ERROR(ENOTSUP);
1745                 break;
1746 
1747         case DKIOCDUMPINIT:
1748                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1749                     RL_WRITER);
1750                 error = zvol_dumpify(zv);
1751                 zfs_range_unlock(rl);
1752                 break;
1753 
1754         case DKIOCDUMPFINI:
1755                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1756                         break;
1757                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1758                     RL_WRITER);
1759                 error = zvol_dump_fini(zv);
1760                 zfs_range_unlock(rl);
1761                 break;
1762 
1763         case DKIOCFREE:
1764         {
1765                 dkioc_free_t df;
1766                 dmu_tx_t *tx;
1767 
1768                 if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1769                         error = SET_ERROR(EFAULT);
1770                         break;
1771                 }
1772 
1773                 /*
1774                  * Apply Postel's Law to length-checking.  If they overshoot,
1775                  * just blank out until the end, if there's a need to blank
1776                  * out anything.
1777                  */
1778                 if (df.df_start >= zv->zv_volsize)
1779                         break;  /* No need to do anything... */
1780                 if (df.df_start + df.df_length > zv->zv_volsize)
1781                         df.df_length = DMU_OBJECT_END;
1782 
1783                 rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1784                     RL_WRITER);
1785                 tx = dmu_tx_create(zv->zv_objset);
1786                 error = dmu_tx_assign(tx, TXG_WAIT);
1787                 if (error != 0) {
1788                         dmu_tx_abort(tx);
1789                 } else {
1790                         zvol_log_truncate(zv, tx, df.df_start,
1791                             df.df_length, B_TRUE);
1792                         dmu_tx_commit(tx);
1793                         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1794                             df.df_start, df.df_length);
1795                 }
1796 
1797                 zfs_range_unlock(rl);
1798 
1799                 if (error == 0) {
1800                         /*
1801                          * If the write-cache is disabled or 'sync' property
1802                          * is set to 'always' then treat this as a synchronous
1803                          * operation (i.e. commit to zil).
1804                          */
1805                         if (!(zv->zv_flags & ZVOL_WCE) ||
1806                             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
1807                                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1808 
1809                         /*
1810                          * If the caller really wants synchronous writes, and
1811                          * can't wait for them, don't return until the write
1812                          * is done.
1813                          */
1814                         if (df.df_flags & DF_WAIT_SYNC) {
1815                                 txg_wait_synced(
1816                                     dmu_objset_pool(zv->zv_objset), 0);
1817                         }
1818                 }
1819                 break;
1820         }
1821 
1822         default:
1823                 error = SET_ERROR(ENOTTY);
1824                 break;
1825 
1826         }
1827         mutex_exit(&zfsdev_state_lock);
1828         return (error);
1829 }
1830 
1831 int
1832 zvol_busy(void)
1833 {
1834         return (zvol_minors != 0);
1835 }
1836 
/*
 * Set up the global state shared by the zvol code: initialize the
 * zfsdev_state soft-state anchor for zfs_soft_state_t-sized entries and
 * create zfsdev_state_lock.  A failure to initialize the soft state trips
 * the VERIFY.
 */
void
zvol_init(void)
{
        VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
            1) == 0);
        mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
}
1844 
/*
 * Tear down the global state created by zvol_init(): destroy
 * zfsdev_state_lock, then release the zfsdev_state soft-state anchor.
 */
void
zvol_fini(void)
{
        mutex_destroy(&zfsdev_state_lock);
        ddi_soft_state_fini(&zfsdev_state);
}
1851 
1852 /*ARGSUSED*/
1853 static int
1854 zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1855 {
1856         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1857 
1858         if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1859                 return (1);
1860         return (0);
1861 }
1862 
1863 /*ARGSUSED*/
1864 static void
1865 zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1866 {
1867         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1868 
1869         spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
1870 }
1871 
1872 static int
1873 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1874 {
1875         dmu_tx_t *tx;
1876         int error;
1877         objset_t *os = zv->zv_objset;
1878         spa_t *spa = dmu_objset_spa(os);
1879         vdev_t *vd = spa->spa_root_vdev;
1880         nvlist_t *nv = NULL;
1881         uint64_t version = spa_version(spa);
1882         enum zio_checksum checksum;
1883 
1884         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
1885         ASSERT(vd->vdev_ops == &vdev_root_ops);
1886 
1887         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1888             DMU_OBJECT_END);
1889         /* wait for dmu_free_long_range to actually free the blocks */
1890         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1891 
1892         /*
1893          * If the pool on which the dump device is being initialized has more
1894          * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
1895          * enabled.  If so, bump that feature's counter to indicate that the
1896          * feature is active. We also check the vdev type to handle the
1897          * following case:
1898          *   # zpool create test raidz disk1 disk2 disk3
1899          *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
1900          *   the raidz vdev itself has 3 children.
1901          */
1902         if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
1903                 if (!spa_feature_is_enabled(spa,
1904                     SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1905                         return (SET_ERROR(ENOTSUP));
1906                 (void) dsl_sync_task(spa_name(spa),
1907                     zfs_mvdev_dump_feature_check,
1908                     zfs_mvdev_dump_activate_feature_sync, NULL, 2);
1909         }
1910 
1911         tx = dmu_tx_create(os);
1912         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1913         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
1914         error = dmu_tx_assign(tx, TXG_WAIT);
1915         if (error) {
1916                 dmu_tx_abort(tx);
1917                 return (error);
1918         }
1919 
1920         /*
1921          * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
1922          * function.  Otherwise, use the old default -- OFF.
1923          */
1924         checksum = spa_feature_is_active(spa,
1925             SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
1926             ZIO_CHECKSUM_OFF;
1927 
1928         /*
1929          * If we are resizing the dump device then we only need to
1930          * update the refreservation to match the newly updated
1931          * zvolsize. Otherwise, we save off the original state of the
1932          * zvol so that we can restore them if the zvol is ever undumpified.
1933          */
1934         if (resize) {
1935                 error = zap_update(os, ZVOL_ZAP_OBJ,
1936                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1937                     &zv->zv_volsize, tx);
1938         } else {
1939                 uint64_t checksum, compress, refresrv, vbs, dedup;
1940 
1941                 error = dsl_prop_get_integer(zv->zv_name,
1942                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
1943                 error = error ? error : dsl_prop_get_integer(zv->zv_name,
1944                     zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
1945                 error = error ? error : dsl_prop_get_integer(zv->zv_name,
1946                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
1947                 error = error ? error : dsl_prop_get_integer(zv->zv_name,
1948                     zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
1949                 if (version >= SPA_VERSION_DEDUP) {
1950                         error = error ? error :
1951                             dsl_prop_get_integer(zv->zv_name,
1952                             zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
1953                 }
1954 
1955                 error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1956                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
1957                     &compress, tx);
1958                 error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1959                     zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
1960                 error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1961                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1962                     &refresrv, tx);
1963                 error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1964                     zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
1965                     &vbs, tx);
1966                 error = error ? error : dmu_object_set_blocksize(
1967                     os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
1968                 if (version >= SPA_VERSION_DEDUP) {
1969                         error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1970                             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
1971                             &dedup, tx);
1972                 }
1973                 if (error == 0)
1974                         zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
1975         }
1976         dmu_tx_commit(tx);
1977 
1978         /*
1979          * We only need update the zvol's property if we are initializing
1980          * the dump area for the first time.
1981          */
1982         if (!resize) {
1983                 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1984                 VERIFY(nvlist_add_uint64(nv,
1985                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
1986                 VERIFY(nvlist_add_uint64(nv,
1987                     zfs_prop_to_name(ZFS_PROP_COMPRESSION),
1988                     ZIO_COMPRESS_OFF) == 0);
1989                 VERIFY(nvlist_add_uint64(nv,
1990                     zfs_prop_to_name(ZFS_PROP_CHECKSUM),
1991                     checksum) == 0);
1992                 if (version >= SPA_VERSION_DEDUP) {
1993                         VERIFY(nvlist_add_uint64(nv,
1994                             zfs_prop_to_name(ZFS_PROP_DEDUP),
1995                             ZIO_CHECKSUM_OFF) == 0);
1996                 }
1997 
1998                 error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
1999                     nv, NULL);
2000                 nvlist_free(nv);
2001 
2002                 if (error)
2003                         return (error);
2004         }
2005 
2006         /* Allocate the space for the dump */
2007         error = zvol_prealloc(zv);
2008         return (error);
2009 }
2010 
2011 static int
2012 zvol_dumpify(zvol_state_t *zv)
2013 {
2014         int error = 0;
2015         uint64_t dumpsize = 0;
2016         dmu_tx_t *tx;
2017         objset_t *os = zv->zv_objset;
2018 
2019         if (zv->zv_flags & ZVOL_RDONLY)
2020                 return (SET_ERROR(EROFS));
2021 
2022         if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2023             8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2024                 boolean_t resize = (dumpsize > 0);
2025 
2026                 if ((error = zvol_dump_init(zv, resize)) != 0) {
2027                         (void) zvol_dump_fini(zv);
2028                         return (error);
2029                 }
2030         }
2031 
2032         /*
2033          * Build up our lba mapping.
2034          */
2035         error = zvol_get_lbas(zv);
2036         if (error) {
2037                 (void) zvol_dump_fini(zv);
2038                 return (error);
2039         }
2040 
2041         tx = dmu_tx_create(os);
2042         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2043         error = dmu_tx_assign(tx, TXG_WAIT);
2044         if (error) {
2045                 dmu_tx_abort(tx);
2046                 (void) zvol_dump_fini(zv);
2047                 return (error);
2048         }
2049 
2050         zv->zv_flags |= ZVOL_DUMPIFIED;
2051         error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2052             &zv->zv_volsize, tx);
2053         dmu_tx_commit(tx);
2054 
2055         if (error) {
2056                 (void) zvol_dump_fini(zv);
2057                 return (error);
2058         }
2059 
2060         txg_wait_synced(dmu_objset_pool(os), 0);
2061         return (0);
2062 }
2063 
2064 static int
2065 zvol_dump_fini(zvol_state_t *zv)
2066 {
2067         dmu_tx_t *tx;
2068         objset_t *os = zv->zv_objset;
2069         nvlist_t *nv;
2070         int error = 0;
2071         uint64_t checksum, compress, refresrv, vbs, dedup;
2072         uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2073 
2074         /*
2075          * Attempt to restore the zvol back to its pre-dumpified state.
2076          * This is a best-effort attempt as it's possible that not all
2077          * of these properties were initialized during the dumpify process
2078          * (i.e. error during zvol_dump_init).
2079          */
2080 
2081         tx = dmu_tx_create(os);
2082         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2083         error = dmu_tx_assign(tx, TXG_WAIT);
2084         if (error) {
2085                 dmu_tx_abort(tx);
2086                 return (error);
2087         }
2088         (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2089         dmu_tx_commit(tx);
2090 
2091         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2092             zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2093         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2094             zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2095         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2096             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2097         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2098             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2099 
2100         VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2101         (void) nvlist_add_uint64(nv,
2102             zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2103         (void) nvlist_add_uint64(nv,
2104             zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2105         (void) nvlist_add_uint64(nv,
2106             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2107         if (version >= SPA_VERSION_DEDUP &&
2108             zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2109             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2110                 (void) nvlist_add_uint64(nv,
2111                     zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2112         }
2113         (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2114             nv, NULL);
2115         nvlist_free(nv);
2116 
2117         zvol_free_extents(zv);
2118         zv->zv_flags &= ~ZVOL_DUMPIFIED;
2119         (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2120         /* wait for dmu_free_long_range to actually free the blocks */
2121         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2122         tx = dmu_tx_create(os);
2123         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2124         error = dmu_tx_assign(tx, TXG_WAIT);
2125         if (error) {
2126                 dmu_tx_abort(tx);
2127                 return (error);
2128         }
2129         if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2130                 zv->zv_volblocksize = vbs;
2131         dmu_tx_commit(tx);
2132 
2133         return (0);
2134 }