Print this page
OS-7125 Need mitigation of L1TF (CVE-2018-3646)
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>


   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  29  * Copyright (c) 2014 Integros [integros.com]
  30  * Copyright (c) 2019, Joyent, Inc.
  31  */
  32 
  33 /*
  34  * ZFS volume emulation driver.
  35  *
  36  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  37  * Volumes are accessed through the symbolic links named:
  38  *
  39  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  40  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  41  *
  42  * These links are created by the /dev filesystem (sdev_zvolops.c).
  43  * Volumes are persistent through reboot.  No user command needs to be
  44  * run before opening and using a device.
  45  */
  46 
  47 #include <sys/types.h>
  48 #include <sys/param.h>


  73 #include <sys/dirent.h>
  74 #include <sys/policy.h>
  75 #include <sys/fs/zfs.h>
  76 #include <sys/zfs_ioctl.h>
  77 #include <sys/mkdev.h>
  78 #include <sys/zil.h>
  79 #include <sys/refcount.h>
  80 #include <sys/zfs_znode.h>
  81 #include <sys/zfs_rlock.h>
  82 #include <sys/vdev_disk.h>
  83 #include <sys/vdev_impl.h>
  84 #include <sys/vdev_raidz.h>
  85 #include <sys/zvol.h>
  86 #include <sys/dumphdr.h>
  87 #include <sys/zil_impl.h>
  88 #include <sys/dbuf.h>
  89 #include <sys/dmu_tx.h>
  90 #include <sys/zfeature.h>
  91 #include <sys/zio_checksum.h>
  92 #include <sys/zil_impl.h>

  93 #include <sys/dkioc_free_util.h>
  94 #include <sys/zfs_rlock.h>
  95 
  96 #include "zfs_namecheck.h"
  97 
  98 void *zfsdev_state;     /* NOTE(review): presumably the soft-state anchor; confirm */
  99 static char *zvol_tag = "zvol_tag";
 100 
     /* NOTE(review): "dumpsize" is presumably a property/key name -- confirm use. */
 101 #define ZVOL_DUMPSIZE           "dumpsize"
 102 
 103 /*
 104  * This lock protects the zfsdev_state structure from being modified
 105  * while it's being used, e.g. an open that comes in before a create
 106  * finishes.  It also protects temporary opens of the dataset so that,
 107  * e.g., an open doesn't get a spurious EBUSY.
 108  */
 109 kmutex_t zfsdev_state_lock;
     /* zvol_busy() reports the driver busy while this is nonzero. */
 110 static uint32_t zvol_minors;
 111 
 112 typedef struct zvol_extent {


1255 
1256         os = zv->zv_objset;
1257         ASSERT(os != NULL);
1258 
1259         bp_mapin(bp);
1260         addr = bp->b_un.b_addr;
1261         resid = bp->b_bcount;
1262 
1263         if (resid > 0 && (off < 0 || off >= volsize)) {
1264                 bioerror(bp, EIO);
1265                 biodone(bp);
1266                 return (0);
1267         }
1268 
1269         is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1270         sync = ((!(bp->b_flags & B_ASYNC) &&
1271             !(zv->zv_flags & ZVOL_WCE)) ||
1272             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1273             !doread && !is_dumpified;
1274 


1275         /*
1276          * There must be no buffer changes when doing a dmu_sync() because
1277          * we can't change the data whilst calculating the checksum.
1278          */
1279         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid,
1280             doread ? RL_READER : RL_WRITER);
1281 
1282         while (resid != 0 && off < volsize) {
1283                 size_t size = MIN(resid, zvol_maxphys);
1284                 if (is_dumpified) {
1285                         size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1286                         error = zvol_dumpio(zv, addr, off, size,
1287                             doread, B_FALSE);
1288                 } else if (doread) {
1289                         error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1290                             DMU_READ_PREFETCH);
1291                 } else {
1292                         dmu_tx_t *tx = dmu_tx_create(os);
1293                         dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1294                         error = dmu_tx_assign(tx, TXG_WAIT);


1302                 }
1303                 if (error) {
1304                         /* convert checksum errors into IO errors */
1305                         if (error == ECKSUM)
1306                                 error = SET_ERROR(EIO);
1307                         break;
1308                 }
1309                 off += size;
1310                 addr += size;
1311                 resid -= size;
1312         }
1313         rangelock_exit(lr);
1314 
1315         if ((bp->b_resid = resid) == bp->b_bcount)
1316                 bioerror(bp, off > volsize ? EINVAL : error);
1317 
1318         if (sync)
1319                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1320         biodone(bp);
1321 


1322         return (0);
1323 }
1324 
 1325 /*
 1326  * Clamp the buffer's transfer count to the zvol maximum transfer size:
 1327  * if bp->b_bcount exceeds zvol_maxphys it is cut down to zvol_maxphys,
 1328  * otherwise it is left unchanged.  Using our own routine instead of the
 1329  * default minphys() means that larger writes use bigger buffers on X86
 1330  * (128K instead of 56K) and flush the disk write cache less often (every
 1331  * zvol_maxphys - currently 1MB - instead of every minphys: currently 56K
       * on X86 and 128K on sparc).
       *
       * NOTE(review): the sizes quoted above are carried over from the
       * earlier wording and cannot be checked from this function; the 128K
       * and 1MB figures look inconsistent -- confirm against zvol_maxphys.
 1332  */
 1333 void
 1334 zvol_minphys(struct buf *bp)
 1335 {
 1336         if (bp->b_bcount > zvol_maxphys)
 1337                 bp->b_bcount = zvol_maxphys;
 1338 }
1339 
1340 int
1341 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)


1379         minor_t minor = getminor(dev);
1380         zvol_state_t *zv;
1381         uint64_t volsize;
1382         int error = 0;
1383 
1384         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1385         if (zv == NULL)
1386                 return (SET_ERROR(ENXIO));
1387 
1388         volsize = zv->zv_volsize;
1389         if (uio->uio_resid > 0 &&
1390             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1391                 return (SET_ERROR(EIO));
1392 
1393         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1394                 error = physio(zvol_strategy, NULL, dev, B_READ,
1395                     zvol_minphys, uio);
1396                 return (error);
1397         }
1398 


1399         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
1400             uio->uio_loffset, uio->uio_resid, RL_READER);
1401         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1402                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1403 
1404                 /* don't read past the end */
1405                 if (bytes > volsize - uio->uio_loffset)
1406                         bytes = volsize - uio->uio_loffset;
1407 
1408                 error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1409                 if (error) {
1410                         /* convert checksum errors into IO errors */
1411                         if (error == ECKSUM)
1412                                 error = SET_ERROR(EIO);
1413                         break;
1414                 }
1415         }
1416         rangelock_exit(lr);
1417 


1418         return (error);
1419 }
1420 
 1421 /*ARGSUSED*/
 1422 int
 1423 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 1424 {
              /*
               * Write the caller's uio into the zvol selected by dev's minor
               * number.  Returns 0 or an errno: ENXIO when no zvol soft state
               * exists for the minor, EIO when a non-empty request starts
               * below offset 0 or at/after the volume size, otherwise whatever
               * physio(), dmu_tx_assign() or dmu_write_uio_dnode() reports.
               * cr is not used.
               */
 1425         minor_t minor = getminor(dev);
 1426         zvol_state_t *zv;
 1427         uint64_t volsize;
 1428         int error = 0;
 1429         boolean_t sync;
 1430 
 1431         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 1432         if (zv == NULL)
 1433                 return (SET_ERROR(ENXIO));
 1434 
              /* volsize is sampled once and bounds every chunk below. */
 1435         volsize = zv->zv_volsize;
 1436         if (uio->uio_resid > 0 &&
 1437             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
 1438                 return (SET_ERROR(EIO));
 1439 
              /* Volumes flagged ZVOL_DUMPIFIED go through physio() instead. */
 1440         if (zv->zv_flags & ZVOL_DUMPIFIED) {
 1441                 error = physio(zvol_strategy, NULL, dev, B_WRITE,
 1442                     zvol_minphys, uio);
 1443                 return (error);
 1444         }
 1445 


              /*
               * A ZIL commit is needed before returning when the ZVOL_WCE
               * flag is clear, or when the objset's sync setting is
               * ZFS_SYNC_ALWAYS regardless of that flag.
               */
 1446         sync = !(zv->zv_flags & ZVOL_WCE) ||
 1447             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
 1448 
              /*
               * The requested byte range is held as a writer for the whole
               * transfer.  The data is moved in chunks of at most
               * DMU_MAX_ACCESS >> 1 bytes, one DMU transaction per chunk; the
               * loop stops once the uio is drained or its offset reaches
               * volsize.
               */
 1449         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
 1450             uio->uio_loffset, uio->uio_resid, RL_WRITER);
 1451         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
 1452                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
 1453                 uint64_t off = uio->uio_loffset;
 1454                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 1455 
 1456                 if (bytes > volsize - off)   /* don't write past the end */
 1457                         bytes = volsize - off;
 1458 
 1459                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 1460                 error = dmu_tx_assign(tx, TXG_WAIT);
 1461                 if (error) {
                              /* A tx that failed to assign is aborted. */
 1462                         dmu_tx_abort(tx);
 1463                         break;
 1464                 }
                      /*
                       * The chunk is logged only if the copy succeeded; the
                       * tx is committed either way.
                       */
 1465                 error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
 1466                 if (error == 0)
 1467                         zvol_log_write(zv, tx, off, bytes, sync);
 1468                 dmu_tx_commit(tx);
 1469 
 1470                 if (error)
 1471                         break;
 1472         }
 1473         rangelock_exit(lr);
 1474 
              /* Runs even when the loop above stopped on an error. */
 1475         if (sync)
 1476                 zil_commit(zv->zv_zilog, ZVOL_OBJ);



 1477         return (error);
 1478 }
1479 
1480 int
1481 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1482 {
1483         struct uuid uuid = EFI_RESERVED;
1484         efi_gpe_t gpe = { 0 };
1485         uint32_t crc;
1486         dk_efi_t efi;
1487         int length;
1488         char *ptr;
1489 
1490         if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1491                 return (SET_ERROR(EFAULT));
1492         ptr = (char *)(uintptr_t)efi.dki_data_64;
1493         length = efi.dki_length;
1494         /*
1495          * Some clients may attempt to request a PMBR for the
1496          * zvol.  Currently this interface will return EINVAL to


1697                 dkmext.dki_media_type = DK_UNKNOWN;
1698                 mutex_exit(&zfsdev_state_lock);
1699                 if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1700                         error = SET_ERROR(EFAULT);
1701                 return (error);
1702         }
1703 
1704         case DKIOCGETEFI:
1705         {
1706                 uint64_t vs = zv->zv_volsize;
1707                 uint8_t bs = zv->zv_min_bs;
1708 
1709                 mutex_exit(&zfsdev_state_lock);
1710                 error = zvol_getefi((void *)arg, flag, vs, bs);
1711                 return (error);
1712         }
1713 
1714         case DKIOCFLUSHWRITECACHE:
1715                 dkc = (struct dk_callback *)arg;
1716                 mutex_exit(&zfsdev_state_lock);



1717                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1718                 if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1719                         (*dkc->dkc_callback)(dkc->dkc_cookie, error);
1720                         error = 0;
1721                 }



1722                 return (error);
1723 
1724         case DKIOCGETWCE:
1725         {
1726                 int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1727                 if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1728                     flag))
1729                         error = SET_ERROR(EFAULT);
1730                 break;
1731         }
1732         case DKIOCSETWCE:
1733         {
1734                 int wce;
1735                 if (ddi_copyin((void *)arg, &wce, sizeof (int),
1736                     flag)) {
1737                         error = SET_ERROR(EFAULT);
1738                         break;
1739                 }
1740                 if (wce) {
1741                         zv->zv_flags |= ZVOL_WCE;
1742                         mutex_exit(&zfsdev_state_lock);
1743                 } else {
1744                         zv->zv_flags &= ~ZVOL_WCE;
1745                         mutex_exit(&zfsdev_state_lock);

1746                         zil_commit(zv->zv_zilog, ZVOL_OBJ);

1747                 }
1748                 return (0);
1749         }
1750 
1751         case DKIOCGGEOM:
1752         case DKIOCGVTOC:
1753                 /*
1754                  * commands using these (like prtvtoc) expect ENOTSUP
1755                  * since we're emulating an EFI label
1756                  */
1757                 error = SET_ERROR(ENOTSUP);
1758                 break;
1759 
1760         case DKIOCDUMPINIT:
1761                 lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
1762                     RL_WRITER);
1763                 error = zvol_dumpify(zv);
1764                 rangelock_exit(lr);
1765                 break;
1766 


1779                 dmu_tx_t *tx;
1780 
1781                 if (!zvol_unmap_enabled)
1782                         break;
1783 
1784                 if (!(flag & FKIOCTL)) {
1785                         error = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
1786                         if (error != 0)
1787                                 break;
1788                 } else {
1789                         dfl = (dkioc_free_list_t *)arg;
1790                         ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS);
1791                         if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) {
1792                                 error = SET_ERROR(EINVAL);
1793                                 break;
1794                         }
1795                 }
1796 
1797                 mutex_exit(&zfsdev_state_lock);
1798 


1799                 for (int i = 0; i < dfl->dfl_num_exts; i++) {
1800                         uint64_t start = dfl->dfl_exts[i].dfle_start,
1801                             length = dfl->dfl_exts[i].dfle_length,
1802                             end = start + length;
1803 
1804                         /*
1805                          * Apply Postel's Law to length-checking.  If they
1806                          * overshoot, just blank out until the end, if there's
1807                          * a need to blank out anything.
1808                          */
1809                         if (start >= zv->zv_volsize)
1810                                 continue;       /* No need to do anything... */
1811                         if (end > zv->zv_volsize) {
1812                                 end = DMU_OBJECT_END;
1813                                 length = end - start;
1814                         }
1815 
1816                         lr = rangelock_enter(&zv->zv_rangelock, start, length,
1817                             RL_WRITER);
1818                         tx = dmu_tx_create(zv->zv_objset);


1834                 }
1835 
1836                 /*
1837                  * If the write-cache is disabled, 'sync' property
1838                  * is set to 'always', or if the caller is asking for
1839                  * a synchronous free, commit this operation to the zil.
1840                  * This will sync any previous uncommitted writes to the
1841                  * zvol object.
1842                  * Can be overridden by the zvol_unmap_sync_enabled tunable.
1843                  */
1844                 if ((error == 0) && zvol_unmap_sync_enabled &&
1845                     (!(zv->zv_flags & ZVOL_WCE) ||
1846                     (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1847                     (dfl->dfl_flags & DF_WAIT_SYNC))) {
1848                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1849                 }
1850 
1851                 if (!(flag & FKIOCTL))
1852                         dfl_free(dfl);
1853 


1854                 return (error);
1855         }
1856 
1857         default:
1858                 error = SET_ERROR(ENOTTY);
1859                 break;
1860 
1861         }
1862         mutex_exit(&zfsdev_state_lock);
1863         return (error);
1864 }
1865 
 1866 int
 1867 zvol_busy(void)
 1868 {
              /*
               * Returns 1 when zvol_minors is nonzero and 0 otherwise.
               * NOTE(review): zvol_minors is presumably the number of zvol
               * minors currently instantiated -- confirm where it is updated.
               */
 1869         return (zvol_minors != 0);
 1870 }
1871 
1872 void
1873 zvol_init(void)




   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.

  28  * Copyright (c) 2014 Integros [integros.com]
  29  * Copyright (c) 2019, Joyent, Inc.
  30  */
  31 
  32 /*
  33  * ZFS volume emulation driver.
  34  *
  35  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  36  * Volumes are accessed through the symbolic links named:
  37  *
  38  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  39  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  40  *
  41  * These links are created by the /dev filesystem (sdev_zvolops.c).
  42  * Volumes are persistent through reboot.  No user command needs to be
  43  * run before opening and using a device.
  44  */
  45 
  46 #include <sys/types.h>
  47 #include <sys/param.h>


  72 #include <sys/dirent.h>
  73 #include <sys/policy.h>
  74 #include <sys/fs/zfs.h>
  75 #include <sys/zfs_ioctl.h>
  76 #include <sys/mkdev.h>
  77 #include <sys/zil.h>
  78 #include <sys/refcount.h>
  79 #include <sys/zfs_znode.h>
  80 #include <sys/zfs_rlock.h>
  81 #include <sys/vdev_disk.h>
  82 #include <sys/vdev_impl.h>
  83 #include <sys/vdev_raidz.h>
  84 #include <sys/zvol.h>
  85 #include <sys/dumphdr.h>
  86 #include <sys/zil_impl.h>
  87 #include <sys/dbuf.h>
  88 #include <sys/dmu_tx.h>
  89 #include <sys/zfeature.h>
  90 #include <sys/zio_checksum.h>
  91 #include <sys/zil_impl.h>
  92 #include <sys/ht.h>
  93 #include <sys/dkioc_free_util.h>
  94 #include <sys/zfs_rlock.h>
  95 
  96 #include "zfs_namecheck.h"
  97 
  98 void *zfsdev_state;     /* NOTE(review): presumably the soft-state anchor; confirm */
  99 static char *zvol_tag = "zvol_tag";
 100 
     /* NOTE(review): "dumpsize" is presumably a property/key name -- confirm use. */
 101 #define ZVOL_DUMPSIZE           "dumpsize"
 102 
 103 /*
 104  * This lock protects the zfsdev_state structure from being modified
 105  * while it's being used, e.g. an open that comes in before a create
 106  * finishes.  It also protects temporary opens of the dataset so that,
 107  * e.g., an open doesn't get a spurious EBUSY.
 108  */
 109 kmutex_t zfsdev_state_lock;
     /* zvol_busy() reports the driver busy while this is nonzero. */
 110 static uint32_t zvol_minors;
 111 
 112 typedef struct zvol_extent {


1255 
1256         os = zv->zv_objset;
1257         ASSERT(os != NULL);
1258 
1259         bp_mapin(bp);
1260         addr = bp->b_un.b_addr;
1261         resid = bp->b_bcount;
1262 
1263         if (resid > 0 && (off < 0 || off >= volsize)) {
1264                 bioerror(bp, EIO);
1265                 biodone(bp);
1266                 return (0);
1267         }
1268 
1269         is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1270         sync = ((!(bp->b_flags & B_ASYNC) &&
1271             !(zv->zv_flags & ZVOL_WCE)) ||
1272             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1273             !doread && !is_dumpified;
1274 
1275         ht_begin_unsafe();
1276 
1277         /*
1278          * There must be no buffer changes when doing a dmu_sync() because
1279          * we can't change the data whilst calculating the checksum.
1280          */
1281         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid,
1282             doread ? RL_READER : RL_WRITER);
1283 
1284         while (resid != 0 && off < volsize) {
1285                 size_t size = MIN(resid, zvol_maxphys);
1286                 if (is_dumpified) {
1287                         size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1288                         error = zvol_dumpio(zv, addr, off, size,
1289                             doread, B_FALSE);
1290                 } else if (doread) {
1291                         error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1292                             DMU_READ_PREFETCH);
1293                 } else {
1294                         dmu_tx_t *tx = dmu_tx_create(os);
1295                         dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1296                         error = dmu_tx_assign(tx, TXG_WAIT);


1304                 }
1305                 if (error) {
1306                         /* convert checksum errors into IO errors */
1307                         if (error == ECKSUM)
1308                                 error = SET_ERROR(EIO);
1309                         break;
1310                 }
1311                 off += size;
1312                 addr += size;
1313                 resid -= size;
1314         }
1315         rangelock_exit(lr);
1316 
1317         if ((bp->b_resid = resid) == bp->b_bcount)
1318                 bioerror(bp, off > volsize ? EINVAL : error);
1319 
1320         if (sync)
1321                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1322         biodone(bp);
1323 
1324         ht_end_unsafe();
1325 
1326         return (0);
1327 }
1328 
 1329 /*
 1330  * Clamp the buffer's transfer count to the zvol maximum transfer size:
 1331  * if bp->b_bcount exceeds zvol_maxphys it is cut down to zvol_maxphys,
 1332  * otherwise it is left unchanged.  Using our own routine instead of the
 1333  * default minphys() means that larger writes use bigger buffers on X86
 1334  * (128K instead of 56K) and flush the disk write cache less often (every
 1335  * zvol_maxphys - currently 1MB - instead of every minphys: currently 56K
       * on X86 and 128K on sparc).
       *
       * NOTE(review): the sizes quoted above are carried over from the
       * earlier wording and cannot be checked from this function; the 128K
       * and 1MB figures look inconsistent -- confirm against zvol_maxphys.
 1336  */
 1337 void
 1338 zvol_minphys(struct buf *bp)
 1339 {
 1340         if (bp->b_bcount > zvol_maxphys)
 1341                 bp->b_bcount = zvol_maxphys;
 1342 }
1343 
1344 int
1345 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)


1383         minor_t minor = getminor(dev);
1384         zvol_state_t *zv;
1385         uint64_t volsize;
1386         int error = 0;
1387 
1388         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1389         if (zv == NULL)
1390                 return (SET_ERROR(ENXIO));
1391 
1392         volsize = zv->zv_volsize;
1393         if (uio->uio_resid > 0 &&
1394             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1395                 return (SET_ERROR(EIO));
1396 
1397         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1398                 error = physio(zvol_strategy, NULL, dev, B_READ,
1399                     zvol_minphys, uio);
1400                 return (error);
1401         }
1402 
1403         ht_begin_unsafe();
1404 
1405         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
1406             uio->uio_loffset, uio->uio_resid, RL_READER);
1407         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1408                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1409 
1410                 /* don't read past the end */
1411                 if (bytes > volsize - uio->uio_loffset)
1412                         bytes = volsize - uio->uio_loffset;
1413 
1414                 error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1415                 if (error) {
1416                         /* convert checksum errors into IO errors */
1417                         if (error == ECKSUM)
1418                                 error = SET_ERROR(EIO);
1419                         break;
1420                 }
1421         }
1422         rangelock_exit(lr);
1423 
1424         ht_end_unsafe();
1425 
1426         return (error);
1427 }
1428 
 1429 /*ARGSUSED*/
 1430 int
 1431 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 1432 {
              /*
               * Write the caller's uio into the zvol selected by dev's minor
               * number.  Returns 0 or an errno: ENXIO when no zvol soft state
               * exists for the minor, EIO when a non-empty request starts
               * below offset 0 or at/after the volume size, otherwise whatever
               * physio(), dmu_tx_assign() or dmu_write_uio_dnode() reports.
               * cr is not used.
               */
 1433         minor_t minor = getminor(dev);
 1434         zvol_state_t *zv;
 1435         uint64_t volsize;
 1436         int error = 0;
 1437         boolean_t sync;
 1438 
 1439         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 1440         if (zv == NULL)
 1441                 return (SET_ERROR(ENXIO));
 1442 
              /* volsize is sampled once and bounds every chunk below. */
 1443         volsize = zv->zv_volsize;
 1444         if (uio->uio_resid > 0 &&
 1445             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
 1446                 return (SET_ERROR(EIO));
 1447 
              /* Volumes flagged ZVOL_DUMPIFIED go through physio() instead. */
 1448         if (zv->zv_flags & ZVOL_DUMPIFIED) {
 1449                 error = physio(zvol_strategy, NULL, dev, B_WRITE,
 1450                     zvol_minphys, uio);
 1451                 return (error);
 1452         }
 1453 
              /*
               * Every return above happens before this call, so each
               * ht_begin_unsafe() is matched by the ht_end_unsafe() at the
               * single exit below.
               * NOTE(review): going by the change title (L1TF mitigation,
               * OS-7125), the pair presumably brackets work that must not
               * run beside an untrusted sibling hyperthread -- confirm in
               * <sys/ht.h>.
               */
 1454         ht_begin_unsafe();
 1455 
              /*
               * A ZIL commit is needed before returning when the ZVOL_WCE
               * flag is clear, or when the objset's sync setting is
               * ZFS_SYNC_ALWAYS regardless of that flag.
               */
 1456         sync = !(zv->zv_flags & ZVOL_WCE) ||
 1457             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
 1458 
              /*
               * The requested byte range is held as a writer for the whole
               * transfer.  The data is moved in chunks of at most
               * DMU_MAX_ACCESS >> 1 bytes, one DMU transaction per chunk; the
               * loop stops once the uio is drained or its offset reaches
               * volsize.
               */
 1459         locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
 1460             uio->uio_loffset, uio->uio_resid, RL_WRITER);
 1461         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
 1462                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
 1463                 uint64_t off = uio->uio_loffset;
 1464                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 1465 
 1466                 if (bytes > volsize - off)   /* don't write past the end */
 1467                         bytes = volsize - off;
 1468 
 1469                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 1470                 error = dmu_tx_assign(tx, TXG_WAIT);
 1471                 if (error) {
                              /* A tx that failed to assign is aborted. */
 1472                         dmu_tx_abort(tx);
 1473                         break;
 1474                 }
                      /*
                       * The chunk is logged only if the copy succeeded; the
                       * tx is committed either way.
                       */
 1475                 error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
 1476                 if (error == 0)
 1477                         zvol_log_write(zv, tx, off, bytes, sync);
 1478                 dmu_tx_commit(tx);
 1479 
 1480                 if (error)
 1481                         break;
 1482         }
 1483         rangelock_exit(lr);
 1484 
              /* Runs even when the loop above stopped on an error. */
 1485         if (sync)
 1486                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
 1487 
 1488         ht_end_unsafe();
 1489 
 1490         return (error);
 1491 }
1492 
1493 int
1494 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1495 {
1496         struct uuid uuid = EFI_RESERVED;
1497         efi_gpe_t gpe = { 0 };
1498         uint32_t crc;
1499         dk_efi_t efi;
1500         int length;
1501         char *ptr;
1502 
1503         if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1504                 return (SET_ERROR(EFAULT));
1505         ptr = (char *)(uintptr_t)efi.dki_data_64;
1506         length = efi.dki_length;
1507         /*
1508          * Some clients may attempt to request a PMBR for the
1509          * zvol.  Currently this interface will return EINVAL to


1710                 dkmext.dki_media_type = DK_UNKNOWN;
1711                 mutex_exit(&zfsdev_state_lock);
1712                 if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1713                         error = SET_ERROR(EFAULT);
1714                 return (error);
1715         }
1716 
1717         case DKIOCGETEFI:
1718         {
1719                 uint64_t vs = zv->zv_volsize;
1720                 uint8_t bs = zv->zv_min_bs;
1721 
1722                 mutex_exit(&zfsdev_state_lock);
1723                 error = zvol_getefi((void *)arg, flag, vs, bs);
1724                 return (error);
1725         }
1726 
1727         case DKIOCFLUSHWRITECACHE:
1728                 dkc = (struct dk_callback *)arg;
1729                 mutex_exit(&zfsdev_state_lock);
1730 
1731                 ht_begin_unsafe();
1732 
1733                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1734                 if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1735                         (*dkc->dkc_callback)(dkc->dkc_cookie, error);
1736                         error = 0;
1737                 }
1738 
1739                 ht_end_unsafe();
1740 
1741                 return (error);
1742 
1743         case DKIOCGETWCE:
1744         {
1745                 int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1746                 if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1747                     flag))
1748                         error = SET_ERROR(EFAULT);
1749                 break;
1750         }
1751         case DKIOCSETWCE:
1752         {
1753                 int wce;
1754                 if (ddi_copyin((void *)arg, &wce, sizeof (int),
1755                     flag)) {
1756                         error = SET_ERROR(EFAULT);
1757                         break;
1758                 }
1759                 if (wce) {
1760                         zv->zv_flags |= ZVOL_WCE;
1761                         mutex_exit(&zfsdev_state_lock);
1762                 } else {
1763                         zv->zv_flags &= ~ZVOL_WCE;
1764                         mutex_exit(&zfsdev_state_lock);
1765                         ht_begin_unsafe();
1766                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1767                         ht_end_unsafe();
1768                 }
1769                 return (0);
1770         }
1771 
1772         case DKIOCGGEOM:
1773         case DKIOCGVTOC:
1774                 /*
1775                  * commands using these (like prtvtoc) expect ENOTSUP
1776                  * since we're emulating an EFI label
1777                  */
1778                 error = SET_ERROR(ENOTSUP);
1779                 break;
1780 
1781         case DKIOCDUMPINIT:
1782                 lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
1783                     RL_WRITER);
1784                 error = zvol_dumpify(zv);
1785                 rangelock_exit(lr);
1786                 break;
1787 


1800                 dmu_tx_t *tx;
1801 
1802                 if (!zvol_unmap_enabled)
1803                         break;
1804 
1805                 if (!(flag & FKIOCTL)) {
1806                         error = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
1807                         if (error != 0)
1808                                 break;
1809                 } else {
1810                         dfl = (dkioc_free_list_t *)arg;
1811                         ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS);
1812                         if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) {
1813                                 error = SET_ERROR(EINVAL);
1814                                 break;
1815                         }
1816                 }
1817 
1818                 mutex_exit(&zfsdev_state_lock);
1819 
1820                 ht_begin_unsafe();
1821 
1822                 for (int i = 0; i < dfl->dfl_num_exts; i++) {
1823                         uint64_t start = dfl->dfl_exts[i].dfle_start,
1824                             length = dfl->dfl_exts[i].dfle_length,
1825                             end = start + length;
1826 
1827                         /*
1828                          * Apply Postel's Law to length-checking.  If they
1829                          * overshoot, just blank out until the end, if there's
1830                          * a need to blank out anything.
1831                          */
1832                         if (start >= zv->zv_volsize)
1833                                 continue;       /* No need to do anything... */
1834                         if (end > zv->zv_volsize) {
1835                                 end = DMU_OBJECT_END;
1836                                 length = end - start;
1837                         }
1838 
1839                         lr = rangelock_enter(&zv->zv_rangelock, start, length,
1840                             RL_WRITER);
1841                         tx = dmu_tx_create(zv->zv_objset);


1857                 }
1858 
1859                 /*
1860                  * If the write-cache is disabled, 'sync' property
1861                  * is set to 'always', or if the caller is asking for
1862                  * a synchronous free, commit this operation to the zil.
1863                  * This will sync any previous uncommitted writes to the
1864                  * zvol object.
1865                  * Can be overridden by the zvol_unmap_sync_enabled tunable.
1866                  */
1867                 if ((error == 0) && zvol_unmap_sync_enabled &&
1868                     (!(zv->zv_flags & ZVOL_WCE) ||
1869                     (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1870                     (dfl->dfl_flags & DF_WAIT_SYNC))) {
1871                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1872                 }
1873 
1874                 if (!(flag & FKIOCTL))
1875                         dfl_free(dfl);
1876 
1877                 ht_end_unsafe();
1878 
1879                 return (error);
1880         }
1881 
1882         default:
1883                 error = SET_ERROR(ENOTTY);
1884                 break;
1885 
1886         }
1887         mutex_exit(&zfsdev_state_lock);
1888         return (error);
1889 }
1890 
 1891 int
 1892 zvol_busy(void)
 1893 {
              /*
               * Returns 1 when zvol_minors is nonzero and 0 otherwise.
               * NOTE(review): zvol_minors is presumably the number of zvol
               * minors currently instantiated -- confirm where it is updated.
               */
 1894         return (zvol_minors != 0);
 1895 }
1896 
1897 void
1898 zvol_init(void)