3756 want lz4 support for metadata compression
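The two listings below show the affected DMU write-policy code before and after the change. Previously, metadata was always compressed with LZJB (or not at all when the zfs_mdcomp_disable tunable is set). The change adds a zfs_mdcomp_lz4 tunable and threads the current txg through dmu_write_policy() so that, when the tunable is set and the pool's lz4_compress feature is enabled, metadata is compressed with LZ4; the feature's reference count is bumped in an assigned transaction the first time LZ4 is used for metadata, which is why the new txg argument is needed. The sketch below distills that selection logic; the helper name is hypothetical and the real code sits inline in the new dmu_write_policy().

/*
 * Distilled sketch of the new metadata-compression selection (assumes the
 * same headers as dmu.c; the helper itself is not part of the patch).
 */
static enum zio_compress
mdcomp_select(objset_t *os, uint64_t txg)
{
        zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_LZ4_COMPRESS];

        if (zfs_mdcomp_disable)
                return (ZIO_COMPRESS_EMPTY);    /* metadata compression off */

        if (zfs_mdcomp_lz4 && os->os_spa != NULL) {
                if (spa_feature_is_active(os->os_spa, feat))
                        return (ZIO_COMPRESS_LZ4);
                if (spa_feature_is_enabled(os->os_spa, feat)) {
                        /* First LZ4 metadata write: activate the feature. */
                        dmu_tx_t *tx = dmu_tx_create_assigned(
                            spa_get_dsl(os->os_spa), txg);
                        spa_feature_incr(os->os_spa, feat, tx);
                        dmu_tx_commit(tx);
                        return (ZIO_COMPRESS_LZ4);
                }
        }
        return (ZIO_COMPRESS_LZJB);             /* legacy metadata default */
}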


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/dmu.h>
  27 #include <sys/dmu_impl.h>
  28 #include <sys/dmu_tx.h>
  29 #include <sys/dbuf.h>
  30 #include <sys/dnode.h>
  31 #include <sys/zfs_context.h>
  32 #include <sys/dmu_objset.h>
  33 #include <sys/dmu_traverse.h>
  34 #include <sys/dsl_dataset.h>
  35 #include <sys/dsl_dir.h>
  36 #include <sys/dsl_pool.h>
  37 #include <sys/dsl_synctask.h>
  38 #include <sys/dsl_prop.h>
  39 #include <sys/dmu_zfetch.h>
  40 #include <sys/zfs_ioctl.h>
  41 #include <sys/zap.h>
  42 #include <sys/zio_checksum.h>
  43 #include <sys/zio_compress.h>
  44 #include <sys/sa.h>
  45 #ifdef _KERNEL
  46 #include <sys/vmsystm.h>
  47 #include <sys/zfs_znode.h>
  48 #endif
  49 
  50 /*
  51  * Enable/disable nopwrite feature.
  52  */
  53 int zfs_nopwrite_enabled = 1;
  54 
  55 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
  56         {       DMU_BSWAP_UINT8,        TRUE,   "unallocated"           },
  57         {       DMU_BSWAP_ZAP,          TRUE,   "object directory"      },
  58         {       DMU_BSWAP_UINT64,       TRUE,   "object array"          },
  59         {       DMU_BSWAP_UINT8,        TRUE,   "packed nvlist"         },
  60         {       DMU_BSWAP_UINT64,       TRUE,   "packed nvlist size"    },
  61         {       DMU_BSWAP_UINT64,       TRUE,   "bpobj"                 },
  62         {       DMU_BSWAP_UINT64,       TRUE,   "bpobj header"          },
  63         {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map header"  },
  64         {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map"         },


1400 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1401 {
1402         blkptr_t *bp = zgd->zgd_bp;
1403         dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1404         objset_t *os = db->db_objset;
1405         dsl_dataset_t *ds = os->os_dsl_dataset;
1406         dbuf_dirty_record_t *dr;
1407         dmu_sync_arg_t *dsa;
1408         zbookmark_t zb;
1409         zio_prop_t zp;
1410         dnode_t *dn;
1411 
1412         ASSERT(pio != NULL);
1413         ASSERT(txg != 0);
1414 
1415         SET_BOOKMARK(&zb, ds->ds_object,
1416             db->db.db_object, db->db_level, db->db_blkid);
1417 
1418         DB_DNODE_ENTER(db);
1419         dn = DB_DNODE(db);
1420         dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
1421         DB_DNODE_EXIT(db);
1422 
1423         /*
1424          * If we're frozen (running ziltest), we always need to generate a bp.
1425          */
1426         if (txg > spa_freeze_txg(os->os_spa))
1427                 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1428 
1429         /*
1430          * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1431          * and us.  If we determine that this txg is not yet syncing,
1432          * but it begins to sync a moment later, that's OK because the
1433          * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1434          */
1435         mutex_enter(&db->db_mtx);
1436 
1437         if (txg <= spa_last_synced_txg(os->os_spa)) {
1438                 /*
1439                  * This txg has already synced.  There's nothing to do.
1440                  */


1538         dn->dn_checksum = checksum;
1539         dnode_setdirty(dn, tx);
1540         dnode_rele(dn, FTAG);
1541 }
1542 
1543 void
1544 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1545         dmu_tx_t *tx)
1546 {
1547         dnode_t *dn;
1548 
1549         /* XXX assumes dnode_hold will not get an i/o error */
1550         (void) dnode_hold(os, object, FTAG, &dn);
1551         ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
1552         dn->dn_compress = compress;
1553         dnode_setdirty(dn, tx);
1554         dnode_rele(dn, FTAG);
1555 }
1556 
1557 int zfs_mdcomp_disable = 0;
1558 
1559 void
1560 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
1561 {
1562         dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1563         boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1564             (wp & WP_SPILL));
1565         enum zio_checksum checksum = os->os_checksum;
1566         enum zio_compress compress = os->os_compress;
1567         enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1568         boolean_t dedup = B_FALSE;
1569         boolean_t nopwrite = B_FALSE;
1570         boolean_t dedup_verify = os->os_dedup_verify;
1571         int copies = os->os_copies;
1572 
1573         /*
1574          * We maintain different write policies for each of the following
1575          * types of data:
1576          *       1. metadata
1577          *       2. preallocated blocks (i.e. level-0 blocks of a dump device)
1578          *       3. all other level 0 blocks
1579          */
1580         if (ismd) {
1581                 /*
1582                  * XXX -- we should design a compression algorithm
1583                  * that specializes in arrays of bps.
1584                  */
1585                 compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
1586                     ZIO_COMPRESS_LZJB;
1587 
1588                 /*
1589                  * Metadata always gets checksummed.  If the data
1590                  * checksum is multi-bit correctable, and it's not a
1591                  * ZBT-style checksum, then it's suitable for metadata
1592                  * as well.  Otherwise, the metadata checksum defaults
1593                  * to fletcher4.
1594                  */
1595                 if (zio_checksum_table[checksum].ci_correctable < 1 ||
1596                     zio_checksum_table[checksum].ci_eck)
1597                         checksum = ZIO_CHECKSUM_FLETCHER_4;
1598         } else if (wp & WP_NOFILL) {
1599                 ASSERT(level == 0);
1600 
1601                 /*
1602                  * If we're writing preallocated blocks, we aren't actually
1603                  * writing them so don't set any policy properties.  These
1604                  * blocks are currently only used by an external subsystem
1605                  * outside of zfs (i.e. dump) and not written by the zio
1606                  * pipeline.




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 Martin Matuska. All rights reserved.
  25  */
  26 
  27 #include <sys/dmu.h>
  28 #include <sys/dmu_impl.h>
  29 #include <sys/dmu_tx.h>
  30 #include <sys/dbuf.h>
  31 #include <sys/dnode.h>
  32 #include <sys/zfs_context.h>
  33 #include <sys/dmu_objset.h>
  34 #include <sys/dmu_traverse.h>
  35 #include <sys/dsl_dataset.h>
  36 #include <sys/dsl_dir.h>
  37 #include <sys/dsl_pool.h>
  38 #include <sys/dsl_synctask.h>
  39 #include <sys/dsl_prop.h>
  40 #include <sys/dmu_zfetch.h>
  41 #include <sys/zfs_ioctl.h>
  42 #include <sys/zap.h>
  43 #include <sys/zio_checksum.h>
  44 #include <sys/zio_compress.h>
  45 #include <sys/sa.h>
  46 #include <sys/zfeature.h>
  47 #ifdef _KERNEL
  48 #include <sys/vmsystm.h>
  49 #include <sys/zfs_znode.h>
  50 #endif
  51 
  52 /*
  53  * Enable/disable nopwrite feature.
  54  */
  55 int zfs_nopwrite_enabled = 1;
  56 
  57 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
  58         {       DMU_BSWAP_UINT8,        TRUE,   "unallocated"           },
  59         {       DMU_BSWAP_ZAP,          TRUE,   "object directory"      },
  60         {       DMU_BSWAP_UINT64,       TRUE,   "object array"          },
  61         {       DMU_BSWAP_UINT8,        TRUE,   "packed nvlist"         },
  62         {       DMU_BSWAP_UINT64,       TRUE,   "packed nvlist size"    },
  63         {       DMU_BSWAP_UINT64,       TRUE,   "bpobj"                 },
  64         {       DMU_BSWAP_UINT64,       TRUE,   "bpobj header"          },
  65         {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map header"  },
  66         {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map"         },


1402 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1403 {
1404         blkptr_t *bp = zgd->zgd_bp;
1405         dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1406         objset_t *os = db->db_objset;
1407         dsl_dataset_t *ds = os->os_dsl_dataset;
1408         dbuf_dirty_record_t *dr;
1409         dmu_sync_arg_t *dsa;
1410         zbookmark_t zb;
1411         zio_prop_t zp;
1412         dnode_t *dn;
1413 
1414         ASSERT(pio != NULL);
1415         ASSERT(txg != 0);
1416 
1417         SET_BOOKMARK(&zb, ds->ds_object,
1418             db->db.db_object, db->db_level, db->db_blkid);
1419 
1420         DB_DNODE_ENTER(db);
1421         dn = DB_DNODE(db);
1422         dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp, txg);
1423         DB_DNODE_EXIT(db);
1424 
1425         /*
1426          * If we're frozen (running ziltest), we always need to generate a bp.
1427          */
1428         if (txg > spa_freeze_txg(os->os_spa))
1429                 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1430 
1431         /*
1432          * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1433          * and us.  If we determine that this txg is not yet syncing,
1434          * but it begins to sync a moment later, that's OK because the
1435          * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1436          */
1437         mutex_enter(&db->db_mtx);
1438 
1439         if (txg <= spa_last_synced_txg(os->os_spa)) {
1440                 /*
1441                  * This txg has already synced.  There's nothing to do.
1442                  */


1540         dn->dn_checksum = checksum;
1541         dnode_setdirty(dn, tx);
1542         dnode_rele(dn, FTAG);
1543 }
1544 
1545 void
1546 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1547         dmu_tx_t *tx)
1548 {
1549         dnode_t *dn;
1550 
1551         /* XXX assumes dnode_hold will not get an i/o error */
1552         (void) dnode_hold(os, object, FTAG, &dn);
1553         ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
1554         dn->dn_compress = compress;
1555         dnode_setdirty(dn, tx);
1556         dnode_rele(dn, FTAG);
1557 }
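
For context, a hedged sketch of how a consumer could drive dmu_object_set_compress(): because the dnode is dirtied, the caller needs an assigned transaction, here holding the object's bonus. The wrapper below is hypothetical; only the standard DMU transaction calls and the function from this file are real.

/*
 * Hypothetical caller: change an object's compression setting inside an
 * assigned transaction (standard DMU pattern, not part of this change).
 */
static int
set_object_compress(objset_t *os, uint64_t object, uint8_t compress)
{
        dmu_tx_t *tx = dmu_tx_create(os);
        int err;

        dmu_tx_hold_bonus(tx, object);          /* we will dirty the dnode */
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
        dmu_object_set_compress(os, object, compress, tx);
        dmu_tx_commit(tx);
        return (0);
}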
1558 
1559 int zfs_mdcomp_disable = 0;
1560 int zfs_mdcomp_lz4 = 0;
1561 
1562 void
1563 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp,
1564     uint64_t txg)
1565 {
1566         dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1567         boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1568             (wp & WP_SPILL));
1569         enum zio_checksum checksum = os->os_checksum;
1570         enum zio_compress compress = os->os_compress;
1571         enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1572         boolean_t dedup = B_FALSE;
1573         boolean_t nopwrite = B_FALSE;
1574         boolean_t dedup_verify = os->os_dedup_verify;
1575         int copies = os->os_copies;
1576 
1577         /*
1578          * We maintain different write policies for each of the following
1579          * types of data:
1580          *       1. metadata
1581          *       2. preallocated blocks (i.e. level-0 blocks of a dump device)
1582          *       3. all other level 0 blocks
1583          */
1584         if (ismd) {
1585                 /*
1586                  * XXX -- we should design a compression algorithm
1587                  * that specializes in arrays of bps.
1588                  */
1589                 if (zfs_mdcomp_disable)
1590                         compress = ZIO_COMPRESS_EMPTY;
1591                 else if (zfs_mdcomp_lz4 && os->os_spa != NULL) {
1592                         zfeature_info_t *feat = &spa_feature_table
1593                             [SPA_FEATURE_LZ4_COMPRESS];
1594 
1595                         if (spa_feature_is_active(os->os_spa, feat))
1596                                 compress = ZIO_COMPRESS_LZ4;
1597                         else if (spa_feature_is_enabled(os->os_spa, feat)) {
1598                                 dmu_tx_t *tx;
1599 
1600                                 tx = dmu_tx_create_assigned(
1601                                     spa_get_dsl(os->os_spa), txg);
1602                                 spa_feature_incr(os->os_spa, feat, tx);
1603                                 dmu_tx_commit(tx);
1604                                 compress = ZIO_COMPRESS_LZ4;
1605                         } else
1606                                 compress = ZIO_COMPRESS_LZJB;
1607                 } else
1608                         compress = ZIO_COMPRESS_LZJB;
1609 
1610                 /*
1611                  * Metadata always gets checksummed.  If the data
1612                  * checksum is multi-bit correctable, and it's not a
1613                  * ZBT-style checksum, then it's suitable for metadata
1614                  * as well.  Otherwise, the metadata checksum defaults
1615                  * to fletcher4.
1616                  */
1617                 if (zio_checksum_table[checksum].ci_correctable < 1 ||
1618                     zio_checksum_table[checksum].ci_eck)
1619                         checksum = ZIO_CHECKSUM_FLETCHER_4;
1620         } else if (wp & WP_NOFILL) {
1621                 ASSERT(level == 0);
1622 
1623                 /*
1624                  * If we're writing preallocated blocks, we aren't actually
1625                  * writing them so don't set any policy properties.  These
1626                  * blocks are currently only used by an external subsystem
1627                  * outside of zfs (i.e. dump) and not written by the zio
1628                  * pipeline.