4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

------- old/usr/src/uts/common/fs/zfs/spa.c -------

  70 #ifdef  _KERNEL
  71 #include <sys/bootprops.h>
  72 #include <sys/callb.h>
  73 #include <sys/cpupart.h>
  74 #include <sys/pool.h>
  75 #include <sys/sysdc.h>
  76 #include <sys/zone.h>
  77 #endif  /* _KERNEL */
  78 
  79 #include "zfs_prop.h"
  80 #include "zfs_comutil.h"
  81 
  82 /*
  83  * The interval, in seconds, at which failed configuration cache file writes
  84  * should be retried.
  85  */
  86 static int zfs_ccw_retry_interval = 300;
  87 
  88 typedef enum zti_modes {
  89         ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
  90         ZTI_MODE_ONLINE_PERCENT,        /* value is % of online CPUs */
  91         ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
  92         ZTI_MODE_NULL,                  /* don't create a taskq */
  93         ZTI_NMODES
  94 } zti_modes_t;
  95 
  96 #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
  97 #define ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
  98 #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
  99 #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
 100 
 101 #define ZTI_N(n)        ZTI_P(n, 1)
 102 #define ZTI_ONE         ZTI_N(1)
 103 
 104 typedef struct zio_taskq_info {
 105         zti_modes_t zti_mode;
 106         uint_t zti_value;
 107         uint_t zti_count;
 108 } zio_taskq_info_t;
 109 
 110 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 111         "issue", "issue_high", "intr", "intr_high"
 112 };
 113 
 114 /*
 115  * This table defines the taskq settings for each ZFS I/O type. When
 116  * initializing a pool, we use this table to create an appropriately sized
 117  * taskq. Some operations are low volume and therefore have a small, static


 129  * need to be handled with minimum delay.
 130  */
 131 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 132         /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 133         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
 134         { ZTI_N(8),     ZTI_NULL,       ZTI_BATCH,      ZTI_NULL }, /* READ */
 135         { ZTI_BATCH,    ZTI_N(5),       ZTI_N(8),       ZTI_N(5) }, /* WRITE */
 136         { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
 137         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
 138         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
 139 };
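
Each cell in the table above is a zio_taskq_info_t triple built by the
ZTI_* macros.  As a minimal sketch (not part of the webrev), the FREE
row's ISSUE entry, ZTI_P(12, 8), decodes as:

    /* Sketch only: ZTI_P(12, 8) expands to { ZTI_MODE_FIXED, 12, 8 }. */
    const zio_taskq_info_t zti_free_issue = {
            ZTI_MODE_FIXED, /* zti_mode:  fixed thread count per taskq */
            12,             /* zti_value: threads in each taskq */
            8               /* zti_count: eight taskqs are created */
    };

ZTI_BATCH, by contrast, requests a single taskq whose thread count is a
percentage of online CPUs (zio_taskq_batch_pct), and ZTI_NULL creates no
taskq at all.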
 140 
 141 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 142 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 143 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 144 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 145     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 146     char **ereport);
 147 static void spa_vdev_resilver_done(spa_t *spa);
 148 
 149 uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
 150 id_t            zio_taskq_psrset_bind = PS_NONE;
 151 boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 152 uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 153 
 154 boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 155 extern int      zfs_sync_pass_deferred_free;
 156 
 157 /*
 158  * This (illegal) pool name is used when temporarily importing a spa_t in order
 159  * to get the vdev stats associated with the imported devices.
 160  */
 161 #define TRYIMPORT_NAME  "$import"
 162 
 163 /*
 164  * ==========================================================================
 165  * SPA properties routines
 166  * ==========================================================================
 167  */
 168 
 169 /*


 825         const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 826         enum zti_modes mode = ztip->zti_mode;
 827         uint_t value = ztip->zti_value;
 828         uint_t count = ztip->zti_count;
 829         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 830         char name[32];
 831         uint_t flags = 0;
 832         boolean_t batch = B_FALSE;
 833 
 834         if (mode == ZTI_MODE_NULL) {
 835                 tqs->stqs_count = 0;
 836                 tqs->stqs_taskq = NULL;
 837                 return;
 838         }
 839 
 840         ASSERT3U(count, >, 0);
 841 
 842         tqs->stqs_count = count;
 843         tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 844 
 845         for (uint_t i = 0; i < count; i++) {
 846                 taskq_t *tq;
 847 
 848                 switch (mode) {
 849                 case ZTI_MODE_FIXED:
 850                         ASSERT3U(value, >=, 1);
 851                         value = MAX(value, 1);
 852                         break;
 853 
 854                 case ZTI_MODE_BATCH:
 855                         batch = B_TRUE;
 856                         flags |= TASKQ_THREADS_CPU_PCT;
 857                         value = zio_taskq_batch_pct;
 858                         break;
 859 
 860                 case ZTI_MODE_ONLINE_PERCENT:
 861                         flags |= TASKQ_THREADS_CPU_PCT;
 862                         break;
 863 
 864                 default:
 865                         panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 866                             "spa_activate()",
 867                             zio_type_name[t], zio_taskq_types[q], mode, value);
 868                         break;
 869                 }
 870 
 871                 if (count > 1) {
 872                         (void) snprintf(name, sizeof (name), "%s_%s_%u",
 873                             zio_type_name[t], zio_taskq_types[q], i);
 874                 } else {
 875                         (void) snprintf(name, sizeof (name), "%s_%s",
 876                             zio_type_name[t], zio_taskq_types[q]);
 877                 }
 878 
 879                 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 880                         if (batch)
 881                                 flags |= TASKQ_DC_BATCH;
 882 
 883                         tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 884                             spa->spa_proc, zio_taskq_basedc, flags);
 885                 } else {
 886                         tq = taskq_create_proc(name, value, maxclsyspri, 50,
 887                             INT_MAX, spa->spa_proc, flags);
 888                 }
 889 
 890                 tqs->stqs_taskq[i] = tq;
 891         }
 892 }
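
For reference, the snprintf() calls above produce names like
"zio_write_issue_high" (count == 1) and "zio_free_issue_0" through
"zio_free_issue_7" (count > 1).  A standalone sketch, assuming
zio_type_name[] entries of the form "zio_write"/"zio_free" as defined
in zio.c:

    #include <stdio.h>

    int
    main(void)
    {
            char name[32];

            /* count == 1 (e.g. WRITE/ISSUE_HIGH): no index suffix */
            (void) snprintf(name, sizeof (name), "%s_%s",
                "zio_write", "issue_high");
            (void) printf("%s\n", name);    /* zio_write_issue_high */

            /* count > 1 (e.g. FREE/ISSUE, ZTI_P(12, 8)): indexed 0-7 */
            (void) snprintf(name, sizeof (name), "%s_%s_%u",
                "zio_free", "issue", 0u);
            (void) printf("%s\n", name);    /* zio_free_issue_0 */
            return (0);
    }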
 893 
 894 static void
 895 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 896 {
 897         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 898 
 899         if (tqs->stqs_taskq == NULL) {
 900                 ASSERT0(tqs->stqs_count);
 901                 return;
 902         }
 903 
 904         for (uint_t i = 0; i < tqs->stqs_count; i++) {
 905                 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 906                 taskq_destroy(tqs->stqs_taskq[i]);


5720  */
5721 
5722 static int
5723 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5724 {
5725         bpobj_t *bpo = arg;
5726         bpobj_enqueue(bpo, bp, tx);
5727         return (0);
5728 }
5729 
5730 static int
5731 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5732 {
5733         zio_t *zio = arg;
5734 
5735         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5736             zio->io_flags));
5737         return (0);
5738 }
5739 
5740 static void
5741 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5742 {
5743         char *packed = NULL;
5744         size_t bufsize;
5745         size_t nvsize = 0;
5746         dmu_buf_t *db;
5747 
5748         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5749 
5750         /*
5751          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5752          * information.  This avoids the dbuf_will_dirty() path and
5753          * saves us a pre-read to get data we don't actually care about.
5754          */
5755         bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5756         packed = kmem_alloc(bufsize, KM_SLEEP);
5757 
5758         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5759             KM_SLEEP) == 0);
5760         bzero(packed + nvsize, bufsize - nvsize);
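
A standalone sketch of the rounding above, using the P2ROUNDUP definition
from sys/sysmacros.h and assuming SPA_CONFIG_BLOCKSIZE is 16K (1 << 14);
a hypothetical 37000-byte packed nvlist is padded up to the next full
block:

    #include <stdio.h>
    #include <stdint.h>

    /* illumos power-of-two round-up macro (sys/sysmacros.h) */
    #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

    int
    main(void)
    {
            uint64_t nvsize = 37000;        /* hypothetical packed size */
            uint64_t bufsize = P2ROUNDUP(nvsize, (uint64_t)(1 << 14));

            /* prints "37000 -> 49152", i.e. 3 * 16384 */
            (void) printf("%llu -> %llu\n",
                (unsigned long long)nvsize, (unsigned long long)bufsize);
            return (0);
    }

The bzero() above then zeroes the slack between nvsize and bufsize so the
padding written out is deterministic.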


6046                 /* Keeping the freedir open increases spa_minref */
6047                 spa->spa_minref += 3;
6048         }
6049 
6050         if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6051             spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6052                 spa_feature_create_zap_objects(spa, tx);
6053         }
6054         rrw_exit(&dp->dp_config_rwlock, FTAG);
6055 }
6056 
6057 /*
6058  * Sync the specified transaction group.  New blocks may be dirtied as
6059  * part of the process, so we iterate until it converges.
6060  */
6061 void
6062 spa_sync(spa_t *spa, uint64_t txg)
6063 {
6064         dsl_pool_t *dp = spa->spa_dsl_pool;
6065         objset_t *mos = spa->spa_meta_objset;
6066         bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
6067         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6068         vdev_t *rvd = spa->spa_root_vdev;
6069         vdev_t *vd;
6070         dmu_tx_t *tx;
6071         int error;
6072 
6073         VERIFY(spa_writeable(spa));
6074 
6075         /*
6076          * Lock out configuration changes.
6077          */
6078         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6079 
6080         spa->spa_syncing_txg = txg;
6081         spa->spa_sync_pass = 0;
6082 
6083         /*
6084          * If there are any pending vdev state changes, convert them
6085          * into config changes that go out with this transaction group.
6086          */


6126                 if (i == rvd->vdev_children) {
6127                         spa->spa_deflate = TRUE;
6128                         VERIFY(0 == zap_add(spa->spa_meta_objset,
6129                             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6130                             sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6131                 }
6132         }
6133 
6134         /*
6135          * If anything has changed in this txg, or if someone is waiting
6136          * for this txg to sync (eg, spa_vdev_remove()), push the
6137          * deferred frees from the previous txg.  If not, leave them
6138          * alone so that we don't generate work on an otherwise idle
6139          * system.
6140          */
6141         if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6142             !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6143             !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6144             ((dsl_scan_active(dp->dp_scan) ||
6145             txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6146                 zio_t *zio = zio_root(spa, NULL, NULL, 0);
6147                 VERIFY3U(bpobj_iterate(defer_bpo,
6148                     spa_free_sync_cb, zio, tx), ==, 0);
6149                 VERIFY0(zio_wait(zio));
6150         }
6151 
6152         /*
6153          * Iterate to convergence.
6154          */
6155         do {
6156                 int pass = ++spa->spa_sync_pass;
6157 
6158                 spa_sync_config_object(spa, tx);
6159                 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6160                     ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6161                 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6162                     ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6163                 spa_errlog_sync(spa, txg);
6164                 dsl_pool_sync(dp, txg);
6165 
6166                 if (pass < zfs_sync_pass_deferred_free) {
6167                         zio_t *zio = zio_root(spa, NULL, NULL, 0);
6168                         bplist_iterate(free_bpl, spa_free_sync_cb,
6169                             zio, tx);
6170                         VERIFY(zio_wait(zio) == 0);
6171                 } else {
6172                         bplist_iterate(free_bpl, bpobj_enqueue_cb,
6173                             defer_bpo, tx);
6174                 }
6175 
6176                 ddt_sync(spa, txg);
6177                 dsl_scan_sync(dp, tx);
6178 
6179                 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6180                         vdev_sync(vd, txg);
6181 
6182                 if (pass == 1)
6183                         spa_sync_upgrades(spa, tx);
6184 
6185         } while (dmu_objset_is_dirty(mos, txg));
6186 
6187         /*
6188          * Rewrite the vdev configuration (which includes the uberblock)
6189          * to commit the transaction group.
6190          *
6191          * If there are no dirty vdevs, we sync the uberblock to a few
6192          * random top-level vdevs that are known to be visible in the
6193          * config cache (see spa_vdev_add() for a complete description).

------- new/usr/src/uts/common/fs/zfs/spa.c -------

  70 #ifdef  _KERNEL
  71 #include <sys/bootprops.h>
  72 #include <sys/callb.h>
  73 #include <sys/cpupart.h>
  74 #include <sys/pool.h>
  75 #include <sys/sysdc.h>
  76 #include <sys/zone.h>
  77 #endif  /* _KERNEL */
  78 
  79 #include "zfs_prop.h"
  80 #include "zfs_comutil.h"
  81 
  82 /*
  83  * The interval, in seconds, at which failed configuration cache file writes
  84  * should be retried.
  85  */
  86 static int zfs_ccw_retry_interval = 300;
  87 
  88 typedef enum zti_modes {
  89         ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
  90         ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
  91         ZTI_MODE_NULL,                  /* don't create a taskq */
  92         ZTI_NMODES
  93 } zti_modes_t;
  94 
  95 #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
  96 #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
  97 #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
  98 
  99 #define ZTI_N(n)        ZTI_P(n, 1)
 100 #define ZTI_ONE         ZTI_N(1)
 101 
 102 typedef struct zio_taskq_info {
 103         zti_modes_t zti_mode;
 104         uint_t zti_value;
 105         uint_t zti_count;
 106 } zio_taskq_info_t;
 107 
 108 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 109         "issue", "issue_high", "intr", "intr_high"
 110 };
 111 
 112 /*
 113  * This table defines the taskq settings for each ZFS I/O type. When
 114  * initializing a pool, we use this table to create an appropriately sized
 115  * taskq. Some operations are low volume and therefore have a small, static


 127  * need to be handled with minimum delay.
 128  */
 129 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 130         /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 131         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
 132         { ZTI_N(8),     ZTI_NULL,       ZTI_BATCH,      ZTI_NULL }, /* READ */
 133         { ZTI_BATCH,    ZTI_N(5),       ZTI_N(8),       ZTI_N(5) }, /* WRITE */
 134         { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
 135         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
 136         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
 137 };
 138 
 139 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 140 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 141 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 142 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 143     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 144     char **ereport);
 145 static void spa_vdev_resilver_done(spa_t *spa);
 146 
 147 uint_t          zio_taskq_batch_pct = 75;       /* 1 thread per 75% of online CPUs */
 148 id_t            zio_taskq_psrset_bind = PS_NONE;
 149 boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 150 uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 151 
 152 boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 153 extern int      zfs_sync_pass_deferred_free;
 154 
 155 /*
 156  * This (illegal) pool name is used when temporarily importing a spa_t in order
 157  * to get the vdev stats associated with the imported devices.
 158  */
 159 #define TRYIMPORT_NAME  "$import"
 160 
 161 /*
 162  * ==========================================================================
 163  * SPA properties routines
 164  * ==========================================================================
 165  */
 166 
 167 /*


 823         const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 824         enum zti_modes mode = ztip->zti_mode;
 825         uint_t value = ztip->zti_value;
 826         uint_t count = ztip->zti_count;
 827         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 828         char name[32];
 829         uint_t flags = 0;
 830         boolean_t batch = B_FALSE;
 831 
 832         if (mode == ZTI_MODE_NULL) {
 833                 tqs->stqs_count = 0;
 834                 tqs->stqs_taskq = NULL;
 835                 return;
 836         }
 837 
 838         ASSERT3U(count, >, 0);
 839 
 840         tqs->stqs_count = count;
 841         tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 842 
 843         switch (mode) {
 844         case ZTI_MODE_FIXED:
 845                 ASSERT3U(value, >=, 1);
 846                 value = MAX(value, 1);
 847                 break;
 848 
 849         case ZTI_MODE_BATCH:
 850                 batch = B_TRUE;
 851                 flags |= TASKQ_THREADS_CPU_PCT;
 852                 value = zio_taskq_batch_pct;
 853                 break;
 854 
 855         default:
 856                 panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 857                     "spa_activate()",
 858                     zio_type_name[t], zio_taskq_types[q], mode, value);
 859                 break;
 860         }
 861 
 862         for (uint_t i = 0; i < count; i++) {
 863                 taskq_t *tq;
 864 
 865                 if (count > 1) {
 866                         (void) snprintf(name, sizeof (name), "%s_%s_%u",
 867                             zio_type_name[t], zio_taskq_types[q], i);
 868                 } else {
 869                         (void) snprintf(name, sizeof (name), "%s_%s",
 870                             zio_type_name[t], zio_taskq_types[q]);
 871                 }
 872 
 873                 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 874                         if (batch)
 875                                 flags |= TASKQ_DC_BATCH;
 876 
 877                         tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 878                             spa->spa_proc, zio_taskq_basedc, flags);
 879                 } else {
 880                         pri_t pri = maxclsyspri;
 881                         /*
 882                          * The write issue taskq can be extremely CPU
 883                          * intensive.  Run it at slightly lower priority
 884                          * than the other taskqs.
 885                          */
 886                         if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
 887                                 pri--;
 888 
 889                         tq = taskq_create_proc(name, value, pri, 50,
 890                             INT_MAX, spa->spa_proc, flags);
 891                 }
 892 
 893                 tqs->stqs_taskq[i] = tq;
 894         }
 895 }
 896 
 897 static void
 898 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 899 {
 900         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 901 
 902         if (tqs->stqs_taskq == NULL) {
 903                 ASSERT0(tqs->stqs_count);
 904                 return;
 905         }
 906 
 907         for (uint_t i = 0; i < tqs->stqs_count; i++) {
 908                 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 909                 taskq_destroy(tqs->stqs_taskq[i]);


5723  */
5724 
5725 static int
5726 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5727 {
5728         bpobj_t *bpo = arg;
5729         bpobj_enqueue(bpo, bp, tx);
5730         return (0);
5731 }
5732 
5733 static int
5734 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5735 {
5736         zio_t *zio = arg;
5737 
5738         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5739             zio->io_flags));
5740         return (0);
5741 }
5742 
5743 /*
5744  * Note: this simple function is not inlined to make it easier to dtrace the
5745  * amount of time spent syncing frees.
5746  */
5747 static void
5748 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
5749 {
5750         zio_t *zio = zio_root(spa, NULL, NULL, 0);
5751         bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
5752         VERIFY(zio_wait(zio) == 0);
5753 }
5754 
5755 /*
5756  * Note: this simple function is not inlined to make it easier to dtrace the
5757  * amount of time spent syncing deferred frees.
5758  */
5759 static void
5760 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
5761 {
5762         zio_t *zio = zio_root(spa, NULL, NULL, 0);
5763         VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
5764             spa_free_sync_cb, zio, tx), ==, 0);
5765         VERIFY0(zio_wait(zio));
5766 }
5767 
5768 
5769 static void
5770 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5771 {
5772         char *packed = NULL;
5773         size_t bufsize;
5774         size_t nvsize = 0;
5775         dmu_buf_t *db;
5776 
5777         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5778 
5779         /*
5780          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5781          * information.  This avoids the dbuf_will_dirty() path and
5782          * saves us a pre-read to get data we don't actually care about.
5783          */
5784         bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5785         packed = kmem_alloc(bufsize, KM_SLEEP);
5786 
5787         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5788             KM_SLEEP) == 0);
5789         bzero(packed + nvsize, bufsize - nvsize);


6075                 /* Keeping the freedir open increases spa_minref */
6076                 spa->spa_minref += 3;
6077         }
6078 
6079         if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6080             spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6081                 spa_feature_create_zap_objects(spa, tx);
6082         }
6083         rrw_exit(&dp->dp_config_rwlock, FTAG);
6084 }
6085 
6086 /*
6087  * Sync the specified transaction group.  New blocks may be dirtied as
6088  * part of the process, so we iterate until it converges.
6089  */
6090 void
6091 spa_sync(spa_t *spa, uint64_t txg)
6092 {
6093         dsl_pool_t *dp = spa->spa_dsl_pool;
6094         objset_t *mos = spa->spa_meta_objset;
6095         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6096         vdev_t *rvd = spa->spa_root_vdev;
6097         vdev_t *vd;
6098         dmu_tx_t *tx;
6099         int error;
6100 
6101         VERIFY(spa_writeable(spa));
6102 
6103         /*
6104          * Lock out configuration changes.
6105          */
6106         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6107 
6108         spa->spa_syncing_txg = txg;
6109         spa->spa_sync_pass = 0;
6110 
6111         /*
6112          * If there are any pending vdev state changes, convert them
6113          * into config changes that go out with this transaction group.
6114          */


6154                 if (i == rvd->vdev_children) {
6155                         spa->spa_deflate = TRUE;
6156                         VERIFY(0 == zap_add(spa->spa_meta_objset,
6157                             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6158                             sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6159                 }
6160         }
6161 
6162         /*
6163          * If anything has changed in this txg, or if someone is waiting
6164          * for this txg to sync (eg, spa_vdev_remove()), push the
6165          * deferred frees from the previous txg.  If not, leave them
6166          * alone so that we don't generate work on an otherwise idle
6167          * system.
6168          */
6169         if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6170             !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6171             !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6172             ((dsl_scan_active(dp->dp_scan) ||
6173             txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6174                 spa_sync_deferred_frees(spa, tx);
6175         }
6176 
6177         /*
6178          * Iterate to convergence.
6179          */
6180         do {
6181                 int pass = ++spa->spa_sync_pass;
6182 
6183                 spa_sync_config_object(spa, tx);
6184                 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6185                     ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6186                 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6187                     ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6188                 spa_errlog_sync(spa, txg);
6189                 dsl_pool_sync(dp, txg);
6190 
6191                 if (pass < zfs_sync_pass_deferred_free) {
6192                         spa_sync_frees(spa, free_bpl, tx);
6193                 } else {
6194                         bplist_iterate(free_bpl, bpobj_enqueue_cb,
6195                             &spa->spa_deferred_bpobj, tx);
6196                 }
6197 
6198                 ddt_sync(spa, txg);
6199                 dsl_scan_sync(dp, tx);
6200 
6201                 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6202                         vdev_sync(vd, txg);
6203 
6204                 if (pass == 1)
6205                         spa_sync_upgrades(spa, tx);
6206 
6207         } while (dmu_objset_is_dirty(mos, txg));
6208 
6209         /*
6210          * Rewrite the vdev configuration (which includes the uberblock)
6211          * to commit the transaction group.
6212          *
6213          * If there are no dirty vdevs, we sync the uberblock to a few
6214          * random top-level vdevs that are known to be visible in the
6215          * config cache (see spa_vdev_add() for a complete description).