Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/spa.c
          +++ new/usr/src/uts/common/fs/zfs/spa.c
↓ open down ↓ 79 lines elided ↑ open up ↑
  80   80  #include "zfs_comutil.h"
  81   81  
  82   82  /*
  83   83   * The interval, in seconds, at which failed configuration cache file writes
  84   84   * should be retried.
  85   85   */
  86   86  static int zfs_ccw_retry_interval = 300;
  87   87  
  88   88  typedef enum zti_modes {
  89   89          ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
  90      -        ZTI_MODE_ONLINE_PERCENT,        /* value is % of online CPUs */
  91   90          ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
  92   91          ZTI_MODE_NULL,                  /* don't create a taskq */
  93   92          ZTI_NMODES
  94   93  } zti_modes_t;
  95   94  
  96   95  #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
  97      -#define ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
  98   96  #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
  99   97  #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
 100   98  
 101   99  #define ZTI_N(n)        ZTI_P(n, 1)
 102  100  #define ZTI_ONE         ZTI_N(1)
 103  101  
 104  102  typedef struct zio_taskq_info {
 105  103          zti_modes_t zti_mode;
 106  104          uint_t zti_value;
 107  105          uint_t zti_count;
↓ open down ↓ 31 lines elided ↑ open up ↑
 139  137  };
 140  138  
 141  139  static void spa_sync_version(void *arg, dmu_tx_t *tx);
 142  140  static void spa_sync_props(void *arg, dmu_tx_t *tx);
 143  141  static boolean_t spa_has_active_shared_spare(spa_t *spa);
 144  142  static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 145  143      spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 146  144      char **ereport);
 147  145  static void spa_vdev_resilver_done(spa_t *spa);
 148  146  
 149      -uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
      147 +uint_t          zio_taskq_batch_pct = 75;       /* threads as % of pset cpus */
 150  148  id_t            zio_taskq_psrset_bind = PS_NONE;
 151  149  boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 152  150  uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 153  151  
 154  152  boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 155  153  extern int      zfs_sync_pass_deferred_free;
 156  154  
 157  155  /*
 158  156   * This (illegal) pool name is used when temporarily importing a spa_t in order
 159  157   * to get the vdev stats associated with the imported devices.
↓ open down ↓ 675 lines elided ↑ open up ↑
 835  833                  tqs->stqs_count = 0;
 836  834                  tqs->stqs_taskq = NULL;
 837  835                  return;
 838  836          }
 839  837  
 840  838          ASSERT3U(count, >, 0);
 841  839  
 842  840          tqs->stqs_count = count;
 843  841          tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 844  842  
 845      -        for (uint_t i = 0; i < count; i++) {
 846      -                taskq_t *tq;
      843 +        switch (mode) {
      844 +        case ZTI_MODE_FIXED:
      845 +                ASSERT3U(value, >=, 1);
      846 +                value = MAX(value, 1);
      847 +                break;
 847  848  
 848      -                switch (mode) {
 849      -                case ZTI_MODE_FIXED:
 850      -                        ASSERT3U(value, >=, 1);
 851      -                        value = MAX(value, 1);
 852      -                        break;
      849 +        case ZTI_MODE_BATCH:
      850 +                batch = B_TRUE;
      851 +                flags |= TASKQ_THREADS_CPU_PCT;
      852 +                value = zio_taskq_batch_pct;
      853 +                break;
 853  854  
 854      -                case ZTI_MODE_BATCH:
 855      -                        batch = B_TRUE;
 856      -                        flags |= TASKQ_THREADS_CPU_PCT;
 857      -                        value = zio_taskq_batch_pct;
 858      -                        break;
      855 +        default:
      856 +                panic("unrecognized mode for %s_%s taskq (%u:%u) in "
      857 +                    "spa_activate()",
      858 +                    zio_type_name[t], zio_taskq_types[q], mode, value);
      859 +                break;
      860 +        }
 859  861  
 860      -                case ZTI_MODE_ONLINE_PERCENT:
 861      -                        flags |= TASKQ_THREADS_CPU_PCT;
 862      -                        break;
      862 +        for (uint_t i = 0; i < count; i++) {
      863 +                taskq_t *tq;
 863  864  
 864      -                default:
 865      -                        panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 866      -                            "spa_activate()",
 867      -                            zio_type_name[t], zio_taskq_types[q], mode, value);
 868      -                        break;
 869      -                }
 870      -
 871  865                  if (count > 1) {
 872  866                          (void) snprintf(name, sizeof (name), "%s_%s_%u",
 873  867                              zio_type_name[t], zio_taskq_types[q], i);
 874  868                  } else {
 875  869                          (void) snprintf(name, sizeof (name), "%s_%s",
 876  870                              zio_type_name[t], zio_taskq_types[q]);
 877  871                  }
 878  872  
 879  873                  if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 880  874                          if (batch)
 881  875                                  flags |= TASKQ_DC_BATCH;
 882  876  
 883  877                          tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 884  878                              spa->spa_proc, zio_taskq_basedc, flags);
 885  879                  } else {
 886      -                        tq = taskq_create_proc(name, value, maxclsyspri, 50,
      880 +                        pri_t pri = maxclsyspri;
      881 +                        /*
      882 +                         * The write issue taskq can be extremely CPU
      883 +                         * intensive.  Run it at slightly lower priority
      884 +                         * than the other taskqs.
      885 +                         */
      886 +                        if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
      887 +                                pri--;
      888 +
      889 +                        tq = taskq_create_proc(name, value, pri, 50,
 887  890                              INT_MAX, spa->spa_proc, flags);
 888  891                  }
 889  892  
 890  893                  tqs->stqs_taskq[i] = tq;
 891  894          }
 892  895  }
 893  896  
 894  897  static void
 895  898  spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 896  899  {
↓ open down ↓ 4833 lines elided ↑ open up ↑
5730 5733  static int
5731 5734  spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5732 5735  {
5733 5736          zio_t *zio = arg;
5734 5737  
5735 5738          zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5736 5739              zio->io_flags));
5737 5740          return (0);
5738 5741  }
5739 5742  
     5743 +/*
     5744 + * Note: kept separate, rather than open-coded in spa_sync(), to make it easier
     5745 + * to dtrace the amount of time spent syncing frees.
     5746 + */
5740 5747  static void
     5748 +spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
     5749 +{
     5750 +        zio_t *zio = zio_root(spa, NULL, NULL, 0);
     5751 +        bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
     5752 +        VERIFY(zio_wait(zio) == 0);
     5753 +}
     5754 +
     5755 +/*
     5756 + * Note: kept separate, rather than open-coded in spa_sync(), to make it easier
     5757 + * to dtrace the amount of time spent syncing deferred frees.
     5758 + */
     5759 +static void
     5760 +spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
     5761 +{
     5762 +        zio_t *zio = zio_root(spa, NULL, NULL, 0);
     5763 +        VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
     5764 +            spa_free_sync_cb, zio, tx), ==, 0);
     5765 +        VERIFY0(zio_wait(zio));
     5766 +}
     5767 +
     5768 +
     5769 +static void
5741 5770  spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5742 5771  {
5743 5772          char *packed = NULL;
5744 5773          size_t bufsize;
5745 5774          size_t nvsize = 0;
5746 5775          dmu_buf_t *db;
5747 5776  
5748 5777          VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5749 5778  
5750 5779          /*
↓ open down ↓ 305 lines elided ↑ open up ↑
6056 6085  
6057 6086  /*
6058 6087   * Sync the specified transaction group.  New blocks may be dirtied as
6059 6088   * part of the process, so we iterate until it converges.
6060 6089   */
6061 6090  void
6062 6091  spa_sync(spa_t *spa, uint64_t txg)
6063 6092  {
6064 6093          dsl_pool_t *dp = spa->spa_dsl_pool;
6065 6094          objset_t *mos = spa->spa_meta_objset;
6066      -        bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
6067 6095          bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6068 6096          vdev_t *rvd = spa->spa_root_vdev;
6069 6097          vdev_t *vd;
6070 6098          dmu_tx_t *tx;
6071 6099          int error;
6072 6100  
6073 6101          VERIFY(spa_writeable(spa));
6074 6102  
6075 6103          /*
6076 6104           * Lock out configuration changes.
↓ open down ↓ 59 lines elided ↑ open up ↑
6136 6164           * for this txg to sync (eg, spa_vdev_remove()), push the
6137 6165           * deferred frees from the previous txg.  If not, leave them
6138 6166           * alone so that we don't generate work on an otherwise idle
6139 6167           * system.
6140 6168           */
6141 6169          if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6142 6170              !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6143 6171              !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6144 6172              ((dsl_scan_active(dp->dp_scan) ||
6145 6173              txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6146      -                zio_t *zio = zio_root(spa, NULL, NULL, 0);
6147      -                VERIFY3U(bpobj_iterate(defer_bpo,
6148      -                    spa_free_sync_cb, zio, tx), ==, 0);
6149      -                VERIFY0(zio_wait(zio));
     6174 +                spa_sync_deferred_frees(spa, tx);
6150 6175          }
6151 6176  
6152 6177          /*
6153 6178           * Iterate to convergence.
6154 6179           */
6155 6180          do {
6156 6181                  int pass = ++spa->spa_sync_pass;
6157 6182  
6158 6183                  spa_sync_config_object(spa, tx);
6159 6184                  spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6160 6185                      ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6161 6186                  spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6162 6187                      ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6163 6188                  spa_errlog_sync(spa, txg);
6164 6189                  dsl_pool_sync(dp, txg);
6165 6190  
6166 6191                  if (pass < zfs_sync_pass_deferred_free) {
6167      -                        zio_t *zio = zio_root(spa, NULL, NULL, 0);
6168      -                        bplist_iterate(free_bpl, spa_free_sync_cb,
6169      -                            zio, tx);
6170      -                        VERIFY(zio_wait(zio) == 0);
     6192 +                        spa_sync_frees(spa, free_bpl, tx);
6171 6193                  } else {
6172 6194                          bplist_iterate(free_bpl, bpobj_enqueue_cb,
6173      -                            defer_bpo, tx);
     6195 +                            &spa->spa_deferred_bpobj, tx);
6174 6196                  }
6175 6197  
6176 6198                  ddt_sync(spa, txg);
6177 6199                  dsl_scan_sync(dp, tx);
6178 6200  
6179 6201                  while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6180 6202                          vdev_sync(vd, txg);
6181 6203  
6182 6204                  if (pass == 1)
6183 6205                          spa_sync_upgrades(spa, tx);
↓ open down ↓ 317 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX