4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
*** 52,61 ****
--- 52,62 ----
tx->tx_pool = dd->dd_pool;
list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
offsetof(dmu_tx_hold_t, txh_node));
list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
+ tx->tx_start = gethrtime();
#ifdef ZFS_DEBUG
refcount_create(&tx->tx_space_written);
refcount_create(&tx->tx_space_freed);
#endif
return (tx);
*** 595,611 ****
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_FREE, off, len);
if (txh == NULL)
return;
dn = txh->txh_dnode;
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
- dmu_tx_count_dnode(txh);
/*
* For i/o error checking, we read the first and last level-0
* blocks if they are not aligned, and all the level-1 blocks.
*
--- 596,612 ----
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_FREE, off, len);
if (txh == NULL)
return;
dn = txh->txh_dnode;
+ dmu_tx_count_dnode(txh);
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
/*
* For i/o error checking, we read the first and last level-0
* blocks if they are not aligned, and all the level-1 blocks.
*
*** 909,918 ****
--- 910,1069 ----
(u_longlong_t)db->db.db_object, db->db_level,
(u_longlong_t)db->db_blkid);
}
#endif
+ /*
+ * If we can't do 10 iops, something is wrong. Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+ hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+ int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+ /*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ * min_time = scale * (dirty - min) / (max - dirty)
+ * min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ * 10ms +-------------------------------------------------------------*+
+ * | *|
+ * 9ms + *+
+ * | *|
+ * 8ms + *+
+ * | * |
+ * 7ms + * +
+ * | * |
+ * 6ms + * +
+ * | * |
+ * 5ms + * +
+ * | * |
+ * 4ms + * +
+ * | * |
+ * 3ms + * +
+ * | * |
+ * 2ms + (midpoint) * +
+ * | | ** |
+ * 1ms + v *** +
+ * | zfs_delay_scale ----------> ******** |
+ * 0 +-------------------------------------*********----------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ * + +
+ * | |
+ * + *+
+ * 10ms + *+
+ * + ** +
+ * | (midpoint) ** |
+ * + | ** +
+ * 1ms + v **** +
+ * + zfs_delay_scale ----------> ***** +
+ * | **** |
+ * + **** +
+ * 100us + ** +
+ * + * +
+ * | * |
+ * + * +
+ * 10us + * +
+ * + +
+ * | |
+ * + +
+ * +--------------------------------------------------------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+ static void
+ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+ {
+ dsl_pool_t *dp = tx->tx_pool;
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ hrtime_t wakeup, min_tx_time, now;
+
+ if (dirty <= delay_min_bytes)
+ return;
+
+ /*
+ * The caller has already waited until we are under the max.
+ * We make them pass us the amount of dirty data so we don't
+ * have to handle the case of it being >= the max, which could
+ * cause a divide-by-zero if it's == the max.
+ */
+ ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+ now = gethrtime();
+ min_tx_time = zfs_delay_scale *
+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+ if (now > tx->tx_start + min_tx_time)
+ return;
+
+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+ uint64_t, min_tx_time);
+
+ mutex_enter(&dp->dp_lock);
+ wakeup = MAX(tx->tx_start + min_tx_time,
+ dp->dp_last_wakeup + min_tx_time);
+ dp->dp_last_wakeup = wakeup;
+ mutex_exit(&dp->dp_lock);
+
+ #ifdef _KERNEL
+ mutex_enter(&curthread->t_delay_lock);
+ while (cv_timedwait_hires(&curthread->t_delay_cv,
+ &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+ CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+ continue;
+ mutex_exit(&curthread->t_delay_lock);
+ #else
+ hrtime_t delta = wakeup - gethrtime();
+ struct timespec ts;
+ ts.tv_sec = delta / NANOSEC;
+ ts.tv_nsec = delta % NANOSEC;
+ (void) nanosleep(&ts, NULL);
+ #endif
+ }
+
static int
dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
dmu_tx_hold_t *txh;
spa_t *spa = tx->tx_pool->dp_spa;
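To make the curve above concrete, the formula min_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty) can be evaluated outside the kernel. The following is a hypothetical user-space sketch (not part of the changeset) that tabulates the per-transaction delay for a range of dirty-data levels; the tunable values (4 GiB zfs_dirty_data_max, 60% zfs_delay_min_dirty_percent, 500,000 ns zfs_delay_scale) are illustrative assumptions.

/*
 * Hypothetical user-space sketch: tabulate
 *   min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
 *                 (zfs_dirty_data_max - dirty)
 * for several dirty-data levels, using assumed tunable values.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assume 4 GiB */
	uint64_t zfs_delay_min_dirty_percent = 60;	/* assumed threshold */
	uint64_t zfs_delay_scale = 500 * 1000;		/* assumed 500us scale */
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	for (int pct = 60; pct < 100; pct += 5) {
		uint64_t dirty = zfs_dirty_data_max * pct / 100;
		if (dirty <= delay_min_bytes) {
			printf("%3d%% dirty: no delay\n", pct);
			continue;
		}
		uint64_t min_tx_time = zfs_delay_scale *
		    (dirty - delay_min_bytes) /
		    (zfs_dirty_data_max - dirty);
		printf("%3d%% dirty: %llu ns per tx\n", pct,
		    (unsigned long long)min_tx_time);
	}
	return (0);
}

With these assumed values, 80% dirty (the midpoint between the 60% threshold and the 100% limit) yields 500us per transaction, i.e. the 2000 IOPS figure mentioned in the block comment; the cap at zfs_delay_max_ns (100ms) corresponds to the 10 IOPS floor noted above.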
*** 939,948 ****
--- 1090,1105 ----
return (SET_ERROR(EIO));
return (SET_ERROR(ERESTART));
}
+ if (!tx->tx_waited &&
+ dsl_pool_need_dirty_delay(tx->tx_pool)) {
+ tx->tx_wait_dirty = B_TRUE;
+ return (SET_ERROR(ERESTART));
+ }
+
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
tx->tx_needassign_txh = NULL;
/*
* NB: No error returns are allowed after txg_hold_open, but
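dmu_tx_try_assign() now bounces a transaction with ERESTART when the pool's dirty data calls for throttling and the caller has not already waited. The predicate dsl_pool_need_dirty_delay() is defined elsewhere in the changeset; the fragment below is only a guess at the check it performs, consistent with the delay_min_bytes threshold used by dmu_tx_delay() above, and assumes the usual kernel headers for dsl_pool_t and the tunables.

/*
 * Sketch only: assumes the predicate compares the pool's total dirty
 * data against the same zfs_delay_min_dirty_percent threshold used by
 * dmu_tx_delay().  The real dsl_pool_need_dirty_delay() may differ.
 */
static boolean_t
dsl_pool_need_dirty_delay_sketch(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}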
*** 1063,1085 ****
*
* (2) TXG_NOWAIT. If we can't assign into the current open txg without
* blocking, returns immediately with ERESTART. This should be used
* whenever you're holding locks. On an ERESTART error, the caller
* should drop locks, do a dmu_tx_wait(tx), and try again.
*/
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
int err;
ASSERT(tx->tx_txg == 0);
! ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
/* If we might wait, we must not hold the config lock. */
ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
dmu_tx_unassign(tx);
if (err != ERESTART || txg_how != TXG_WAIT)
return (err);
--- 1220,1250 ----
*
* (2) TXG_NOWAIT. If we can't assign into the current open txg without
* blocking, returns immediately with ERESTART. This should be used
* whenever you're holding locks. On an ERESTART error, the caller
* should drop locks, do a dmu_tx_wait(tx), and try again.
+ *
+ * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
+ * has already been called on behalf of this operation (though
+ * most likely on a different tx).
*/
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
int err;
ASSERT(tx->tx_txg == 0);
! ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
! txg_how == TXG_WAITED);
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
/* If we might wait, we must not hold the config lock. */
ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+ if (txg_how == TXG_WAITED)
+ tx->tx_waited = B_TRUE;
+
while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
dmu_tx_unassign(tx);
if (err != ERESTART || txg_how != TXG_WAIT)
return (err);
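The TXG_WAITED mode is meant for callers that build a transaction, get ERESTART from a TXG_NOWAIT assignment, call dmu_tx_wait(), and then rebuild the transaction from scratch. The sketch below shows that caller-side retry pattern; the function name example_write_op and the os/object/off/len parameters are placeholders, not taken from the changeset.

/*
 * Illustrative caller-side retry loop (placeholder names).  After
 * dmu_tx_wait() on an ERESTART, the rebuilt transaction is assigned
 * with TXG_WAITED so the dirty-data delay is not charged twice.
 */
static int
example_write_op(objset_t *os, uint64_t object, uint64_t off, int len)
{
	boolean_t waited = B_FALSE;
	dmu_tx_t *tx;
	int error;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error != 0) {
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (error);
	}
	/* ... perform the writes covered by the hold ... */
	dmu_tx_commit(tx);
	return (0);
}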
*** 1094,1115 ****
void
dmu_tx_wait(dmu_tx_t *tx)
{
spa_t *spa = tx->tx_pool->dp_spa;
ASSERT(tx->tx_txg == 0);
ASSERT(!dsl_pool_config_held(tx->tx_pool));
/*
! * It's possible that the pool has become active after this thread
! * has tried to obtain a tx. If that's the case then his
! * tx_lasttried_txg would not have been assigned.
*/
! if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
! txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
} else if (tx->tx_needassign_txh) {
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
mutex_enter(&dn->dn_mtx);
while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
--- 1259,1310 ----
void
dmu_tx_wait(dmu_tx_t *tx)
{
spa_t *spa = tx->tx_pool->dp_spa;
+ dsl_pool_t *dp = tx->tx_pool;
ASSERT(tx->tx_txg == 0);
ASSERT(!dsl_pool_config_held(tx->tx_pool));
+ if (tx->tx_wait_dirty) {
/*
! * dmu_tx_try_assign() has determined that we need to wait
! * because we've consumed much or all of the dirty buffer
! * space.
*/
! mutex_enter(&dp->dp_lock);
! while (dp->dp_dirty_total >= zfs_dirty_data_max)
! cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
! uint64_t dirty = dp->dp_dirty_total;
! mutex_exit(&dp->dp_lock);
!
! dmu_tx_delay(tx, dirty);
!
! tx->tx_wait_dirty = B_FALSE;
!
! /*
! * Note: setting tx_waited only has effect if the caller
! * used TXG_WAIT. Otherwise they are going to destroy
! * this tx and try again. The common case, zfs_write(),
! * uses TXG_WAIT.
! */
! tx->tx_waited = B_TRUE;
! } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
! /*
! * If the pool is suspended we need to wait until it
! * is resumed. Note that it's possible that the pool
! * has become active after this thread has tried to
! * obtain a tx. If that's the case then tx_lasttried_txg
! * would not have been set.
! */
! txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
} else if (tx->tx_needassign_txh) {
+ /*
+ * A dnode is assigned to the quiescing txg. Wait for its
+ * transaction to complete.
+ */
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
mutex_enter(&dn->dn_mtx);
while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
cv_wait(&dn->dn_notxholds, &dn->dn_mtx);