4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

--- old/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ new/usr/src/uts/common/fs/zfs/dmu_tx.c
[ 46 lines elided ]
  47   47  dmu_tx_create_dd(dsl_dir_t *dd)
  48   48  {
  49   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  50   50          tx->tx_dir = dd;
  51   51          if (dd != NULL)
  52   52                  tx->tx_pool = dd->dd_pool;
  53   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  54   54              offsetof(dmu_tx_hold_t, txh_node));
  55   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  56   56              offsetof(dmu_tx_callback_t, dcb_node));
       57 +        tx->tx_start = gethrtime();
  57   58  #ifdef ZFS_DEBUG
  58   59          refcount_create(&tx->tx_space_written);
  59   60          refcount_create(&tx->tx_space_freed);
  60   61  #endif
  61   62          return (tx);
  62   63  }
  63   64  
  64   65  dmu_tx_t *
  65   66  dmu_tx_create(objset_t *os)
  66   67  {
[ 523 lines elided ]
 590  591          int err;
 591  592          zio_t *zio;
 592  593  
 593  594          ASSERT(tx->tx_txg == 0);
 594  595  
 595  596          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 596  597              object, THT_FREE, off, len);
 597  598          if (txh == NULL)
 598  599                  return;
 599  600          dn = txh->txh_dnode;
      601 +        dmu_tx_count_dnode(txh);
 600  602  
 601  603          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 602  604                  return;
 603  605          if (len == DMU_OBJECT_END)
 604  606                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 605  607  
 606      -        dmu_tx_count_dnode(txh);
 607  608  
 608  609          /*
 609  610           * For i/o error checking, we read the first and last level-0
 610  611           * blocks if they are not aligned, and all the level-1 blocks.
 611  612           *
 612  613           * Note:  dbuf_free_range() assumes that we have not instantiated
 613  614           * any level-0 dbufs that will be completely freed.  Therefore we must
 614  615           * exercise care to not read or count the first and last blocks
 615  616           * if they are blocksize-aligned.
 616  617           */
[ 287 lines elided ]
 904  905                          return;
 905  906                  }
 906  907          }
 907  908          DB_DNODE_EXIT(db);
 908  909          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 909  910              (u_longlong_t)db->db.db_object, db->db_level,
 910  911              (u_longlong_t)db->db_blkid);
 911  912  }
 912  913  #endif
 913  914  
      915 +/*
      916 + * If we can't do 10 iops, something is wrong.  Let us go ahead
      917 + * and hit zfs_dirty_data_max.
      918 + */
      919 +hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
      920 +int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
      921 +
      922 +/*
      923 + * We delay transactions when we've determined that the backend storage
      924 + * isn't able to accommodate the rate of incoming writes.
      925 + *
      926 + * If there is already a transaction waiting, we delay relative to when
      927 + * that transaction finishes waiting.  This way the calculated min_time
      928 + * is independent of the number of threads concurrently executing
      929 + * transactions.
      930 + *
      931 + * If we are the only waiter, wait relative to when the transaction
      932 + * started, rather than the current time.  This credits the transaction for
      933 + * "time already served", e.g. reading indirect blocks.
      934 + *
      935 + * The minimum time for a transaction to take is calculated as:
      936 + *     min_time = scale * (dirty - min) / (max - dirty)
      937 + *     min_time is then capped at zfs_delay_max_ns.
      938 + *
      939 + * The delay has two degrees of freedom that can be adjusted via tunables.
      940 + * The percentage of dirty data at which we start to delay is defined by
      941 + * zfs_delay_min_dirty_percent. This should typically be at or above
      942 + * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
      943 + * delay after writing at full speed has failed to keep up with the incoming
      944 + * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
      945 + * speaking, this variable determines the amount of delay at the midpoint of
      946 + * the curve.
      947 + *
      948 + * delay
      949 + *  10ms +-------------------------------------------------------------*+
      950 + *       |                                                             *|
      951 + *   9ms +                                                             *+
      952 + *       |                                                             *|
      953 + *   8ms +                                                             *+
      954 + *       |                                                            * |
      955 + *   7ms +                                                            * +
      956 + *       |                                                            * |
      957 + *   6ms +                                                            * +
      958 + *       |                                                            * |
      959 + *   5ms +                                                           *  +
      960 + *       |                                                           *  |
      961 + *   4ms +                                                           *  +
      962 + *       |                                                           *  |
      963 + *   3ms +                                                          *   +
      964 + *       |                                                          *   |
      965 + *   2ms +                                              (midpoint) *    +
      966 + *       |                                                  |    **     |
      967 + *   1ms +                                                  v ***       +
      968 + *       |             zfs_delay_scale ---------->     ********         |
      969 + *     0 +-------------------------------------*********----------------+
      970 + *       0%                    <- zfs_dirty_data_max ->               100%
      971 + *
      972 + * Note that since the delay is added to the outstanding time remaining on the
      973 + * most recent transaction, the delay is effectively the inverse of IOPS.
      974 + * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
      975 + * was chosen such that small changes in the amount of accumulated dirty data
      976 + * in the first 3/4 of the curve yield relatively small differences in the
      977 + * amount of delay.
      978 + *
      979 + * The effects can be easier to understand when the amount of delay is
      980 + * represented on a log scale:
      981 + *
      982 + * delay
      983 + * 100ms +-------------------------------------------------------------++
      984 + *       +                                                              +
      985 + *       |                                                              |
      986 + *       +                                                             *+
      987 + *  10ms +                                                             *+
      988 + *       +                                                           ** +
      989 + *       |                                              (midpoint)  **  |
      990 + *       +                                                  |     **    +
      991 + *   1ms +                                                  v ****      +
      992 + *       +             zfs_delay_scale ---------->        *****         +
      993 + *       |                                             ****             |
      994 + *       +                                          ****                +
      995 + * 100us +                                        **                    +
      996 + *       +                                       *                      +
      997 + *       |                                      *                       |
      998 + *       +                                     *                        +
      999 + *  10us +                                     *                        +
     1000 + *       +                                                              +
     1001 + *       |                                                              |
     1002 + *       +                                                              +
     1003 + *       +--------------------------------------------------------------+
     1004 + *       0%                    <- zfs_dirty_data_max ->               100%
     1005 + *
     1006 + * Note here that only as the amount of dirty data approaches its limit does
     1007 + * the delay start to increase rapidly. The goal of a properly tuned system
     1008 + * should be to keep the amount of dirty data out of that range by first
     1009 + * ensuring that the appropriate limits are set for the I/O scheduler to reach
     1010 + * optimal throughput on the backend storage, and then by changing the value
     1011 + * of zfs_delay_scale to increase the steepness of the curve.
     1012 + */
     1013 +static void
     1014 +dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
     1015 +{
     1016 +        dsl_pool_t *dp = tx->tx_pool;
     1017 +        uint64_t delay_min_bytes =
     1018 +            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
     1019 +        hrtime_t wakeup, min_tx_time, now;
     1020 +
     1021 +        if (dirty <= delay_min_bytes)
     1022 +                return;
     1023 +
     1024 +        /*
     1025 +         * The caller has already waited until we are under the max.
     1026 +         * We make them pass us the amount of dirty data so we don't
     1027 +         * have to handle the case of it being >= the max, which could
     1028 +         * cause a divide-by-zero if it's == the max.
     1029 +         */
     1030 +        ASSERT3U(dirty, <, zfs_dirty_data_max);
     1031 +
     1032 +        now = gethrtime();
     1033 +        min_tx_time = zfs_delay_scale *
     1034 +            (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
     1035 +        if (now > tx->tx_start + min_tx_time)
     1036 +                return;
     1037 +
     1038 +        min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
     1039 +
     1040 +        DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
     1041 +            uint64_t, min_tx_time);
     1042 +
     1043 +        mutex_enter(&dp->dp_lock);
     1044 +        wakeup = MAX(tx->tx_start + min_tx_time,
     1045 +            dp->dp_last_wakeup + min_tx_time);
     1046 +        dp->dp_last_wakeup = wakeup;
     1047 +        mutex_exit(&dp->dp_lock);
     1048 +
     1049 +#ifdef _KERNEL
     1050 +        mutex_enter(&curthread->t_delay_lock);
     1051 +        while (cv_timedwait_hires(&curthread->t_delay_cv,
     1052 +            &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
     1053 +            CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
     1054 +                continue;
     1055 +        mutex_exit(&curthread->t_delay_lock);
     1056 +#else
     1057 +        hrtime_t delta = wakeup - gethrtime();
     1058 +        struct timespec ts;
     1059 +        ts.tv_sec = delta / NANOSEC;
     1060 +        ts.tv_nsec = delta % NANOSEC;
     1061 +        (void) nanosleep(&ts, NULL);
     1062 +#endif
     1063 +}
     1064 +
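
To make the curve above concrete, here is a minimal standalone sketch (not part of this change) that evaluates min_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty) at a few dirty-data levels. The tunable values below are assumptions chosen to match the 500us / 2000 IOPS midpoint described in the block comment; the actual values come from pool-level tunables not shown in this hunk.

/*
 * Sketch only: walk the delay curve with assumed tunable values.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assume a 4 GiB cap */
	uint64_t zfs_delay_min_dirty_percent = 60;	/* assumed default */
	uint64_t zfs_delay_scale = 500000;		/* 500us at the midpoint */
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	int pct;

	for (pct = 65; pct <= 95; pct += 5) {
		uint64_t dirty = zfs_dirty_data_max * pct / 100;
		uint64_t min_tx_time = zfs_delay_scale *
		    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
		(void) printf("%3d%% dirty -> min_tx_time %8llu ns\n",
		    pct, (unsigned long long)min_tx_time);
	}
	return (0);
}

With these assumed values the delay is 500us at 80% dirty (the midpoint), 1.5ms at 90%, and is clipped by zfs_delay_max_ns (100ms) only very close to the limit. Because each waiter's wakeup is pushed past dp_last_wakeup, ten threads each owing 500us wake roughly 500us apart, so the aggregate rate of delayed transactions stays near 2000 per second regardless of thread count.
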
 914 1065  static int
 915 1066  dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 916 1067  {
 917 1068          dmu_tx_hold_t *txh;
 918 1069          spa_t *spa = tx->tx_pool->dp_spa;
 919 1070          uint64_t memory, asize, fsize, usize;
 920 1071          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 921 1072  
 922 1073          ASSERT0(tx->tx_txg);
 923 1074  
[ 10 lines elided ]
 934 1085                   * Note that we always honor the txg_how flag regardless
 935 1086                   * of the failuremode setting.
 936 1087                   */
 937 1088                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 938 1089                      txg_how != TXG_WAIT)
 939 1090                          return (SET_ERROR(EIO));
 940 1091  
 941 1092                  return (SET_ERROR(ERESTART));
 942 1093          }
 943 1094  
     1095 +        if (!tx->tx_waited &&
     1096 +            dsl_pool_need_dirty_delay(tx->tx_pool)) {
     1097 +                tx->tx_wait_dirty = B_TRUE;
     1098 +                return (SET_ERROR(ERESTART));
     1099 +        }
     1100 +
 944 1101          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 945 1102          tx->tx_needassign_txh = NULL;
 946 1103  
 947 1104          /*
 948 1105           * NB: No error returns are allowed after txg_hold_open, but
 949 1106           * before processing the dnode holds, due to the
 950 1107           * dmu_tx_unassign() logic.
 951 1108           */
 952 1109  
 953 1110          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
[ 104 lines elided ]
1058 1215   * Assign tx to a transaction group.  txg_how can be one of:
1059 1216   *
1060 1217   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1061 1218   *      a new one.  This should be used when you're not holding locks.
1062 1219   *      It will only fail if we're truly out of space (or over quota).
1063 1220   *
1064 1221   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1065 1222   *      blocking, returns immediately with ERESTART.  This should be used
1066 1223   *      whenever you're holding locks.  On an ERESTART error, the caller
1067 1224   *      should drop locks, do a dmu_tx_wait(tx), and try again.
     1225 + *
     1226 + * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
     1227 + *      has already been called on behalf of this operation (though
     1228 + *      most likely on a different tx).
1068 1229   */
1069 1230  int
1070 1231  dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1071 1232  {
1072 1233          int err;
1073 1234  
1074 1235          ASSERT(tx->tx_txg == 0);
1075      -        ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
     1236 +        ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
     1237 +            txg_how == TXG_WAITED);
1076 1238          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1077 1239  
1078 1240          /* If we might wait, we must not hold the config lock. */
1079 1241          ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1080 1242  
     1243 +        if (txg_how == TXG_WAITED)
     1244 +                tx->tx_waited = B_TRUE;
     1245 +
1081 1246          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1082 1247                  dmu_tx_unassign(tx);
1083 1248  
1084 1249                  if (err != ERESTART || txg_how != TXG_WAIT)
1085 1250                          return (err);
1086 1251  
1087 1252                  dmu_tx_wait(tx);
1088 1253          }
1089 1254  
1090 1255          txg_rele_to_quiesce(&tx->tx_txgh);
1091 1256  
1092 1257          return (0);
1093 1258  }
1094 1259  
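
The retry pattern that the comment above describes -- assign with TXG_NOWAIT, and on ERESTART drop locks, call dmu_tx_wait(), abort, and reassign with TXG_WAITED so the wait already served is credited -- looks roughly like the sketch below. The function and its arguments are hypothetical illustrations, not code from this changeset.

#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/errno.h>

/*
 * Hypothetical example of how a lock-holding caller might combine
 * TXG_NOWAIT, dmu_tx_wait(), and TXG_WAITED.
 */
static int
example_write_op(objset_t *os, uint64_t object, uint64_t off, int len)
{
	dmu_tx_t *tx;
	boolean_t waited = B_FALSE;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (err == ERESTART) {
		/*
		 * Drop any locks here, then wait (which may apply the
		 * dirty-data delay) and retry with TXG_WAITED so this
		 * operation is not delayed a second time.
		 */
		waited = B_TRUE;
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		goto top;
	} else if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	/* ... apply the change in the assigned txg ... */

	dmu_tx_commit(tx);
	return (0);
}
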
1095 1260  void
1096 1261  dmu_tx_wait(dmu_tx_t *tx)
1097 1262  {
1098 1263          spa_t *spa = tx->tx_pool->dp_spa;
     1264 +        dsl_pool_t *dp = tx->tx_pool;
1099 1265  
1100 1266          ASSERT(tx->tx_txg == 0);
1101 1267          ASSERT(!dsl_pool_config_held(tx->tx_pool));
1102 1268  
1103      -        /*
1104      -         * It's possible that the pool has become active after this thread
1105      -         * has tried to obtain a tx. If that's the case then his
1106      -         * tx_lasttried_txg would not have been assigned.
1107      -         */
1108      -        if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1109      -                txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
     1269 +        if (tx->tx_wait_dirty) {
     1270 +                /*
     1271 +                 * dmu_tx_try_assign() has determined that we need to wait
     1272 +                 * because we've consumed much or all of the dirty buffer
     1273 +                 * space.
     1274 +                 */
     1275 +                mutex_enter(&dp->dp_lock);
     1276 +                while (dp->dp_dirty_total >= zfs_dirty_data_max)
     1277 +                        cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
     1278 +                uint64_t dirty = dp->dp_dirty_total;
     1279 +                mutex_exit(&dp->dp_lock);
     1280 +
     1281 +                dmu_tx_delay(tx, dirty);
     1282 +
     1283 +                tx->tx_wait_dirty = B_FALSE;
     1284 +
     1285 +                /*
     1286 +                 * Note: setting tx_waited only has effect if the caller
      1287 +                 * used TXG_WAIT.  Otherwise they are going to destroy
     1288 +                 * this tx and try again.  The common case, zfs_write(),
      1289 +                 * uses TXG_WAIT.
     1290 +                 */
     1291 +                tx->tx_waited = B_TRUE;
     1292 +        } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
     1293 +                /*
     1294 +                 * If the pool is suspended we need to wait until it
     1295 +                 * is resumed.  Note that it's possible that the pool
     1296 +                 * has become active after this thread has tried to
     1297 +                 * obtain a tx.  If that's the case then tx_lasttried_txg
     1298 +                 * would not have been set.
     1299 +                 */
     1300 +                txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1110 1301          } else if (tx->tx_needassign_txh) {
     1302 +                /*
     1303 +                 * A dnode is assigned to the quiescing txg.  Wait for its
     1304 +                 * transaction to complete.
     1305 +                 */
1111 1306                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1112 1307  
1113 1308                  mutex_enter(&dn->dn_mtx);
1114 1309                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1115 1310                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1116 1311                  mutex_exit(&dn->dn_mtx);
1117 1312                  tx->tx_needassign_txh = NULL;
1118 1313          } else {
1119 1314                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1120 1315          }
[ 294 lines elided ]