4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

          --- old/usr/src/uts/common/fs/zfs/txg.c
          +++ new/usr/src/uts/common/fs/zfs/txg.c
↓ open down ↓ 37 lines elided ↑ open up ↑
  38   38   *
  39   39   * ZFS transaction groups are, as the name implies, groups of transactions
  40   40   * that act on persistent state. ZFS asserts consistency at the granularity of
  41   41   * these transaction groups. Each successive transaction group (txg) is
  42   42   * assigned a 64-bit consecutive identifier. There are three active
  43   43   * transaction group states: open, quiescing, or syncing. At any given time,
  44   44   * there may be an active txg associated with each state; each active txg may
  45   45   * either be processing, or blocked waiting to enter the next state. There may
  46   46   * be up to three active txgs, and there is always a txg in the open state
  47   47   * (though it may be blocked waiting to enter the quiescing state). In broad
  48      - * strokes, transactions — operations that change in-memory structures — are
       48 + * strokes, transactions -- operations that change in-memory structures -- are
  49   49   * accepted into the txg in the open state, and are completed while the txg is
  50   50   * in the open or quiescing states. The accumulated changes are written to
  51   51   * disk in the syncing state.
  52   52   *
  53   53   * Open
  54   54   *
  55   55   * When a new txg becomes active, it first enters the open state. New
  56      - * transactions — updates to in-memory structures — are assigned to the
       56 + * transactions -- updates to in-memory structures -- are assigned to the
  57   57   * currently open txg. There is always a txg in the open state so that ZFS can
  58   58   * accept new changes (though the txg may refuse new changes if it has hit
  59   59   * some limit). ZFS advances the open txg to the next state for a variety of
  60   60   * reasons such as it hitting a time or size threshold, or the execution of an
  61   61   * administrative action that must be completed in the syncing state.
  62   62   *
  63   63   * Quiescing
  64   64   *
  65   65   * After a txg exits the open state, it enters the quiescing state. The
  66   66   * quiescing state is intended to provide a buffer between accepting new
↓ open down ↓ 290 lines elided ↑ open up ↑
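For context, the open state described in the comment above is the one consumers reach through the DMU transaction API: a change is bundled into a dmu_tx_t, assigned to the currently open txg, and committed while that txg is still in the open or quiescing state. The sketch below is illustrative only and not part of this diff; the object, size, and buffer arguments are placeholders.

        /*
         * Illustrative sketch (not part of this change): the usual DMU
         * pattern by which a write is accepted into the open txg.
         */
        dmu_tx_t *tx = dmu_tx_create(os);
        dmu_tx_hold_write(tx, object, 0, size);
        error = dmu_tx_assign(tx, TXG_WAIT);    /* joins the open txg */
        if (error != 0) {
                dmu_tx_abort(tx);
                return (error);
        }
        dmu_write(os, object, 0, size, buf, tx);
        dmu_tx_commit(tx);      /* completes while the txg is open or quiescing */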
 357  357          int c;
 358  358  
 359  359          /*
 360  360           * Grab all tc_open_locks so nobody else can get into this txg.
 361  361           */
 362  362          for (c = 0; c < max_ncpus; c++)
 363  363                  mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 364  364  
 365  365          ASSERT(txg == tx->tx_open_txg);
 366  366          tx->tx_open_txg++;
      367 +        tx->tx_open_time = gethrtime();
 367  368  
 368  369          DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
 369  370          DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
 370  371  
 371  372          /*
 372  373           * Now that we've incremented tx_open_txg, we can let threads
 373  374           * enter the next transaction group.
 374  375           */
 375  376          for (c = 0; c < max_ncpus; c++)
 376  377                  mutex_exit(&tx->tx_cpu[c].tc_open_lock);
↓ open down ↓ 70 lines elided ↑ open up ↑
 447  448  {
 448  449          spa_t *spa = dp->dp_spa;
 449  450          tx_state_t *tx = &dp->dp_tx;
 450  451          callb_cpr_t cpr;
 451  452          uint64_t start, delta;
 452  453  
 453  454          txg_thread_enter(tx, &cpr);
 454  455  
 455  456          start = delta = 0;
 456  457          for (;;) {
 457      -                uint64_t timer, timeout = zfs_txg_timeout * hz;
      458 +                uint64_t timeout = zfs_txg_timeout * hz;
      459 +                uint64_t timer;
 458  460                  uint64_t txg;
 459  461  
 460  462                  /*
 461  463                   * We sync when we're scanning, there's someone waiting
 462  464                   * on us, or the quiesce thread has handed off a txg to
 463  465                   * us, or we have reached our timeout.
 464  466                   */
 465  467                  timer = (delta >= timeout ? 0 : timeout - delta);
 466  468                  while (!dsl_scan_active(dp->dp_scan) &&
 467  469                      !tx->tx_exiting && timer > 0 &&
 468  470                      tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 469      -                    tx->tx_quiesced_txg == 0) {
      471 +                    tx->tx_quiesced_txg == 0 &&
      472 +                    dp->dp_dirty_total < zfs_dirty_data_sync) {
 470  473                          dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 471  474                              tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 472  475                          txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
 473  476                          delta = ddi_get_lbolt() - start;
 474  477                          timer = (delta > timeout ? 0 : timeout - delta);
 475  478                  }
 476  479  
 477  480                  /*
 478  481                   * Wait until the quiesce thread hands off a txg to us,
 479  482                   * prompting it to do so if necessary.
↓ open down ↓ 154 lines elided ↑ open up ↑
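The new dp_dirty_total check above is the sync trigger added by this write-throttle work: instead of sleeping out the full zfs_txg_timeout, the sync thread now also wakes as soon as the pool has accumulated zfs_dirty_data_sync bytes of dirty data. A minimal restatement of the wait predicate, using the same names as the diff (the helper function itself is hypothetical and not part of the change):

        /*
         * Hypothetical helper, not in the diff: the sync thread keeps
         * sleeping only while all of these hold.  The dp_dirty_total
         * comparison is the newly added condition.
         */
        static boolean_t
        txg_sync_should_wait(dsl_pool_t *dp, tx_state_t *tx, uint64_t timer)
        {
                return (!dsl_scan_active(dp->dp_scan) &&
                    !tx->tx_exiting && timer > 0 &&
                    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
                    tx->tx_quiesced_txg == 0 &&
                    dp->dp_dirty_total < zfs_dirty_data_sync);
        }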
 634  637                  txg = tx->tx_open_txg + 1;
 635  638          if (tx->tx_quiesce_txg_waiting < txg)
 636  639                  tx->tx_quiesce_txg_waiting = txg;
 637  640          dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 638  641              txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 639  642          while (tx->tx_open_txg < txg) {
 640  643                  cv_broadcast(&tx->tx_quiesce_more_cv);
 641  644                  cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
 642  645          }
 643  646          mutex_exit(&tx->tx_sync_lock);
      647 +}
      648 +
      649 +/*
      650 + * If there isn't a txg syncing or in the pipeline, push another txg through
       651 + * the pipeline by quiescing the open txg.
      652 + */
      653 +void
      654 +txg_kick(dsl_pool_t *dp)
      655 +{
      656 +        tx_state_t *tx = &dp->dp_tx;
      657 +
      658 +        ASSERT(!dsl_pool_config_held(dp));
      659 +
      660 +        mutex_enter(&tx->tx_sync_lock);
      661 +        if (tx->tx_syncing_txg == 0 &&
      662 +            tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
      663 +            tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
      664 +            tx->tx_quiesced_txg <= tx->tx_synced_txg) {
      665 +                tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
      666 +                cv_broadcast(&tx->tx_quiesce_more_cv);
      667 +        }
      668 +        mutex_exit(&tx->tx_sync_lock);
 644  669  }
 645  670  
 646  671  boolean_t
 647  672  txg_stalled(dsl_pool_t *dp)
 648  673  {
 649  674          tx_state_t *tx = &dp->dp_tx;
 650  675          return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
 651  676  }
 652  677  
 653  678  boolean_t
↓ open down ↓ 172 lines elided ↑ open up ↑
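The new txg_kick() above gives callers a way to push the open txg into the quiesce/sync pipeline without blocking: it only broadcasts tx_quiesce_more_cv when nothing is already quiescing, syncing, or being waited on. A caller outside txg.c (the actual call site is not shown in these hunks) would be expected to use it roughly as follows once dirty data builds up; this is an assumption for illustration, not the committed call site.

        /*
         * Illustrative only; the real call site is elsewhere in this
         * change.  Once the pool has enough dirty data, nudge the open
         * txg into the pipeline rather than waiting for zfs_txg_timeout.
         * txg_kick() itself verifies nothing is already in flight.
         */
        if (dp->dp_dirty_total >= zfs_dirty_data_sync)
                txg_kick(dp);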