4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>


  28 #include <sys/txg_impl.h>
  29 #include <sys/dmu_impl.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/dsl_pool.h>
  32 #include <sys/dsl_scan.h>
  33 #include <sys/callb.h>
  34 
  35 /*
  36  * ZFS Transaction Groups
  37  * ----------------------
  38  *
  39  * ZFS transaction groups are, as the name implies, groups of transactions
  40  * that act on persistent state. ZFS asserts consistency at the granularity of
  41  * these transaction groups. Each successive transaction group (txg) is
  42  * assigned a 64-bit consecutive identifier. There are three active
  43  * transaction group states: open, quiescing, or syncing. At any given time,
  44  * there may be an active txg associated with each state; each active txg may
  45  * either be processing, or blocked waiting to enter the next state. There may
  46  * be up to three active txgs, and there is always a txg in the open state
  47  * (though it may be blocked waiting to enter the quiescing state). In broad
  48  * strokes, transactions -- operations that change in-memory structures -- are
  49  * accepted into the txg in the open state, and are completed while the txg is
  50  * in the open or quiescing states. The accumulated changes are written to
  51  * disk in the syncing state.
  52  *
  53  * Open
  54  *
  55  * When a new txg becomes active, it first enters the open state. New
  56  * transactions -- updates to in-memory structures -- are assigned to the
  57  * currently open txg. There is always a txg in the open state so that ZFS can
  58  * accept new changes (though the txg may refuse new changes if it has hit
  59  * some limit). ZFS advances the open txg to the next state for a variety of
  60  * reasons such as it hitting a time or size threshold, or the execution of an
  61  * administrative action that must be completed in the syncing state.
  62  *
  63  * Quiescing
  64  *
  65  * After a txg exits the open state, it enters the quiescing state. The
  66  * quiescing state is intended to provide a buffer between accepting new
  67  * transactions in the open state and writing them out to stable storage in
  68  * the syncing state. While quiescing, transactions can continue their
  69  * operation without delaying either of the other states. Typically, a txg is
  70  * in the quiescing state very briefly since the operations are bounded by
  71  * software latencies rather than, say, slower I/O latencies. After all
  72  * transactions complete, the txg is ready to enter the next state.
  73  *
  74  * Syncing
  75  *
  76  * In the syncing state, the in-memory state built up during the open and (to


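The comment above describes a pipeline of at most three active transaction groups, one per state, with a fresh txg always opened behind the one being closed. The standalone C sketch below models that pipeline; the names (txg_pipeline_t, txg_pipeline_advance) are invented for illustration, and the single advance step collapses what the quiesce and sync threads do independently in the real code.

#include <stdint.h>
#include <stdio.h>

/* One slot per state; 0 means no txg currently occupies that state. */
typedef struct txg_pipeline {
        uint64_t tp_open;       /* accepting new transactions */
        uint64_t tp_quiescing;  /* draining already-accepted transactions */
        uint64_t tp_syncing;    /* being written to stable storage */
} txg_pipeline_t;

/*
 * Advance every stage by one step: the syncing txg completes, the
 * quiescing txg moves to syncing, the open txg moves to quiescing,
 * and the next consecutive txg is opened so that there is always a
 * txg in the open state.
 */
static void
txg_pipeline_advance(txg_pipeline_t *tp)
{
        tp->tp_syncing = tp->tp_quiescing;
        tp->tp_quiescing = tp->tp_open;
        tp->tp_open++;
}

int
main(void)
{
        txg_pipeline_t tp = { .tp_open = 4, .tp_quiescing = 0, .tp_syncing = 0 };

        for (int i = 0; i < 4; i++) {
                (void) printf("open=%llu quiescing=%llu syncing=%llu\n",
                    (unsigned long long)tp.tp_open,
                    (unsigned long long)tp.tp_quiescing,
                    (unsigned long long)tp.tp_syncing);
                txg_pipeline_advance(&tp);
        }
        return (0);
}

In the real code the stages advance independently: txg_quiesce() below moves a txg from open to quiescing, and the sync thread picks up whatever the quiesce thread has handed off.
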
 347  * Blocks until all transactions in the group are committed.
 348  *
 349  * On return, the transaction group has reached a stable state in which it can
 350  * then be passed off to the syncing context.
 351  */
 352 static void
 353 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 354 {
 355         tx_state_t *tx = &dp->dp_tx;
 356         int g = txg & TXG_MASK;
 357         int c;
 358 
 359         /*
 360          * Grab all tc_open_locks so nobody else can get into this txg.
 361          */
 362         for (c = 0; c < max_ncpus; c++)
 363                 mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 364 
 365         ASSERT(txg == tx->tx_open_txg);
 366         tx->tx_open_txg++;
 367 
 368         DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
 369         DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
 370 
 371         /*
 372          * Now that we've incremented tx_open_txg, we can let threads
 373          * enter the next transaction group.
 374          */
 375         for (c = 0; c < max_ncpus; c++)
 376                 mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 377 
 378         /*
 379          * Quiesce the transaction group by waiting for everyone to txg_exit().
 380          */
 381         for (c = 0; c < max_ncpus; c++) {
 382                 tx_cpu_t *tc = &tx->tx_cpu[c];
 383                 mutex_enter(&tc->tc_lock);
 384                 while (tc->tc_count[g] != 0)
 385                         cv_wait(&tc->tc_cv[g], &tc->tc_lock);
 386                 mutex_exit(&tc->tc_lock);


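txg_quiesce() above indexes per-CPU state with "int g = txg & TXG_MASK;": because only a handful of txgs can be active at once, the 64-bit consecutive txg identifiers are mapped onto a small power-of-two ring of per-txg slots (tc_count[], tc_cv[]). Below is a minimal sketch of that indexing; TXG_SIZE and TXG_MASK mirror the definitions in the ZFS headers, and the slot count shown here is an assumption for illustration.

#include <stdint.h>
#include <stdio.h>

#define TXG_SIZE        4                       /* ring of per-txg slots */
#define TXG_MASK        (TXG_SIZE - 1)          /* TXG_SIZE is a power of two */

int
main(void)
{
        int count[TXG_SIZE] = { 0 };            /* stands in for tc_count[] */

        for (uint64_t txg = 100; txg < 106; txg++) {
                int g = txg & TXG_MASK;         /* same indexing as txg_quiesce() */

                count[g]++;
                (void) printf("txg %llu uses slot %d\n",
                    (unsigned long long)txg, g);
        }
        return (0);
}
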
 437 
 438                 list_move_tail(cb_list, &tc->tc_callbacks[g]);
 439 
 440                 (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
 441                     txg_do_callbacks, cb_list, TQ_SLEEP);
 442         }
 443 }
 444 
 445 static void
 446 txg_sync_thread(dsl_pool_t *dp)
 447 {
 448         spa_t *spa = dp->dp_spa;
 449         tx_state_t *tx = &dp->dp_tx;
 450         callb_cpr_t cpr;
 451         uint64_t start, delta;
 452 
 453         txg_thread_enter(tx, &cpr);
 454 
 455         start = delta = 0;
 456         for (;;) {
 457                 uint64_t timer, timeout = zfs_txg_timeout * hz;
 458                 uint64_t txg;
 459 
 460                 /*
 461                  * We sync when we're scanning, there's someone waiting
 462                  * on us, or the quiesce thread has handed off a txg to
 463                  * us, or we have reached our timeout.
 464                  */
 465                 timer = (delta >= timeout ? 0 : timeout - delta);
 466                 while (!dsl_scan_active(dp->dp_scan) &&
 467                     !tx->tx_exiting && timer > 0 &&
 468                     tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 469                     tx->tx_quiesced_txg == 0) {
 470                         dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 471                             tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 472                         txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
 473                         delta = ddi_get_lbolt() - start;
 474                         timer = (delta > timeout ? 0 : timeout - delta);
 475                 }
 476 
 477                 /*
 478                  * Wait until the quiesce thread hands off a txg to us,
 479                  * prompting it to do so if necessary.
 480                  */
 481                 while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
 482                         if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
 483                                 tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
 484                         cv_broadcast(&tx->tx_quiesce_more_cv);
 485                         txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
 486                 }
 487 
 488                 if (tx->tx_exiting)
 489                         txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);


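In the loop above, timeout is zfs_txg_timeout seconds converted to clock ticks via hz, delta is the number of ticks since the previous sync started (measured with ddi_get_lbolt()), and timer is the time left to sleep, clamped at zero so the "timer > 0" condition falls through once the timeout expires. A small sketch of that arithmetic follows; remaining_ticks is an invented helper name.

#include <stdint.h>

/*
 * Ticks left before the txg timeout expires, given when the last sync
 * started, the current tick count, and the timeout in ticks
 * (zfs_txg_timeout * hz in the code above).  Returns 0 once the
 * timeout has passed.
 */
static uint64_t
remaining_ticks(uint64_t start, uint64_t now, uint64_t timeout)
{
        uint64_t delta = now - start;

        return (delta >= timeout ? 0 : timeout - delta);
}

With zfs_txg_timeout at its usual value of 5 seconds and an hz of 100, for example, the sync thread sleeps at most 500 ticks past the start of the previous sync before a new sync is forced, assuming none of the other wake-up conditions fires first.
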
 624 void
 625 txg_wait_open(dsl_pool_t *dp, uint64_t txg)
 626 {
 627         tx_state_t *tx = &dp->dp_tx;
 628 
 629         ASSERT(!dsl_pool_config_held(dp));
 630 
 631         mutex_enter(&tx->tx_sync_lock);
 632         ASSERT(tx->tx_threads == 2);
 633         if (txg == 0)
 634                 txg = tx->tx_open_txg + 1;
 635         if (tx->tx_quiesce_txg_waiting < txg)
 636                 tx->tx_quiesce_txg_waiting = txg;
 637         dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 638             txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 639         while (tx->tx_open_txg < txg) {
 640                 cv_broadcast(&tx->tx_quiesce_more_cv);
 641                 cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
 642         }
 643         mutex_exit(&tx->tx_sync_lock);
 644 }
 645 
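txg_wait_open() blocks the caller until the requested txg has been opened, prodding the quiesce thread along the way; passing 0 means "the txg after the one currently open", per the "if (txg == 0)" branch. A hedged sketch of a caller inside the ZFS module follows (wait_for_next_open_txg is an invented name; the includes mirror the ones at the top of this file plus sys/txg.h, where txg_wait_open() is declared).

#include <sys/dsl_pool.h>
#include <sys/txg.h>

/*
 * Hypothetical helper: block until a transaction group newer than the
 * currently open one has been opened, i.e. until every transaction
 * assigned so far belongs to a txg that is at least quiescing.
 */
static void
wait_for_next_open_txg(dsl_pool_t *dp)
{
        txg_wait_open(dp, 0);
}

The ASSERTs in txg_wait_open() spell out the contract: both txg worker threads (quiesce and sync) must be running, and the caller must not hold the pool configuration lock.
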
 646 boolean_t
 647 txg_stalled(dsl_pool_t *dp)
 648 {
 649         tx_state_t *tx = &dp->dp_tx;
 650         return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
 651 }
 652 
 653 boolean_t
 654 txg_sync_waiting(dsl_pool_t *dp)
 655 {
 656         tx_state_t *tx = &dp->dp_tx;
 657 
 658         return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
 659             tx->tx_quiesced_txg != 0);
 660 }
 661 
 662 /*
 663  * Per-txg object lists.




  28 #include <sys/txg_impl.h>
  29 #include <sys/dmu_impl.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/dsl_pool.h>
  32 #include <sys/dsl_scan.h>
  33 #include <sys/callb.h>
  34 
  35 /*
  36  * ZFS Transaction Groups
  37  * ----------------------
  38  *
  39  * ZFS transaction groups are, as the name implies, groups of transactions
  40  * that act on persistent state. ZFS asserts consistency at the granularity of
  41  * these transaction groups. Each successive transaction group (txg) is
  42  * assigned a 64-bit consecutive identifier. There are three active
  43  * transaction group states: open, quiescing, or syncing. At any given time,
  44  * there may be an active txg associated with each state; each active txg may
  45  * either be processing, or blocked waiting to enter the next state. There may
  46  * be up to three active txgs, and there is always a txg in the open state
  47  * (though it may be blocked waiting to enter the quiescing state). In broad
  48  * strokes, transactions -- operations that change in-memory structures -- are
  49  * accepted into the txg in the open state, and are completed while the txg is
  50  * in the open or quiescing states. The accumulated changes are written to
  51  * disk in the syncing state.
  52  *
  53  * Open
  54  *
  55  * When a new txg becomes active, it first enters the open state. New
  56  * transactions -- updates to in-memory structures -- are assigned to the
  57  * currently open txg. There is always a txg in the open state so that ZFS can
  58  * accept new changes (though the txg may refuse new changes if it has hit
  59  * some limit). ZFS advances the open txg to the next state for a variety of
  60  * reasons such as it hitting a time or size threshold, or the execution of an
  61  * administrative action that must be completed in the syncing state.
  62  *
  63  * Quiescing
  64  *
  65  * After a txg exits the open state, it enters the quiescing state. The
  66  * quiescing state is intended to provide a buffer between accepting new
  67  * transactions in the open state and writing them out to stable storage in
  68  * the syncing state. While quiescing, transactions can continue their
  69  * operation without delaying either of the other states. Typically, a txg is
  70  * in the quiescing state very briefly since the operations are bounded by
  71  * software latencies rather than, say, slower I/O latencies. After all
  72  * transactions complete, the txg is ready to enter the next state.
  73  *
  74  * Syncing
  75  *
  76  * In the syncing state, the in-memory state built up during the open and (to


 347  * Blocks until all transactions in the group are committed.
 348  *
 349  * On return, the transaction group has reached a stable state in which it can
 350  * then be passed off to the syncing context.
 351  */
 352 static void
 353 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 354 {
 355         tx_state_t *tx = &dp->dp_tx;
 356         int g = txg & TXG_MASK;
 357         int c;
 358 
 359         /*
 360          * Grab all tc_open_locks so nobody else can get into this txg.
 361          */
 362         for (c = 0; c < max_ncpus; c++)
 363                 mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 364 
 365         ASSERT(txg == tx->tx_open_txg);
 366         tx->tx_open_txg++;
 367         tx->tx_open_time = gethrtime();
 368 
 369         DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
 370         DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
 371 
 372         /*
 373          * Now that we've incremented tx_open_txg, we can let threads
 374          * enter the next transaction group.
 375          */
 376         for (c = 0; c < max_ncpus; c++)
 377                 mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 378 
 379         /*
 380          * Quiesce the transaction group by waiting for everyone to txg_exit().
 381          */
 382         for (c = 0; c < max_ncpus; c++) {
 383                 tx_cpu_t *tc = &tx->tx_cpu[c];
 384                 mutex_enter(&tc->tc_lock);
 385                 while (tc->tc_count[g] != 0)
 386                         cv_wait(&tc->tc_cv[g], &tc->tc_lock);
 387                 mutex_exit(&tc->tc_lock);


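txg_quiesce() is a two-phase barrier: it first takes every CPU's tc_open_lock so no new transaction can join the txg being closed, bumps tx_open_txg (and, with this change, records tx_open_time), drops the locks so new transactions proceed into the next txg, and only then waits for each CPU's in-flight count for the old txg's slot to drain to zero. The userland sketch below reproduces that pattern with POSIX threads; every name in it is invented, and it illustrates the locking scheme rather than the kernel code.

#include <pthread.h>
#include <stdint.h>

#define NCPU            4               /* stand-in for max_ncpus */
#define NSLOTS          4               /* stand-in for TXG_SIZE */
#define SLOTMASK        (NSLOTS - 1)    /* stand-in for TXG_MASK */

typedef struct model_cpu {
        pthread_mutex_t mc_open_lock;           /* like tc_open_lock */
        pthread_mutex_t mc_lock;                /* like tc_lock */
        pthread_cond_t  mc_cv[NSLOTS];          /* like tc_cv[] */
        int             mc_count[NSLOTS];       /* like tc_count[] */
} model_cpu_t;

static model_cpu_t cpus[NCPU];
static uint64_t open_txg = 4;                   /* like tx_open_txg */

static void
model_init(void)
{
        for (int c = 0; c < NCPU; c++) {
                pthread_mutex_init(&cpus[c].mc_open_lock, NULL);
                pthread_mutex_init(&cpus[c].mc_lock, NULL);
                for (int g = 0; g < NSLOTS; g++)
                        pthread_cond_init(&cpus[c].mc_cv[g], NULL);
        }
}

/* A worker joins the currently open txg on its cpu. */
static uint64_t
model_enter(int cpu)
{
        model_cpu_t *mc = &cpus[cpu];
        uint64_t txg;

        pthread_mutex_lock(&mc->mc_open_lock);
        txg = open_txg;
        pthread_mutex_lock(&mc->mc_lock);
        mc->mc_count[txg & SLOTMASK]++;
        pthread_mutex_unlock(&mc->mc_lock);
        pthread_mutex_unlock(&mc->mc_open_lock);
        return (txg);
}

/* The worker's transaction for that txg is done. */
static void
model_exit(int cpu, uint64_t txg)
{
        model_cpu_t *mc = &cpus[cpu];
        int g = txg & SLOTMASK;

        pthread_mutex_lock(&mc->mc_lock);
        if (--mc->mc_count[g] == 0)
                pthread_cond_broadcast(&mc->mc_cv[g]);
        pthread_mutex_unlock(&mc->mc_lock);
}

/* The quiesce step, mirroring txg_quiesce() above. */
static void
model_quiesce(uint64_t txg)
{
        int g = txg & SLOTMASK;
        int c;

        for (c = 0; c < NCPU; c++)              /* block new entries */
                pthread_mutex_lock(&cpus[c].mc_open_lock);
        open_txg++;                             /* the next txg is now open */
        for (c = 0; c < NCPU; c++)              /* new entries use the new txg */
                pthread_mutex_unlock(&cpus[c].mc_open_lock);

        for (c = 0; c < NCPU; c++) {            /* drain txg's slot on each cpu */
                model_cpu_t *mc = &cpus[c];

                pthread_mutex_lock(&mc->mc_lock);
                while (mc->mc_count[g] != 0)
                        pthread_cond_wait(&mc->mc_cv[g], &mc->mc_lock);
                pthread_mutex_unlock(&mc->mc_lock);
        }
}

int
main(void)
{
        model_init();

        uint64_t txg = model_enter(0);  /* a transaction joins the open txg */
        model_exit(0, txg);             /* ... and completes */
        model_quiesce(txg);             /* close that txg and drain its slot */
        return (0);
}

The property mirrored here is that any worker which observed the old open txg has already registered itself in that txg's slot before the quiesce can get past the open locks, so the drain loop cannot miss it.
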
 438 
 439                 list_move_tail(cb_list, &tc->tc_callbacks[g]);
 440 
 441                 (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
 442                     txg_do_callbacks, cb_list, TQ_SLEEP);
 443         }
 444 }
 445 
 446 static void
 447 txg_sync_thread(dsl_pool_t *dp)
 448 {
 449         spa_t *spa = dp->dp_spa;
 450         tx_state_t *tx = &dp->dp_tx;
 451         callb_cpr_t cpr;
 452         uint64_t start, delta;
 453 
 454         txg_thread_enter(tx, &cpr);
 455 
 456         start = delta = 0;
 457         for (;;) {
 458                 uint64_t timeout = zfs_txg_timeout * hz;
 459                 uint64_t timer;
 460                 uint64_t txg;
 461 
 462                 /*
 463                  * We sync when we're scanning, there's someone waiting
 464                  * on us, the quiesce thread has handed off a txg to us,
 465                  * we have hit our timeout, or there's enough dirty data.
 466                  */
 467                 timer = (delta >= timeout ? 0 : timeout - delta);
 468                 while (!dsl_scan_active(dp->dp_scan) &&
 469                     !tx->tx_exiting && timer > 0 &&
 470                     tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 471                     tx->tx_quiesced_txg == 0 &&
 472                     dp->dp_dirty_total < zfs_dirty_data_sync) {
 473                         dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 474                             tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 475                         txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
 476                         delta = ddi_get_lbolt() - start;
 477                         timer = (delta > timeout ? 0 : timeout - delta);
 478                 }
 479 
 480                 /*
 481                  * Wait until the quiesce thread hands off a txg to us,
 482                  * prompting it to do so if necessary.
 483                  */
 484                 while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
 485                         if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
 486                                 tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
 487                         cv_broadcast(&tx->tx_quiesce_more_cv);
 488                         txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
 489                 }
 490 
 491                 if (tx->tx_exiting)
 492                         txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);


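Relative to the earlier version of this loop (the first listing on this page), the wait condition gains dp->dp_dirty_total < zfs_dirty_data_sync: the sync thread now also stops waiting once the pool has accumulated enough dirty data, not only on a timeout, an explicit waiter, an active scan, or a handed-off txg. A standalone model of that decision follows; the function and parameter names are invented, and zfs_dirty_data_sync in the real code is a tunable byte threshold.

#include <stdbool.h>
#include <stdint.h>

/*
 * Model of the sync thread's "keep sleeping?" test from the loop
 * above: returns true only while every exit condition is still false.
 * Any one of them -- an active scan, thread exit, an expired timer,
 * a waiter on a newer txg, a quiesced txg handed off, or dirty data
 * reaching the sync threshold -- ends the wait and starts a sync.
 */
static bool
keep_waiting(bool scan_active, bool exiting, uint64_t timer,
    uint64_t synced_txg, uint64_t sync_txg_waiting, uint64_t quiesced_txg,
    uint64_t dirty_total, uint64_t dirty_data_sync)
{
        return (!scan_active && !exiting && timer > 0 &&
            synced_txg >= sync_txg_waiting &&
            quiesced_txg == 0 &&
            dirty_total < dirty_data_sync);
}

The real loop recomputes timer from ddi_get_lbolt() each time it wakes up, so spurious wake-ups simply shorten the remaining sleep rather than restarting it.
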
 627 void
 628 txg_wait_open(dsl_pool_t *dp, uint64_t txg)
 629 {
 630         tx_state_t *tx = &dp->dp_tx;
 631 
 632         ASSERT(!dsl_pool_config_held(dp));
 633 
 634         mutex_enter(&tx->tx_sync_lock);
 635         ASSERT(tx->tx_threads == 2);
 636         if (txg == 0)
 637                 txg = tx->tx_open_txg + 1;
 638         if (tx->tx_quiesce_txg_waiting < txg)
 639                 tx->tx_quiesce_txg_waiting = txg;
 640         dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 641             txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 642         while (tx->tx_open_txg < txg) {
 643                 cv_broadcast(&tx->tx_quiesce_more_cv);
 644                 cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
 645         }
 646         mutex_exit(&tx->tx_sync_lock);
 647 }
 648 
 649 /*
 650  * If there isn't a txg syncing or in the pipeline, push another txg through
 651  * the pipeline by quiescing the open txg.
 652  */
 653 void
 654 txg_kick(dsl_pool_t *dp)
 655 {
 656         tx_state_t *tx = &dp->dp_tx;
 657 
 658         ASSERT(!dsl_pool_config_held(dp));
 659 
 660         mutex_enter(&tx->tx_sync_lock);
 661         if (tx->tx_syncing_txg == 0 &&
 662             tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
 663             tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
 664             tx->tx_quiesced_txg <= tx->tx_synced_txg) {
 665                 tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
 666                 cv_broadcast(&tx->tx_quiesce_more_cv);
 667         }
 668         mutex_exit(&tx->tx_sync_lock);
 669 }
 670 
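txg_kick() is new in this change. It nudges the pipeline only when it is otherwise idle: nothing is syncing, no quiesce or sync has already been requested, and no quiesced txg is waiting, so calling it redundantly is harmless. A hedged sketch of a hypothetical caller follows (the helper name and threshold parameter are invented; the real callers live elsewhere in this patch).

#include <sys/dsl_pool.h>
#include <sys/txg.h>

/*
 * Hypothetical: if dirty data has built up past some threshold and no
 * txg is moving through the pipeline, push the open txg toward
 * quiescing now rather than waiting out the zfs_txg_timeout timer.
 */
static void
maybe_kick_txg(dsl_pool_t *dp, uint64_t dirty_bytes, uint64_t threshold)
{
        if (dirty_bytes > threshold)
                txg_kick(dp);
}

txg_kick() takes tx_sync_lock and re-checks the pipeline state itself, and its ASSERT shows that callers are expected not to hold the pool configuration lock.
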
 671 boolean_t
 672 txg_stalled(dsl_pool_t *dp)
 673 {
 674         tx_state_t *tx = &dp->dp_tx;
 675         return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
 676 }
 677 
 678 boolean_t
 679 txg_sync_waiting(dsl_pool_t *dp)
 680 {
 681         tx_state_t *tx = &dp->dp_tx;
 682 
 683         return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
 684             tx->tx_quiesced_txg != 0);
 685 }
 686 
 687 /*
 688  * Per-txg object lists.