4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

------- old/usr/src/uts/common/fs/zfs/dsl_dir.c -------

 567         }
 568 
 569         if (used > quota) {
 570                 /* over quota */
 571                 myspace = 0;
 572         } else {
 573                 /*
 574                  * the lesser of the space provided by our parent and
 575                  * the space left in our quota
 576                  */
 577                 myspace = MIN(parentspace, quota - used);
 578         }
 579 
 580         mutex_exit(&dd->dd_lock);
 581 
 582         return (myspace);
 583 }
 584 
 585 struct tempreserve {
 586         list_node_t tr_node;
 587         dsl_pool_t *tr_dp;
 588         dsl_dir_t *tr_ds;
 589         uint64_t tr_size;
 590 };
 591 
 592 static int
 593 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 594     boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
 595     dmu_tx_t *tx, boolean_t first)
 596 {
 597         uint64_t txg = tx->tx_txg;
 598         uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
 599         uint64_t deferred = 0;
 600         struct tempreserve *tr;
 601         int retval = EDQUOT;
 602         int txgidx = txg & TXG_MASK;
 603         int i;
 604         uint64_t ref_rsrv = 0;
 605 
 606         ASSERT3U(txg, !=, 0);
 607         ASSERT3S(asize, >, 0);


 718         list_t *tr_list;
 719 
 720         if (asize == 0) {
 721                 *tr_cookiep = NULL;
 722                 return (0);
 723         }
 724 
 725         tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 726         list_create(tr_list, sizeof (struct tempreserve),
 727             offsetof(struct tempreserve, tr_node));
 728         ASSERT3S(asize, >, 0);
 729         ASSERT3S(fsize, >=, 0);
 730 
 731         err = arc_tempreserve_space(lsize, tx->tx_txg);
 732         if (err == 0) {
 733                 struct tempreserve *tr;
 734 
 735                 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 736                 tr->tr_size = lsize;
 737                 list_insert_tail(tr_list, tr);
 738 
 739                 err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 740         } else {
 741                 if (err == EAGAIN) {
 742                         txg_delay(dd->dd_pool, tx->tx_txg,
 743                             MSEC2NSEC(10), MSEC2NSEC(10));
 744                         err = SET_ERROR(ERESTART);
 745                 }
 746                 dsl_pool_memory_pressure(dd->dd_pool);
 747         }
 748 
 749         if (err == 0) {
 750                 struct tempreserve *tr;
 751 
 752                 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 753                 tr->tr_dp = dd->dd_pool;
 754                 tr->tr_size = asize;
 755                 list_insert_tail(tr_list, tr);
 756 
 757                 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 758                     FALSE, asize > usize, tr_list, tx, TRUE);
 759         }
 760 
 761         if (err != 0)
 762                 dsl_dir_tempreserve_clear(tr_list, tx);
 763         else
 764                 *tr_cookiep = tr_list;
 765 
 766         return (err);
 767 }
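
[Editor's note: tr_list above is an illumos intrusive list(9F) list: list_create() is given both the element size and the offset of the embedded list_node_t, so the elements themselves carry the linkage and no separate node allocations are needed. A minimal self-contained sketch of the same pattern follows; the demo_ent_t struct and demo_list() names are illustrative, not from this file.]

#include <sys/list.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>

typedef struct demo_ent {
        list_node_t de_node;    /* linkage embedded in the element */
        uint64_t de_size;
} demo_ent_t;

static void
demo_list(void)
{
        list_t l;
        demo_ent_t *de;

        /* element size + linkage offset, as with tr_list above */
        list_create(&l, sizeof (demo_ent_t), offsetof(demo_ent_t, de_node));

        de = kmem_zalloc(sizeof (demo_ent_t), KM_SLEEP);
        de->de_size = 1;
        list_insert_tail(&l, de);

        /* drain loop, same shape as dsl_dir_tempreserve_clear() */
        while ((de = list_head(&l)) != NULL) {
                list_remove(&l, de);
                kmem_free(de, sizeof (demo_ent_t));
        }
        list_destroy(&l);
}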
 768 
 769 /*
 770  * Clear a temporary reservation that we previously made with
 771  * dsl_dir_tempreserve_space().
 772  */
 773 void
 774 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
 775 {
 776         int txgidx = tx->tx_txg & TXG_MASK;
 777         list_t *tr_list = tr_cookie;
 778         struct tempreserve *tr;
 779 
 780         ASSERT3U(tx->tx_txg, !=, 0);
 781 
 782         if (tr_cookie == NULL)
 783                 return;
 784 
 785         while (tr = list_head(tr_list)) {
 786                 if (tr->tr_dp) {
 787                         dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
 788                 } else if (tr->tr_ds) {
 789                         mutex_enter(&tr->tr_ds->dd_lock);
 790                         ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 791                             tr->tr_size);
 792                         tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 793                         mutex_exit(&tr->tr_ds->dd_lock);
 794                 } else {
 795                         arc_tempreserve_clear(tr->tr_size);
 796                 }
 797                 list_remove(tr_list, tr);
 798                 kmem_free(tr, sizeof (struct tempreserve));
 799         }
 800 
 801         kmem_free(tr_list, sizeof (list_t));
 802 }
 803 
 804 static void
 805 dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 806 {
 807         int64_t parent_space;
 808         uint64_t est_used;
 809 
 810         mutex_enter(&dd->dd_lock);
 811         if (space > 0)
 812                 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 813 
 814         est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
 815         parent_space = parent_delta(dd, est_used, space);
 816         mutex_exit(&dd->dd_lock);
 817 
 818         /* Make sure that we clean up dd_space_to* */
 819         dsl_dir_dirty(dd, tx);
 820 
 821         /* XXX this is potentially expensive and unnecessary... */
 822         if (parent_space && dd->dd_parent)
 823                 dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
 824 }
 825 
 826 /*
 827  * Call in open context when we think we're going to write/free space,
 828  * eg. when dirtying data.  Be conservative (ie. OK to write less than
 829  * this or free more than this, but don't write more or free less).
 830  */
 831 void
 832 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 833 {
 834         dsl_pool_willuse_space(dd->dd_pool, space, tx);
 835         dsl_dir_willuse_space_impl(dd, space, tx);
 836 }
 837 
 838 /* call from syncing context when we actually write/free space for this dd */
 839 void
 840 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 841     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 842 {
 843         int64_t accounted_delta;
 844 
 845         /*
 846          * dsl_dataset_set_refreservation_sync_impl() calls this with
 847          * dd_lock held, so that it can atomically update
 848          * ds->ds_reserved and the dsl_dir accounting, so that
 849          * dsl_dataset_check_quota() can see dataset and dir accounting
 850          * consistently.
 851          */
 852         boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 853 
 854         ASSERT(dmu_tx_is_syncing(tx));
 855         ASSERT(type < DD_USED_NUM);
 856 
 857         dmu_buf_will_dirty(dd->dd_dbuf, tx);

------- new/usr/src/uts/common/fs/zfs/dsl_dir.c -------

 567         }
 568 
 569         if (used > quota) {
 570                 /* over quota */
 571                 myspace = 0;
 572         } else {
 573                 /*
 574                  * the lesser of the space provided by our parent and
 575                  * the space left in our quota
 576                  */
 577                 myspace = MIN(parentspace, quota - used);
 578         }
 579 
 580         mutex_exit(&dd->dd_lock);
 581 
 582         return (myspace);
 583 }
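
[Editor's note: a quick worked instance of the clamp above, with hypothetical numbers. A directory may consume the lesser of what its parent can still grant and what remains under its own quota; once used exceeds quota, myspace pins to zero.]

        /* hypothetical numbers, in bytes */
        uint64_t quota = 10ULL << 30;           /* 10 GiB quota */
        uint64_t used = 9ULL << 30;             /* 9 GiB already charged */
        uint64_t parentspace = 2ULL << 30;      /* parent grants 2 GiB */

        /* used <= quota, so: MIN(2 GiB, 10 GiB - 9 GiB) == 1 GiB writable */
        uint64_t myspace = MIN(parentspace, quota - used);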
 584 
 585 struct tempreserve {
 586         list_node_t tr_node;
 587         dsl_dir_t *tr_ds;
 588         uint64_t tr_size;
 589 };
 590 
 591 static int
 592 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 593     boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
 594     dmu_tx_t *tx, boolean_t first)
 595 {
 596         uint64_t txg = tx->tx_txg;
 597         uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
 598         uint64_t deferred = 0;
 599         struct tempreserve *tr;
 600         int retval = EDQUOT;
 601         int txgidx = txg & TXG_MASK;
 602         int i;
 603         uint64_t ref_rsrv = 0;
 604 
 605         ASSERT3U(txg, !=, 0);
 606         ASSERT3S(asize, >, 0);


 717         list_t *tr_list;
 718 
 719         if (asize == 0) {
 720                 *tr_cookiep = NULL;
 721                 return (0);
 722         }
 723 
 724         tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 725         list_create(tr_list, sizeof (struct tempreserve),
 726             offsetof(struct tempreserve, tr_node));
 727         ASSERT3S(asize, >, 0);
 728         ASSERT3S(fsize, >=, 0);
 729 
 730         err = arc_tempreserve_space(lsize, tx->tx_txg);
 731         if (err == 0) {
 732                 struct tempreserve *tr;
 733 
 734                 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 735                 tr->tr_size = lsize;
 736                 list_insert_tail(tr_list, tr);
 737         } else {
 738                 if (err == EAGAIN) {
 739                         /*
 740                          * If arc_memory_throttle() detected that pageout
 741                          * is running and we are low on memory, we delay new
 742                          * non-pageout transactions to give pageout an
 743                          * advantage.
 744                          *
 745                          * It is unfortunate to be delaying while the caller's
 746                          * locks are held.
 747                          */
 748                         txg_delay(dd->dd_pool, tx->tx_txg,
 749                             MSEC2NSEC(10), MSEC2NSEC(10));
 750                         err = SET_ERROR(ERESTART);
 751                 }
 752         }
 753 
 754         if (err == 0) {
 755                 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 756                     FALSE, asize > usize, tr_list, tx, TRUE);
 757         }
 758 
 759         if (err != 0)
 760                 dsl_dir_tempreserve_clear(tr_list, tx);
 761         else
 762                 *tr_cookiep = tr_list;
 763 
 764         return (err);
 765 }
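
[Editor's note: for context on how the cookie is consumed, the caller treats *tr_cookiep as opaque and hands it back to dsl_dir_tempreserve_clear() once the transaction no longer needs the reservation. A hedged sketch of that pairing; the caller-side shape is an assumption, not quoted from dmu_tx.c, and the local variables are assumed to be in scope.]

        void *tr_cookie;
        int err;

        err = dsl_dir_tempreserve_space(dd, lsize, asize, fsize, usize,
            &tr_cookie, tx);
        if (err == ERESTART) {
                /* write-throttled; back off and retry the assignment */
                return (err);
        } else if (err != 0) {
                return (err);   /* e.g. EDQUOT: over quota, fail the tx */
        }

        /* ... dirty data under the protection of this reservation ... */

        /* the reservation is only needed until the txg is decided */
        dsl_dir_tempreserve_clear(tr_cookie, tx);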
 766 
 767 /*
 768  * Clear a temporary reservation that we previously made with
 769  * dsl_dir_tempreserve_space().
 770  */
 771 void
 772 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
 773 {
 774         int txgidx = tx->tx_txg & TXG_MASK;
 775         list_t *tr_list = tr_cookie;
 776         struct tempreserve *tr;
 777 
 778         ASSERT3U(tx->tx_txg, !=, 0);
 779 
 780         if (tr_cookie == NULL)
 781                 return;
 782 
 783         while ((tr = list_head(tr_list)) != NULL) {
 784                 if (tr->tr_ds) {
 785                         mutex_enter(&tr->tr_ds->dd_lock);
 786                         ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 787                             tr->tr_size);
 788                         tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 789                         mutex_exit(&tr->tr_ds->dd_lock);
 790                 } else {
 791                         arc_tempreserve_clear(tr->tr_size);
 792                 }
 793                 list_remove(tr_list, tr);
 794                 kmem_free(tr, sizeof (struct tempreserve));
 795         }
 796 
 797         kmem_free(tr_list, sizeof (list_t));
 798 }
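
[Editor's note: the txgidx arithmetic above is the standard ZFS per-txg ring. sys/txg.h defines TXG_SIZE as 4 and TXG_MASK as TXG_SIZE - 1; since at most TXG_SIZE transaction groups can be in flight, txg & TXG_MASK selects a slot that cannot be reused before this txg syncs. An illustrative sketch; the names other than TXG_SIZE/TXG_MASK are hypothetical.]

#include <sys/types.h>

#define TXG_SIZE        4               /* next power of 2 >= the 3 in-flight states */
#define TXG_MASK        (TXG_SIZE - 1)

static uint64_t reserved[TXG_SIZE];     /* cf. dd_tempreserved[] */

static void
reserve_for_txg(uint64_t txg, uint64_t size)
{
        /* low bits of the monotonically increasing txg pick the slot */
        reserved[txg & TXG_MASK] += size;
}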
 799 
 800 /*
 801  * This should be called from open context when we think we're going to write
 802  * or free space, for example when dirtying data. Be conservative; it's okay
 803  * to write less space or free more, but we don't want to write more or free
 804  * less than the amount specified.
 805  */
 806 void
 807 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 808 {
 809         int64_t parent_space;
 810         uint64_t est_used;
 811 
 812         mutex_enter(&dd->dd_lock);
 813         if (space > 0)
 814                 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 815 
 816         est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
 817         parent_space = parent_delta(dd, est_used, space);
 818         mutex_exit(&dd->dd_lock);
 819 
 820         /* Make sure that we clean up dd_space_to* */
 821         dsl_dir_dirty(dd, tx);
 822 
 823         /* XXX this is potentially expensive and unnecessary... */
 824         if (parent_space && dd->dd_parent)
 825                 dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
 826 }
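
[Editor's note: what keeps this recursion from charging the whole ancestor chain on every call is parent_delta(), defined earlier in dsl_dir.c. It reports only the part of the change that rises above this directory's reservation, so once a change is fully absorbed by dd_reserved the delta is zero and the walk up the tree stops. Paraphrased below; close to, but not a verbatim quote of, the real helper.]

static int64_t
parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
{
        /* the parent accounts for MAX(used, reserved) of this child */
        uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
        uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);

        return (new_accounted - old_accounted);
}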
 827 
 828 /* call from syncing context when we actually write/free space for this dd */
 829 void
 830 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 831     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 832 {
 833         int64_t accounted_delta;
 834 
 835         /*
 836          * dsl_dataset_set_refreservation_sync_impl() calls this with
 837          * dd_lock held, so that it can atomically update
 838          * ds->ds_reserved and the dsl_dir accounting, so that
 839          * dsl_dataset_check_quota() can see dataset and dir accounting
 840          * consistently.
 841          */
 842         boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 843 
 844         ASSERT(dmu_tx_is_syncing(tx));
 845         ASSERT(type < DD_USED_NUM);
 846 
 847         dmu_buf_will_dirty(dd->dd_dbuf, tx);
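
[Editor's note: the listing is cut off here, but the needlock flag declared above follows the usual conditional-locking idiom: take dd_lock only if the caller does not already hold it, which is what lets dsl_dataset_set_refreservation_sync_impl() call in with the lock held. The general shape is sketched below; this is not the truncated remainder of the function.]

        if (needlock)
                mutex_enter(&dd->dd_lock);
        /* ... apply the used/compressed/uncompressed deltas to dd_phys ... */
        if (needlock)
                mutex_exit(&dd->dd_lock);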