37 #include <sys/sa.h>
38 #include <sys/sa_impl.h>
39 #include <sys/zfs_context.h>
40 #include <sys/varargs.h>
41
42 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
43 uint64_t arg1, uint64_t arg2);
44
45
46 dmu_tx_t *
47 dmu_tx_create_dd(dsl_dir_t *dd)
48 {
49 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
50 tx->tx_dir = dd;
51 if (dd != NULL)
52 tx->tx_pool = dd->dd_pool;
53 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
54 offsetof(dmu_tx_hold_t, txh_node));
55 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
56 offsetof(dmu_tx_callback_t, dcb_node));
57 tx->tx_start = gethrtime();
58 #ifdef ZFS_DEBUG
59 refcount_create(&tx->tx_space_written);
60 refcount_create(&tx->tx_space_freed);
61 #endif
62 return (tx);
63 }
64
65 dmu_tx_t *
66 dmu_tx_create(objset_t *os)
67 {
68 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
69 tx->tx_objset = os;
70 tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
71 return (tx);
72 }
73
74 dmu_tx_t *
75 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
76 {
77 dmu_tx_t *tx = dmu_tx_create_dd(NULL);
581 }
582 txh->txh_space_tofree += space;
583 txh->txh_space_tounref += unref;
584 }
585
586 void
587 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
588 {
589 dmu_tx_hold_t *txh;
590 dnode_t *dn;
591 int err;
592 zio_t *zio;
593
594 ASSERT(tx->tx_txg == 0);
595
596 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
597 object, THT_FREE, off, len);
598 if (txh == NULL)
599 return;
600 dn = txh->txh_dnode;
601 dmu_tx_count_dnode(txh);
602
603 if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
604 return;
605 if (len == DMU_OBJECT_END)
606 len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
607
608
609 /*
610 * For i/o error checking, we read the first and last level-0
611 * blocks if they are not aligned, and all the level-1 blocks.
612 *
613 * Note: dbuf_free_range() assumes that we have not instantiated
614 * any level-0 dbufs that will be completely freed. Therefore we must
615 * exercise care to not read or count the first and last blocks
616 * if they are blocksize-aligned.
617 */
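/*
 * Illustrative example (assuming 128K data blocks; values are made up
 * for illustration): freeing off=64K, len=256K counts a write of both
 * the first and last blocks, since neither 64K nor 320K is
 * 128K-aligned; freeing off=0, len=256K counts neither, since both
 * edges are block-aligned.
 */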
618 if (dn->dn_datablkshift == 0) {
619 if (off != 0 || len < dn->dn_datablksz)
620 dmu_tx_count_write(txh, off, len);
621 } else {
622 /* first block will be modified if it is not aligned */
623 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
624 dmu_tx_count_write(txh, off, 1);
625 /* last block will be modified if it is not aligned */
626 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
627 dmu_tx_count_write(txh, off+len, 1);
895 break;
896 case THT_NEWOBJECT:
897 match_object = TRUE;
898 break;
899 default:
900 ASSERT(!"bad txh_type");
901 }
902 }
903 if (match_object && match_offset) {
904 DB_DNODE_EXIT(db);
905 return;
906 }
907 }
908 DB_DNODE_EXIT(db);
909 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
910 (u_longlong_t)db->db.db_object, db->db_level,
911 (u_longlong_t)db->db_blkid);
912 }
913 #endif
914
915 /*
916 * If we can't do even 10 iops (100ms of delay per transaction), something
917 * is wrong; let dirty data reach zfs_dirty_data_max and block in dmu_tx_wait().
918 */
919 hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
920 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
921
922 /*
923 * We delay transactions when we've determined that the backend storage
924 * isn't able to accommodate the rate of incoming writes.
925 *
926 * If there is already a transaction waiting, we delay relative to when
927 * that transaction finishes waiting. This way the calculated min_time
928 * is independent of the number of threads concurrently executing
929 * transactions.
930 *
931 * If we are the only waiter, wait relative to when the transaction
932 * started, rather than the current time. This credits the transaction for
933 * "time already served", e.g. reading indirect blocks.
934 *
935 * The minimum time for a transaction to take is calculated as:
936 * min_time = scale * (dirty - min) / (max - dirty)
937 * min_time is then capped at zfs_delay_max_ns.
938 *
939 * The delay has two degrees of freedom that can be adjusted via tunables.
940 * The percentage of dirty data at which we start to delay is defined by
941 * zfs_delay_min_dirty_percent. This should typically be at or above
942 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
943 * delay after writing at full speed has failed to keep up with the incoming
944 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
945 * speaking, this variable determines the amount of delay at the midpoint of
946 * the curve.
947 *
948 * delay
949 * 10ms +-------------------------------------------------------------*+
950 * | *|
951 * 9ms + *+
952 * | *|
953 * 8ms + *+
954 * | * |
955 * 7ms + * +
956 * | * |
957 * 6ms + * +
958 * | * |
959 * 5ms + * +
960 * | * |
961 * 4ms + * +
962 * | * |
963 * 3ms + * +
964 * | * |
965 * 2ms + (midpoint) * +
966 * | | ** |
967 * 1ms + v *** +
968 * | zfs_delay_scale ----------> ******** |
969 * 0 +-------------------------------------*********----------------+
970 * 0% <- zfs_dirty_data_max -> 100%
971 *
972 * Note that since the delay is added to the outstanding time remaining on the
973 * most recent transaction, the delay is effectively the inverse of IOPS.
974 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
975 * was chosen such that small changes in the amount of accumulated dirty data
976 * in the first 3/4 of the curve yield relatively small differences in the
977 * amount of delay.
978 *
979 * The effects can be easier to understand when the amount of delay is
980 * represented on a log scale:
981 *
982 * delay
983 * 100ms +-------------------------------------------------------------++
984 * + +
985 * | |
986 * + *+
987 * 10ms + *+
988 * + ** +
989 * | (midpoint) ** |
990 * + | ** +
991 * 1ms + v **** +
992 * + zfs_delay_scale ----------> ***** +
993 * | **** |
994 * + **** +
995 * 100us + ** +
996 * + * +
997 * | * |
998 * + * +
999 * 10us + * +
1000 * + +
1001 * | |
1002 * + +
1003 * +--------------------------------------------------------------+
1004 * 0% <- zfs_dirty_data_max -> 100%
1005 *
1006 * Note here that only as the amount of dirty data approaches its limit does
1007 * the delay start to increase rapidly. The goal of a properly tuned system
1008 * should be to keep the amount of dirty data out of that range by first
1009 * ensuring that the appropriate limits are set for the I/O scheduler to reach
1010 * optimal throughput on the backend storage, and then by changing the value
1011 * of zfs_delay_scale to increase the steepness of the curve.
1012 */
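/*
 * Worked example (illustrative values, not necessarily the defaults):
 * with zfs_delay_min_dirty_percent = 60 and zfs_delay_scale = 500000,
 * at the midpoint of the curve (80% of zfs_dirty_data_max dirty)
 * min_time = 500000 * (80 - 60) / (100 - 80) = 500000ns = 500us, i.e.
 * roughly 2000 delayed transactions per second. At 90% dirty the same
 * formula gives 1.5ms (about 667 tx/s) and at 95% it gives 3.5ms,
 * showing how steeply the delay grows near the limit before it is
 * capped at zfs_delay_max_ns.
 */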
1013 static void
1014 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
1015 {
1016 dsl_pool_t *dp = tx->tx_pool;
1017 uint64_t delay_min_bytes =
1018 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
1019 hrtime_t wakeup, min_tx_time, now;
1020
1021 if (dirty <= delay_min_bytes)
1022 return;
1023
1024 /*
1025 * The caller has already waited until we are under the max.
1026 * We make them pass us the amount of dirty data so we don't
1027 * have to handle the case of it being >= the max, which could
1028 * cause a divide-by-zero if it's == the max.
1029 */
1030 ASSERT3U(dirty, <, zfs_dirty_data_max);
1031
1032 now = gethrtime();
1033 min_tx_time = zfs_delay_scale *
1034 (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
1035 if (now > tx->tx_start + min_tx_time)
1036 return;
1037
1038 min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
1039
1040 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
1041 uint64_t, min_tx_time);
1042
1043 mutex_enter(&dp->dp_lock);
1044 wakeup = MAX(tx->tx_start + min_tx_time,
1045 dp->dp_last_wakeup + min_tx_time);
1046 dp->dp_last_wakeup = wakeup;
1047 mutex_exit(&dp->dp_lock);
1048
1049 #ifdef _KERNEL
1050 mutex_enter(&curthread->t_delay_lock);
1051 while (cv_timedwait_hires(&curthread->t_delay_cv,
1052 &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
1053 CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
1054 continue;
1055 mutex_exit(&curthread->t_delay_lock);
1056 #else
1057 hrtime_t delta = wakeup - gethrtime();
1058 struct timespec ts;
1059 ts.tv_sec = delta / NANOSEC;
1060 ts.tv_nsec = delta % NANOSEC;
1061 (void) nanosleep(&ts, NULL);
1062 #endif
1063 }
1064
1065 static int
1066 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
1067 {
1068 dmu_tx_hold_t *txh;
1069 spa_t *spa = tx->tx_pool->dp_spa;
1070 uint64_t memory, asize, fsize, usize;
1071 uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
1072
1073 ASSERT0(tx->tx_txg);
1074
1075 if (tx->tx_err)
1076 return (tx->tx_err);
1077
1078 if (spa_suspended(spa)) {
1079 /*
1080 * If the user has indicated a blocking failure mode
1081 * then return ERESTART which will block in dmu_tx_wait().
1082 * Otherwise, return EIO so that an error can get
1083 * propagated back to the VOP calls.
1084 *
1085 * Note that we always honor the txg_how flag regardless
1086 * of the failuremode setting.
1087 */
1088 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1089 txg_how != TXG_WAIT)
1090 return (SET_ERROR(EIO));
1091
1092 return (SET_ERROR(ERESTART));
1093 }
1094
1095 if (!tx->tx_waited &&
1096 dsl_pool_need_dirty_delay(tx->tx_pool)) {
1097 tx->tx_wait_dirty = B_TRUE;
1098 return (SET_ERROR(ERESTART));
1099 }
1100
1101 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1102 tx->tx_needassign_txh = NULL;
1103
1104 /*
1105 * NB: No error returns are allowed after txg_hold_open, but
1106 * before processing the dnode holds, due to the
1107 * dmu_tx_unassign() logic.
1108 */
1109
1110 towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
1111 for (txh = list_head(&tx->tx_holds); txh;
1112 txh = list_next(&tx->tx_holds, txh)) {
1113 dnode_t *dn = txh->txh_dnode;
1114 if (dn != NULL) {
1115 mutex_enter(&dn->dn_mtx);
1116 if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1117 mutex_exit(&dn->dn_mtx);
1118 tx->tx_needassign_txh = txh;
1119 return (SET_ERROR(ERESTART));
1120 }
1205 mutex_exit(&dn->dn_mtx);
1206 }
1207
1208 txg_rele_to_sync(&tx->tx_txgh);
1209
1210 tx->tx_lasttried_txg = tx->tx_txg;
1211 tx->tx_txg = 0;
1212 }
1213
1214 /*
1215 * Assign tx to a transaction group. txg_how can be one of:
1216 *
1217 * (1) TXG_WAIT. If the current open txg is full, waits until there's
1218 * a new one. This should be used when you're not holding locks.
1219 * It will only fail if we're truly out of space (or over quota).
1220 *
1221 * (2) TXG_NOWAIT. If we can't assign into the current open txg without
1222 * blocking, returns immediately with ERESTART. This should be used
1223 * whenever you're holding locks. On an ERESTART error, the caller
1224 * should drop locks, do a dmu_tx_wait(tx), and try again.
1225 *
1226 * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
1227 * has already been called on behalf of this operation (though
1228 * most likely on a different tx).
1229 */
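/*
 * Typical caller pattern (a sketch only; the hold shown is illustrative
 * and error handling is simplified):
 *
 *	boolean_t waited = B_FALSE;
 * top:
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (err != 0) {
 *		if (err == ERESTART) {
 *			waited = B_TRUE;
 *			... drop locks ...
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	... dirty buffers under tx ...
 *	dmu_tx_commit(tx);
 */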
1230 int
1231 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1232 {
1233 int err;
1234
1235 ASSERT(tx->tx_txg == 0);
1236 ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1237 txg_how == TXG_WAITED);
1238 ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1239
1240 /* If we might wait, we must not hold the config lock. */
1241 ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1242
1243 if (txg_how == TXG_WAITED)
1244 tx->tx_waited = B_TRUE;
1245
1246 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1247 dmu_tx_unassign(tx);
1248
1249 if (err != ERESTART || txg_how != TXG_WAIT)
1250 return (err);
1251
1252 dmu_tx_wait(tx);
1253 }
1254
1255 txg_rele_to_quiesce(&tx->tx_txgh);
1256
1257 return (0);
1258 }
1259
1260 void
1261 dmu_tx_wait(dmu_tx_t *tx)
1262 {
1263 spa_t *spa = tx->tx_pool->dp_spa;
1264 dsl_pool_t *dp = tx->tx_pool;
1265
1266 ASSERT(tx->tx_txg == 0);
1267 ASSERT(!dsl_pool_config_held(tx->tx_pool));
1268
1269 if (tx->tx_wait_dirty) {
1270 /*
1271 * dmu_tx_try_assign() has determined that we need to wait
1272 * because we've consumed much or all of the dirty buffer
1273 * space.
1274 */
1275 mutex_enter(&dp->dp_lock);
1276 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1277 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1278 uint64_t dirty = dp->dp_dirty_total;
1279 mutex_exit(&dp->dp_lock);
1280
1281 dmu_tx_delay(tx, dirty);
1282
1283 tx->tx_wait_dirty = B_FALSE;
1284
1285 /*
1286 * Note: setting tx_waited only has effect if the caller
1287 * used TXG_WAIT. Otherwise they are going to destroy
1288 * this tx and try again. The common case, zfs_write(),
1289 * uses TXG_WAIT.
1290 */
1291 tx->tx_waited = B_TRUE;
1292 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1293 /*
1294 * If the pool is suspended we need to wait until it
1295 * is resumed. Note that it's possible that the pool
1296 * has become active after this thread has tried to
1297 * obtain a tx. If that's the case then tx_lasttried_txg
1298 * would not have been set.
1299 */
1300 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1301 } else if (tx->tx_needassign_txh) {
1302 /*
1303 * A dnode is assigned to the quiescing txg. Wait for its
1304 * transaction to complete.
1305 */
1306 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1307
1308 mutex_enter(&dn->dn_mtx);
1309 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1310 cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1311 mutex_exit(&dn->dn_mtx);
1312 tx->tx_needassign_txh = NULL;
1313 } else {
1314 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1315 }
1316 }
1317
1318 void
1319 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1320 {
1321 #ifdef ZFS_DEBUG
1322 if (tx->tx_dir == NULL || delta == 0)
1323 return;
1324
1325 if (delta > 0) {