Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

@@ -121,11 +121,15 @@
  *      Thread B is in an already-assigned tx, and blocks for this lock.
  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *      forever, because the previous txg can't quiesce until B's tx commits.
  *
  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- *      then drop all locks, call dmu_tx_wait(), and try again.
+ *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
+ *      calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
+ *      to indicate that this operation has already called dmu_tx_wait().
+ *      This ensures that we don't retry forever, waiting only a short time
+ *      on each attempt.
  *
  *  (5) If the operation succeeded, generate the intent log entry for it
  *      before dropping locks.  This ensures that the ordering of events
  *      in the intent log matches the order in which they actually occurred.
  *      During ZIL replay the zfs_log_* functions will update the sequence

@@ -143,16 +147,17 @@
  * top:
  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may VN_HOLD())
  *      rw_enter(...);                  // grab any other locks you need
  *      tx = dmu_tx_create(...);        // get DMU tx
  *      dmu_tx_hold_*();                // hold each object you might modify
- *      error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
+ *      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
  *      if (error) {
  *              rw_exit(...);           // drop locks
  *              zfs_dirent_unlock(dl);  // unlock directory entry
  *              VN_RELE(...);           // release held vnodes
  *              if (error == ERESTART) {
+ *                      waited = B_TRUE;
  *                      dmu_tx_wait(tx);
  *                      dmu_tx_abort(tx);
  *                      goto top;
  *              }
  *              dmu_tx_abort(tx);       // abort DMU tx

@@ -1313,10 +1318,11 @@
         uid_t           uid;
         gid_t           gid = crgetgid(cr);
         zfs_acl_ids_t   acl_ids;
         boolean_t       fuid_dirtied;
         boolean_t       have_acl = B_FALSE;
+        boolean_t       waited = B_FALSE;
 
         /*
          * If we have an ephemeral id, ACL, or XVATTR then
          * make sure file system is at proper version
          */

@@ -1433,14 +1439,15 @@
                 if (!zfsvfs->z_use_sa &&
                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                             0, acl_ids.z_aclp->z_acl_bytes);
                 }
-                error = dmu_tx_assign(tx, TXG_NOWAIT);
+                error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
                 if (error) {
                         zfs_dirent_unlock(dl);
                         if (error == ERESTART) {
+                                waited = B_TRUE;
                                 dmu_tx_wait(tx);
                                 dmu_tx_abort(tx);
                                 goto top;
                         }
                         zfs_acl_ids_free(&acl_ids);

@@ -1568,10 +1575,11 @@
         uint64_t        txtype;
         pathname_t      *realnmp = NULL;
         pathname_t      realnm;
         int             error;
         int             zflg = ZEXISTS;
+        boolean_t       waited = B_FALSE;
 
         ZFS_ENTER(zfsvfs);
         ZFS_VERIFY_ZP(dzp);
         zilog = zfsvfs->z_log;
 

@@ -1656,17 +1664,18 @@
         mutex_exit(&zp->z_lock);
 
         /* charge as an update -- would be nice not to charge at all */
         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
-        error = dmu_tx_assign(tx, TXG_NOWAIT);
+        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 VN_RELE(vp);
                 if (xzp)
                         VN_RELE(ZTOV(xzp));
                 if (error == ERESTART) {
+                        waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
                 }
                 if (realnmp)

@@ -1796,10 +1805,11 @@
         ksid_t          *ksid;
         uid_t           uid;
         gid_t           gid = crgetgid(cr);
         zfs_acl_ids_t   acl_ids;
         boolean_t       fuid_dirtied;
+        boolean_t       waited = B_FALSE;
 
         ASSERT(vap->va_type == VDIR);
 
         /*
          * If we have an ephemeral id, ACL, or XVATTR then

@@ -1892,14 +1902,15 @@
         }
 
         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
             ZFS_SA_BASE_ATTR_SIZE);
 
-        error = dmu_tx_assign(tx, TXG_NOWAIT);
+        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 if (error == ERESTART) {
+                        waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
                 }
                 zfs_acl_ids_free(&acl_ids);

@@ -1971,10 +1982,11 @@
         zilog_t         *zilog;
         zfs_dirlock_t   *dl;
         dmu_tx_t        *tx;
         int             error;
         int             zflg = ZEXISTS;
+        boolean_t       waited = B_FALSE;
 
         ZFS_ENTER(zfsvfs);
         ZFS_VERIFY_ZP(dzp);
         zilog = zfsvfs->z_log;
 

@@ -2026,17 +2038,18 @@
         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
         zfs_sa_upgrade_txholds(tx, zp);
         zfs_sa_upgrade_txholds(tx, dzp);
-        error = dmu_tx_assign(tx, TXG_NOWAIT);
+        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 rw_exit(&zp->z_parent_lock);
                 rw_exit(&zp->z_name_lock);
                 zfs_dirent_unlock(dl);
                 VN_RELE(vp);
                 if (error == ERESTART) {
+                        waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
                 }
                 dmu_tx_abort(tx);

@@ -3360,10 +3373,11 @@
         dmu_tx_t        *tx;
         zfs_zlock_t     *zl;
         int             cmp, serr, terr;
         int             error = 0;
         int             zflg = 0;
+        boolean_t       waited = B_FALSE;
 
         ZFS_ENTER(zfsvfs);
         ZFS_VERIFY_ZP(sdzp);
         zilog = zfsvfs->z_log;
 

@@ -3597,11 +3611,11 @@
                 zfs_sa_upgrade_txholds(tx, tzp);
         }
 
         zfs_sa_upgrade_txholds(tx, szp);
         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-        error = dmu_tx_assign(tx, TXG_NOWAIT);
+        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 if (zl != NULL)
                         zfs_rename_unlock(&zl);
                 zfs_dirent_unlock(sdl);
                 zfs_dirent_unlock(tdl);

@@ -3611,10 +3625,11 @@
 
                 VN_RELE(ZTOV(szp));
                 if (tzp)
                         VN_RELE(ZTOV(tzp));
                 if (error == ERESTART) {
+                        waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
                 }
                 dmu_tx_abort(tx);

@@ -3716,10 +3731,11 @@
         int             error;
         int             zflg = ZNEW;
         zfs_acl_ids_t   acl_ids;
         boolean_t       fuid_dirtied;
         uint64_t        txtype = TX_SYMLINK;
+        boolean_t       waited = B_FALSE;
 
         ASSERT(vap->va_type == VLNK);
 
         ZFS_ENTER(zfsvfs);
         ZFS_VERIFY_ZP(dzp);

@@ -3778,14 +3794,15 @@
                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
                     acl_ids.z_aclp->z_acl_bytes);
         }
         if (fuid_dirtied)
                 zfs_fuid_txhold(zfsvfs, tx);
-        error = dmu_tx_assign(tx, TXG_NOWAIT);
+        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 if (error == ERESTART) {
+                        waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
                 }
                 zfs_acl_ids_free(&acl_ids);

@@ -3908,10 +3925,11 @@
         vnode_t         *realvp;
         int             error;
         int             zf = ZNEW;
         uint64_t        parent;
         uid_t           owner;
+        boolean_t       waited = B_FALSE;
 
         ASSERT(tdvp->v_type == VDIR);
 
         ZFS_ENTER(zfsvfs);
         ZFS_VERIFY_ZP(dzp);

@@ -3997,14 +4015,15 @@
         tx = dmu_tx_create(zfsvfs->z_os);
         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
         zfs_sa_upgrade_txholds(tx, szp);
         zfs_sa_upgrade_txholds(tx, dzp);
-        error = dmu_tx_assign(tx, TXG_NOWAIT);
+        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
         if (error) {
                 zfs_dirent_unlock(dl);
                 if (error == ERESTART) {
+                        waited = B_TRUE;
                         dmu_tx_wait(tx);
                         dmu_tx_abort(tx);
                         goto top;
                 }
                 dmu_tx_abort(tx);