Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
          +++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
↓ open down ↓ 115 lines elided ↑ open up ↑
 116  116   *      Note, in particular, that if a lock is sometimes acquired before
 117  117   *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 118  118   *      use a non-blocking assign can deadlock the system.  The scenario:
 119  119   *
 120  120   *      Thread A has grabbed a lock before calling dmu_tx_assign().
 121  121   *      Thread B is in an already-assigned tx, and blocks for this lock.
 122  122   *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 123  123   *      forever, because the previous txg can't quiesce until B's tx commits.
 124  124   *
 125  125   *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 126      - *      then drop all locks, call dmu_tx_wait(), and try again.
      126 + *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
      127 + *      calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
      128 + *      to indicate that this operation has already called dmu_tx_wait().
      129 + *      This ensures that we don't loop forever, retrying after only a
      130 + *      short wait each time.
 127  131   *
 128  132   *  (5) If the operation succeeded, generate the intent log entry for it
 129  133   *      before dropping locks.  This ensures that the ordering of events
 130  134   *      in the intent log matches the order in which they actually occurred.
 131  135   *      During ZIL replay the zfs_log_* functions will update the sequence
 132  136   *      number to indicate the zil transaction has replayed.
 133  137   *
 134  138   *  (6) At the end of each vnode op, the DMU tx must always commit,
 135  139   *      regardless of whether there were any errors.
 136  140   *
↓ open down ↓ 1 lines elided ↑ open up ↑
 138  142   *      to ensure that synchronous semantics are provided when necessary.
 139  143   *
 140  144   * In general, this is how things should be ordered in each vnode op:
 141  145   *
 142  146   *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 143  147   * top:
 144  148   *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may VN_HOLD())
 145  149   *      rw_enter(...);                  // grab any other locks you need
 146  150   *      tx = dmu_tx_create(...);        // get DMU tx
 147  151   *      dmu_tx_hold_*();                // hold each object you might modify
 148      - *      error = dmu_tx_assign(tx, TXG_NOWAIT);  // try to assign
      152 + *      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 149  153   *      if (error) {
 150  154   *              rw_exit(...);           // drop locks
 151  155   *              zfs_dirent_unlock(dl);  // unlock directory entry
 152  156   *              VN_RELE(...);           // release held vnodes
 153  157   *              if (error == ERESTART) {
      158 + *                      waited = B_TRUE;
 154  159   *                      dmu_tx_wait(tx);
 155  160   *                      dmu_tx_abort(tx);
 156  161   *                      goto top;
 157  162   *              }
 158  163   *              dmu_tx_abort(tx);       // abort DMU tx
 159  164   *              ZFS_EXIT(zfsvfs);       // finished in zfs
 160  165   *              return (error);         // really out of space
 161  166   *      }
 162  167   *      error = do_real_work();         // do whatever this VOP does
 163  168   *      if (error == 0)
↓ open down ↓ 1144 lines elided ↑ open up ↑
1308 1313          objset_t        *os;
1309 1314          zfs_dirlock_t   *dl;
1310 1315          dmu_tx_t        *tx;
1311 1316          int             error;
1312 1317          ksid_t          *ksid;
1313 1318          uid_t           uid;
1314 1319          gid_t           gid = crgetgid(cr);
1315 1320          zfs_acl_ids_t   acl_ids;
1316 1321          boolean_t       fuid_dirtied;
1317 1322          boolean_t       have_acl = B_FALSE;
     1323 +        boolean_t       waited = B_FALSE;
1318 1324  
1319 1325          /*
1320 1326           * If we have an ephemeral id, ACL, or XVATTR then
1321 1327           * make sure file system is at proper version
1322 1328           */
1323 1329  
1324 1330          ksid = crgetsid(cr, KSID_OWNER);
1325 1331          if (ksid)
1326 1332                  uid = ksid_getid(ksid);
1327 1333          else
↓ open down ↓ 100 lines elided ↑ open up ↑
1428 1434                  fuid_dirtied = zfsvfs->z_fuid_dirty;
1429 1435                  if (fuid_dirtied)
1430 1436                          zfs_fuid_txhold(zfsvfs, tx);
1431 1437                  dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1432 1438                  dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1433 1439                  if (!zfsvfs->z_use_sa &&
1434 1440                      acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1435 1441                          dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1436 1442                              0, acl_ids.z_aclp->z_acl_bytes);
1437 1443                  }
1438      -                error = dmu_tx_assign(tx, TXG_NOWAIT);
     1444 +                error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1439 1445                  if (error) {
1440 1446                          zfs_dirent_unlock(dl);
1441 1447                          if (error == ERESTART) {
     1448 +                                waited = B_TRUE;
1442 1449                                  dmu_tx_wait(tx);
1443 1450                                  dmu_tx_abort(tx);
1444 1451                                  goto top;
1445 1452                          }
1446 1453                          zfs_acl_ids_free(&acl_ids);
1447 1454                          dmu_tx_abort(tx);
1448 1455                          ZFS_EXIT(zfsvfs);
1449 1456                          return (error);
1450 1457                  }
1451 1458                  zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
↓ open down ↓ 111 lines elided ↑ open up ↑
1563 1570          uint64_t        obj = 0;
1564 1571          zfs_dirlock_t   *dl;
1565 1572          dmu_tx_t        *tx;
1566 1573          boolean_t       may_delete_now, delete_now = FALSE;
1567 1574          boolean_t       unlinked, toobig = FALSE;
1568 1575          uint64_t        txtype;
1569 1576          pathname_t      *realnmp = NULL;
1570 1577          pathname_t      realnm;
1571 1578          int             error;
1572 1579          int             zflg = ZEXISTS;
     1580 +        boolean_t       waited = B_FALSE;
1573 1581  
1574 1582          ZFS_ENTER(zfsvfs);
1575 1583          ZFS_VERIFY_ZP(dzp);
1576 1584          zilog = zfsvfs->z_log;
1577 1585  
1578 1586          if (flags & FIGNORECASE) {
1579 1587                  zflg |= ZCILOOK;
1580 1588                  pn_alloc(&realnm);
1581 1589                  realnmp = &realnm;
1582 1590          }
↓ open down ↓ 68 lines elided ↑ open up ↑
1651 1659          }
1652 1660  
1653 1661          mutex_enter(&zp->z_lock);
1654 1662          if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1655 1663                  dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1656 1664          mutex_exit(&zp->z_lock);
1657 1665  
1658 1666          /* charge as an update -- would be nice not to charge at all */
1659 1667          dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1660 1668  
1661      -        error = dmu_tx_assign(tx, TXG_NOWAIT);
     1669 +        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1662 1670          if (error) {
1663 1671                  zfs_dirent_unlock(dl);
1664 1672                  VN_RELE(vp);
1665 1673                  if (xzp)
1666 1674                          VN_RELE(ZTOV(xzp));
1667 1675                  if (error == ERESTART) {
     1676 +                        waited = B_TRUE;
1668 1677                          dmu_tx_wait(tx);
1669 1678                          dmu_tx_abort(tx);
1670 1679                          goto top;
1671 1680                  }
1672 1681                  if (realnmp)
1673 1682                          pn_free(realnmp);
1674 1683                  dmu_tx_abort(tx);
1675 1684                  ZFS_EXIT(zfsvfs);
1676 1685                  return (error);
1677 1686          }
↓ open down ↓ 113 lines elided ↑ open up ↑
1791 1800          zfs_dirlock_t   *dl;
1792 1801          uint64_t        txtype;
1793 1802          dmu_tx_t        *tx;
1794 1803          int             error;
1795 1804          int             zf = ZNEW;
1796 1805          ksid_t          *ksid;
1797 1806          uid_t           uid;
1798 1807          gid_t           gid = crgetgid(cr);
1799 1808          zfs_acl_ids_t   acl_ids;
1800 1809          boolean_t       fuid_dirtied;
     1810 +        boolean_t       waited = B_FALSE;
1801 1811  
1802 1812          ASSERT(vap->va_type == VDIR);
1803 1813  
1804 1814          /*
1805 1815           * If we have an ephemeral id, ACL, or XVATTR then
1806 1816           * make sure file system is at proper version
1807 1817           */
1808 1818  
1809 1819          ksid = crgetsid(cr, KSID_OWNER);
1810 1820          if (ksid)
↓ open down ↓ 76 lines elided ↑ open up ↑
1887 1897          if (fuid_dirtied)
1888 1898                  zfs_fuid_txhold(zfsvfs, tx);
1889 1899          if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1890 1900                  dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1891 1901                      acl_ids.z_aclp->z_acl_bytes);
1892 1902          }
1893 1903  
1894 1904          dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1895 1905              ZFS_SA_BASE_ATTR_SIZE);
1896 1906  
1897      -        error = dmu_tx_assign(tx, TXG_NOWAIT);
     1907 +        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1898 1908          if (error) {
1899 1909                  zfs_dirent_unlock(dl);
1900 1910                  if (error == ERESTART) {
     1911 +                        waited = B_TRUE;
1901 1912                          dmu_tx_wait(tx);
1902 1913                          dmu_tx_abort(tx);
1903 1914                          goto top;
1904 1915                  }
1905 1916                  zfs_acl_ids_free(&acl_ids);
1906 1917                  dmu_tx_abort(tx);
1907 1918                  ZFS_EXIT(zfsvfs);
1908 1919                  return (error);
1909 1920          }
1910 1921  
↓ open down ↓ 55 lines elided ↑ open up ↑
1966 1977  {
1967 1978          znode_t         *dzp = VTOZ(dvp);
1968 1979          znode_t         *zp;
1969 1980          vnode_t         *vp;
1970 1981          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1971 1982          zilog_t         *zilog;
1972 1983          zfs_dirlock_t   *dl;
1973 1984          dmu_tx_t        *tx;
1974 1985          int             error;
1975 1986          int             zflg = ZEXISTS;
     1987 +        boolean_t       waited = B_FALSE;
1976 1988  
1977 1989          ZFS_ENTER(zfsvfs);
1978 1990          ZFS_VERIFY_ZP(dzp);
1979 1991          zilog = zfsvfs->z_log;
1980 1992  
1981 1993          if (flags & FIGNORECASE)
1982 1994                  zflg |= ZCILOOK;
1983 1995  top:
1984 1996          zp = NULL;
1985 1997  
↓ open down ↓ 35 lines elided ↑ open up ↑
2021 2033           * with the treewalk and directory rename code.
2022 2034           */
2023 2035          rw_enter(&zp->z_parent_lock, RW_WRITER);
2024 2036  
2025 2037          tx = dmu_tx_create(zfsvfs->z_os);
2026 2038          dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2027 2039          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2028 2040          dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2029 2041          zfs_sa_upgrade_txholds(tx, zp);
2030 2042          zfs_sa_upgrade_txholds(tx, dzp);
2031      -        error = dmu_tx_assign(tx, TXG_NOWAIT);
     2043 +        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2032 2044          if (error) {
2033 2045                  rw_exit(&zp->z_parent_lock);
2034 2046                  rw_exit(&zp->z_name_lock);
2035 2047                  zfs_dirent_unlock(dl);
2036 2048                  VN_RELE(vp);
2037 2049                  if (error == ERESTART) {
     2050 +                        waited = B_TRUE;
2038 2051                          dmu_tx_wait(tx);
2039 2052                          dmu_tx_abort(tx);
2040 2053                          goto top;
2041 2054                  }
2042 2055                  dmu_tx_abort(tx);
2043 2056                  ZFS_EXIT(zfsvfs);
2044 2057                  return (error);
2045 2058          }
2046 2059  
2047 2060          error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
↓ open down ↓ 1307 lines elided ↑ open up ↑
3355 3368          znode_t         *sdzp = VTOZ(sdvp);
3356 3369          zfsvfs_t        *zfsvfs = sdzp->z_zfsvfs;
3357 3370          zilog_t         *zilog;
3358 3371          vnode_t         *realvp;
3359 3372          zfs_dirlock_t   *sdl, *tdl;
3360 3373          dmu_tx_t        *tx;
3361 3374          zfs_zlock_t     *zl;
3362 3375          int             cmp, serr, terr;
3363 3376          int             error = 0;
3364 3377          int             zflg = 0;
     3378 +        boolean_t       waited = B_FALSE;
3365 3379  
3366 3380          ZFS_ENTER(zfsvfs);
3367 3381          ZFS_VERIFY_ZP(sdzp);
3368 3382          zilog = zfsvfs->z_log;
3369 3383  
3370 3384          /*
3371 3385           * Make sure we have the real vp for the target directory.
3372 3386           */
3373 3387          if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3374 3388                  tdvp = realvp;
↓ open down ↓ 217 lines elided ↑ open up ↑
3592 3606                  dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3593 3607                  zfs_sa_upgrade_txholds(tx, tdzp);
3594 3608          }
3595 3609          if (tzp) {
3596 3610                  dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3597 3611                  zfs_sa_upgrade_txholds(tx, tzp);
3598 3612          }
3599 3613  
3600 3614          zfs_sa_upgrade_txholds(tx, szp);
3601 3615          dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3602      -        error = dmu_tx_assign(tx, TXG_NOWAIT);
     3616 +        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3603 3617          if (error) {
3604 3618                  if (zl != NULL)
3605 3619                          zfs_rename_unlock(&zl);
3606 3620                  zfs_dirent_unlock(sdl);
3607 3621                  zfs_dirent_unlock(tdl);
3608 3622  
3609 3623                  if (sdzp == tdzp)
3610 3624                          rw_exit(&sdzp->z_name_lock);
3611 3625  
3612 3626                  VN_RELE(ZTOV(szp));
3613 3627                  if (tzp)
3614 3628                          VN_RELE(ZTOV(tzp));
3615 3629                  if (error == ERESTART) {
     3630 +                        waited = B_TRUE;
3616 3631                          dmu_tx_wait(tx);
3617 3632                          dmu_tx_abort(tx);
3618 3633                          goto top;
3619 3634                  }
3620 3635                  dmu_tx_abort(tx);
3621 3636                  ZFS_EXIT(zfsvfs);
3622 3637                  return (error);
3623 3638          }
3624 3639  
3625 3640          if (tzp)        /* Attempt to remove the existing target */
↓ open down ↓ 85 lines elided ↑ open up ↑
3711 3726          zfs_dirlock_t   *dl;
3712 3727          dmu_tx_t        *tx;
3713 3728          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3714 3729          zilog_t         *zilog;
3715 3730          uint64_t        len = strlen(link);
3716 3731          int             error;
3717 3732          int             zflg = ZNEW;
3718 3733          zfs_acl_ids_t   acl_ids;
3719 3734          boolean_t       fuid_dirtied;
3720 3735          uint64_t        txtype = TX_SYMLINK;
     3736 +        boolean_t       waited = B_FALSE;
3721 3737  
3722 3738          ASSERT(vap->va_type == VLNK);
3723 3739  
3724 3740          ZFS_ENTER(zfsvfs);
3725 3741          ZFS_VERIFY_ZP(dzp);
3726 3742          zilog = zfsvfs->z_log;
3727 3743  
3728 3744          if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3729 3745              NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3730 3746                  ZFS_EXIT(zfsvfs);
↓ open down ↓ 42 lines elided ↑ open up ↑
3773 3789          dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3774 3790          dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3775 3791              ZFS_SA_BASE_ATTR_SIZE + len);
3776 3792          dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3777 3793          if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3778 3794                  dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3779 3795                      acl_ids.z_aclp->z_acl_bytes);
3780 3796          }
3781 3797          if (fuid_dirtied)
3782 3798                  zfs_fuid_txhold(zfsvfs, tx);
3783      -        error = dmu_tx_assign(tx, TXG_NOWAIT);
     3799 +        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3784 3800          if (error) {
3785 3801                  zfs_dirent_unlock(dl);
3786 3802                  if (error == ERESTART) {
     3803 +                        waited = B_TRUE;
3787 3804                          dmu_tx_wait(tx);
3788 3805                          dmu_tx_abort(tx);
3789 3806                          goto top;
3790 3807                  }
3791 3808                  zfs_acl_ids_free(&acl_ids);
3792 3809                  dmu_tx_abort(tx);
3793 3810                  ZFS_EXIT(zfsvfs);
3794 3811                  return (error);
3795 3812          }
3796 3813  
↓ open down ↓ 106 lines elided ↑ open up ↑
3903 3920          znode_t         *tzp, *szp;
3904 3921          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3905 3922          zilog_t         *zilog;
3906 3923          zfs_dirlock_t   *dl;
3907 3924          dmu_tx_t        *tx;
3908 3925          vnode_t         *realvp;
3909 3926          int             error;
3910 3927          int             zf = ZNEW;
3911 3928          uint64_t        parent;
3912 3929          uid_t           owner;
     3930 +        boolean_t       waited = B_FALSE;
3913 3931  
3914 3932          ASSERT(tdvp->v_type == VDIR);
3915 3933  
3916 3934          ZFS_ENTER(zfsvfs);
3917 3935          ZFS_VERIFY_ZP(dzp);
3918 3936          zilog = zfsvfs->z_log;
3919 3937  
3920 3938          if (VOP_REALVP(svp, &realvp, ct) == 0)
3921 3939                  svp = realvp;
3922 3940  
↓ open down ↓ 69 lines elided ↑ open up ↑
3992 4010          if (error) {
3993 4011                  ZFS_EXIT(zfsvfs);
3994 4012                  return (error);
3995 4013          }
3996 4014  
3997 4015          tx = dmu_tx_create(zfsvfs->z_os);
3998 4016          dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3999 4017          dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4000 4018          zfs_sa_upgrade_txholds(tx, szp);
4001 4019          zfs_sa_upgrade_txholds(tx, dzp);
4002      -        error = dmu_tx_assign(tx, TXG_NOWAIT);
     4020 +        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4003 4021          if (error) {
4004 4022                  zfs_dirent_unlock(dl);
4005 4023                  if (error == ERESTART) {
     4024 +                        waited = B_TRUE;
4006 4025                          dmu_tx_wait(tx);
4007 4026                          dmu_tx_abort(tx);
4008 4027                          goto top;
4009 4028                  }
4010 4029                  dmu_tx_abort(tx);
4011 4030                  ZFS_EXIT(zfsvfs);
4012 4031                  return (error);
4013 4032          }
4014 4033  
4015 4034          error = zfs_link_create(dl, szp, tx, 0);
↓ open down ↓ 1244 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX