4347 ZPL can use dmu_tx_assign(TXG_WAIT)
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>

--- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
[... 103 lines elided ...]
 104  104   *      can be freed, so the zp may point to freed memory.  Second, the last
 105  105   *      reference will call zfs_zinactive(), which may induce a lot of work --
 106  106   *      pushing cached pages (which acquires range locks) and syncing out
 107  107   *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 108  108   *      which could deadlock the system if you were already holding one.
 109  109   *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 110  110   *
 111  111   *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 112  112   *      as they can span dmu_tx_assign() calls.
 113  113   *
 114      - *  (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 115      - *      This is critical because we don't want to block while holding locks.
 116      - *      Note, in particular, that if a lock is sometimes acquired before
 117      - *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 118      - *      use a non-blocking assign can deadlock the system.  The scenario:
      114 + *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
      115 + *      dmu_tx_assign().  This is critical because we don't want to block
      116 + *      while holding locks.
 119  117   *
      118 + *      If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
      119 + *      reduces lock contention and CPU usage when we must wait (note that if
      120 + *      throughput is constrained by the storage, nearly every transaction
      121 + *      must wait).
      122 + *
      123 + *      Note, in particular, that if a lock is sometimes acquired before
      124 + *      the tx assigns, and sometimes after (e.g. z_lock), then failing
      125 + *      to use a non-blocking assign can deadlock the system.  The scenario:
      126 + *
 120  127   *      Thread A has grabbed a lock before calling dmu_tx_assign().
 121  128   *      Thread B is in an already-assigned tx, and blocks for this lock.
 122  129   *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 123  130   *      forever, because the previous txg can't quiesce until B's tx commits.
 124  131   *
 125  132   *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 126  133   *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 127  134   *      calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 128  135   *      to indicate that this operation has already called dmu_tx_wait().
 129  136   *      This will ensure that we don't retry forever, waiting a short bit
[... 591 lines elided ...]
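For reference, a minimal sketch of the retry protocol that item (4) above prescribes for call sites that do hold ZPL locks; the `waited` flag and the `top:` label are illustrative names, not code from this file:

	boolean_t waited = B_FALSE;
top:
	/* ... acquire ZPL locks ... */
	tx = dmu_tx_create(zfsvfs->z_os);
	/* ... dmu_tx_hold_*() everything this operation will dirty ... */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		/* ... drop all ZPL locks ... */
		if (error == ERESTART) {
			waited = B_TRUE;	/* subsequent assigns pass TXG_WAITED */
			dmu_tx_wait(tx);	/* wait with no locks held */
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (error);
	}
	/* ... modify the held objects, then dmu_tx_commit(tx) ... */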
 721  728          end_size = MAX(zp->z_size, woff + n);
 722  729  
 723  730          /*
 724  731           * Write the file in reasonable size chunks.  Each chunk is written
 725  732           * in a separate transaction; this keeps the intent log records small
 726  733           * and allows us to do more fine-grained space accounting.
 727  734           */
 728  735          while (n > 0) {
 729  736                  abuf = NULL;
 730  737                  woff = uio->uio_loffset;
 731      -again:
 732  738                  if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 733  739                      zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 734  740                          if (abuf != NULL)
 735  741                                  dmu_return_arcbuf(abuf);
 736  742                          error = SET_ERROR(EDQUOT);
 737  743                          break;
 738  744                  }
 739  745  
 740  746                  if (xuio && abuf == NULL) {
 741  747                          ASSERT(i_iov < iovcnt);
[... 31 lines elided ...]
 773  779                          ASSERT(cbytes == max_blksz);
 774  780                  }
 775  781  
 776  782                  /*
 777  783                   * Start a transaction.
 778  784                   */
 779  785                  tx = dmu_tx_create(zfsvfs->z_os);
 780  786                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 781  787                  dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 782  788                  zfs_sa_upgrade_txholds(tx, zp);
 783      -                error = dmu_tx_assign(tx, TXG_NOWAIT);
      789 +                error = dmu_tx_assign(tx, TXG_WAIT);
 784  790                  if (error) {
 785      -                        if (error == ERESTART) {
 786      -                                dmu_tx_wait(tx);
 787      -                                dmu_tx_abort(tx);
 788      -                                goto again;
 789      -                        }
 790  791                          dmu_tx_abort(tx);
 791  792                          if (abuf != NULL)
 792  793                                  dmu_return_arcbuf(abuf);
 793  794                          break;
 794  795                  }
 795  796  
 796  797                  /*
 797  798                   * If zfs_range_lock() over-locked we grow the blocksize
 798  799                   * and then reduce the lock range.  This will only happen
 799  800                   * on the first iteration since zfs_range_reduce() will
[... 2236 lines elided ...]
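For context on why the explicit dmu_tx_wait()/goto loops can simply be deleted at converted call sites such as the one above: under TXG_WAIT, dmu_tx_assign() waits and retries internally and never returns ERESTART to the caller. A simplified sketch of that behavior in dmu_tx.c (assumed shape, details elided; not part of this webrev):

	int
	dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
	{
		int err;

		while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
			dmu_tx_unassign(tx);
			if (err != ERESTART || txg_how != TXG_WAIT)
				return (err);	/* TXG_NOWAIT callers see ERESTART */
			dmu_tx_wait(tx);	/* block until assignment can likely succeed */
		}
		txg_register_callbacks(tx);
		return (0);
	}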
3036 3037          if (attrzp) {
3037 3038                  dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3038 3039          }
3039 3040  
3040 3041          fuid_dirtied = zfsvfs->z_fuid_dirty;
3041 3042          if (fuid_dirtied)
3042 3043                  zfs_fuid_txhold(zfsvfs, tx);
3043 3044  
3044 3045          zfs_sa_upgrade_txholds(tx, zp);
3045 3046  
3046      -        err = dmu_tx_assign(tx, TXG_NOWAIT);
3047      -        if (err) {
3048      -                if (err == ERESTART)
3049      -                        dmu_tx_wait(tx);
     3047 +        err = dmu_tx_assign(tx, TXG_WAIT);
     3048 +        if (err)
3050 3049                  goto out;
3051      -        }
3052 3050  
3053 3051          count = 0;
3054 3052          /*
3055 3053           * Set each attribute requested.
3056 3054           * We group settings according to the locks they need to acquire.
3057 3055           *
3058 3056           * Note: you cannot set ctime directly, although it will be
3059 3057           * updated as a side-effect of calling this function.
3060 3058           */
3061 3059  
[... 1071 lines elided ...]
4133 4131                  if (trunc)
4134 4132                          pvn_write_done(trunc, flags);
4135 4133                  len = zp->z_size - off;
4136 4134          }
4137 4135  
4138 4136          if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4139 4137              zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4140 4138                  err = SET_ERROR(EDQUOT);
4141 4139                  goto out;
4142 4140          }
4143      -top:
4144 4141          tx = dmu_tx_create(zfsvfs->z_os);
4145 4142          dmu_tx_hold_write(tx, zp->z_id, off, len);
4146 4143  
4147 4144          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4148 4145          zfs_sa_upgrade_txholds(tx, zp);
4149      -        err = dmu_tx_assign(tx, TXG_NOWAIT);
     4146 +        err = dmu_tx_assign(tx, TXG_WAIT);
4150 4147          if (err != 0) {
4151      -                if (err == ERESTART) {
4152      -                        dmu_tx_wait(tx);
4153      -                        dmu_tx_abort(tx);
4154      -                        goto top;
4155      -                }
4156 4148                  dmu_tx_abort(tx);
4157 4149                  goto out;
4158 4150          }
4159 4151  
4160 4152          if (zp->z_blksz <= PAGESIZE) {
4161 4153                  caddr_t va = zfs_map_page(pp, S_READ);
4162 4154                  ASSERT3U(len, <=, PAGESIZE);
4163 4155                  dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4164 4156                  zfs_unmap_page(pp, va);
4165 4157          } else {
[... 1119 lines elided ...]