4347 ZPL can use dmu_tx_assign(TXG_WAIT)
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
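
------- usr/src/uts/common/fs/zfs/zfs_vnops.c (old) -------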


  94  *
  95  *  (1) A check must be made in each zfs thread for a mounted file system.
  96  *      This is done, avoiding races, using ZFS_ENTER(zfsvfs).
  97  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  98  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  99  *      can return EIO from the calling function.
 100  *
 101  *  (2) VN_RELE() should always be the last thing except for zil_commit()
 102  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
 103  *      First, if it's the last reference, the vnode/znode
 104  *      can be freed, so the zp may point to freed memory.  Second, the last
 105  *      reference will call zfs_zinactive(), which may induce a lot of work --
 106  *      pushing cached pages (which acquires range locks) and syncing out
 107  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 108  *      which could deadlock the system if you were already holding one.
 109  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 110  *
 111  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 112  *      as they can span dmu_tx_assign() calls.
 113  *
 114  *  (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 115  *      This is critical because we don't want to block while holding locks.
 116  *      Note, in particular, that if a lock is sometimes acquired before
 117  *      the tx assigns, and sometimes after (e.g. z_lock), then failing to
 118  *      use a non-blocking assign can deadlock the system.  The scenario:
 119  *
 120  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 121  *      Thread B is in an already-assigned tx, and blocks for this lock.
 122  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 123  *      forever, because the previous txg can't quiesce until B's tx commits.
 124  *
 125  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 126  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 127  *      calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 128  *      to indicate that this operation has already called dmu_tx_wait().
 129  *      This will ensure that we don't retry forever, waiting a short bit
 130  *      each time.
 131  *
 132  *  (5) If the operation succeeded, generate the intent log entry for it
 133  *      before dropping locks.  This ensures that the ordering of events
 134  *      in the intent log matches the order in which they actually occurred.
 135  *      During ZIL replay the zfs_log_* functions will update the sequence
  136  *      number to indicate the zil transaction has been replayed.
 137  *
 138  *  (6) At the end of each vnode op, the DMU tx must always commit,


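Taken together, rules (1) through (6) prescribe a common skeleton for every ZPL vnode op under this non-blocking regime. The sketch below is illustrative only, not code from this webrev: some_lock, do_the_work(), and the zfs_log_write() call stand in for whatever locks, work, and zfs_log_* entry a given op actually uses, and the TXG_WAITED refinement from rule (4) is omitted for brevity.

        ZFS_ENTER(zfsvfs);                      /* rule (1): EIO if unmounted */
        ZFS_VERIFY_ZP(zp);
top:
        rw_enter(&some_lock, RW_WRITER);        /* rule (3): locks before assign */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        error = dmu_tx_assign(tx, TXG_NOWAIT);  /* rule (4): must not block */
        if (error) {
                rw_exit(&some_lock);            /* drop locks before waiting */
                if (error == ERESTART) {
                        dmu_tx_wait(tx);        /* wait for the stuck txg... */
                        dmu_tx_abort(tx);
                        goto top;               /* ...then retry from scratch */
                }
                dmu_tx_abort(tx);               /* hard error, e.g. ENOSPC */
                ZFS_EXIT(zfsvfs);
                return (error);
        }
        error = do_the_work(zp, tx);            /* the op's actual changes */
        if (error == 0)                         /* rule (5): log before unlock */
                zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, ioflag);
        dmu_tx_commit(tx);                      /* rule (6): commit either way */
        rw_exit(&some_lock);
        VN_RELE(vp);                            /* rule (2): any held vnodes */
        zil_commit(zilog, zp->z_id);            /* only when synchronous */
        ZFS_EXIT(zfsvfs);                       /* rule (1) again */
        return (error);
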
 711                 ZFS_EXIT(zfsvfs);
 712                 return (SET_ERROR(EFBIG));
 713         }
 714 
 715         if ((woff + n) > limit || woff > (limit - n))
 716                 n = limit - woff;
 717 
 718         /* Will this write extend the file length? */
 719         write_eof = (woff + n > zp->z_size);
 720 
 721         end_size = MAX(zp->z_size, woff + n);
 722 
 723         /*
 724          * Write the file in reasonable size chunks.  Each chunk is written
 725          * in a separate transaction; this keeps the intent log records small
 726          * and allows us to do more fine-grained space accounting.
 727          */
 728         while (n > 0) {
 729                 abuf = NULL;
 730                 woff = uio->uio_loffset;
 731 again:
 732                 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 733                     zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 734                         if (abuf != NULL)
 735                                 dmu_return_arcbuf(abuf);
 736                         error = SET_ERROR(EDQUOT);
 737                         break;
 738                 }
 739 
 740                 if (xuio && abuf == NULL) {
 741                         ASSERT(i_iov < iovcnt);
 742                         aiov = &iovp[i_iov];
 743                         abuf = dmu_xuio_arcbuf(xuio, i_iov);
 744                         dmu_xuio_clear(xuio, i_iov);
 745                         DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 746                             iovec_t *, aiov, arc_buf_t *, abuf);
 747                         ASSERT((aiov->iov_base == abuf->b_data) ||
 748                             ((char *)aiov->iov_base - (char *)abuf->b_data +
 749                             aiov->iov_len == arc_buf_size(abuf)));
 750                         i_iov++;
 751                 } else if (abuf == NULL && n >= max_blksz &&


 763 
 764                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 765                             max_blksz);
 766                         ASSERT(abuf != NULL);
 767                         ASSERT(arc_buf_size(abuf) == max_blksz);
 768                         if (error = uiocopy(abuf->b_data, max_blksz,
 769                             UIO_WRITE, uio, &cbytes)) {
 770                                 dmu_return_arcbuf(abuf);
 771                                 break;
 772                         }
 773                         ASSERT(cbytes == max_blksz);
 774                 }
 775 
 776                 /*
 777                  * Start a transaction.
 778                  */
 779                 tx = dmu_tx_create(zfsvfs->z_os);
 780                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 781                 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 782                 zfs_sa_upgrade_txholds(tx, zp);
 783                 error = dmu_tx_assign(tx, TXG_NOWAIT);
 784                 if (error) {
 785                         if (error == ERESTART) {
 786                                 dmu_tx_wait(tx);
 787                                 dmu_tx_abort(tx);
 788                                 goto again;
 789                         }
 790                         dmu_tx_abort(tx);
 791                         if (abuf != NULL)
 792                                 dmu_return_arcbuf(abuf);
 793                         break;
 794                 }
 795 
 796                 /*
 797                  * If zfs_range_lock() over-locked we grow the blocksize
 798                  * and then reduce the lock range.  This will only happen
 799                  * on the first iteration since zfs_range_reduce() will
 800                  * shrink down r_len to the appropriate size.
 801                  */
 802                 if (rl->r_len == UINT64_MAX) {
 803                         uint64_t new_blksz;
 804 
 805                         if (zp->z_blksz > max_blksz) {
 806                                 ASSERT(!ISP2(zp->z_blksz));
 807                                 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 808                         } else {
 809                                 new_blksz = MIN(end_size, max_blksz);
 810                         }


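Note the order in zfs_write's ERESTART path above: dmu_tx_wait(tx) is called while the tx handle is still live, so the DMU knows which txg the failed assign was waiting on, and only then is the handle freed with dmu_tx_abort(tx) before jumping back to again:. Aborting first would free the handle and make the wait impossible.
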
3026                 mutex_exit(&zp->z_lock);
3027                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3028         } else {
3029                 if ((mask & AT_XVATTR) &&
3030                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3031                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3032                 else
3033                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3034         }
3035 
3036         if (attrzp) {
3037                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3038         }
3039 
3040         fuid_dirtied = zfsvfs->z_fuid_dirty;
3041         if (fuid_dirtied)
3042                 zfs_fuid_txhold(zfsvfs, tx);
3043 
3044         zfs_sa_upgrade_txholds(tx, zp);
3045 
3046         err = dmu_tx_assign(tx, TXG_NOWAIT);
3047         if (err) {
3048                 if (err == ERESTART)
3049                         dmu_tx_wait(tx);
3050                 goto out;
3051         }
3052 
3053         count = 0;
3054         /*
3055          * Set each attribute requested.
3056          * We group settings according to the locks they need to acquire.
3057          *
3058          * Note: you cannot set ctime directly, although it will be
3059          * updated as a side-effect of calling this function.
3060          */
3061 
3062 
3063         if (mask & (AT_UID|AT_GID|AT_MODE))
3064                 mutex_enter(&zp->z_acl_lock);
3065         mutex_enter(&zp->z_lock);
3066 
3067         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3068             &zp->z_pflags, sizeof (zp->z_pflags));
3069 
3070         if (attrzp) {
3071                 if (mask & (AT_UID|AT_GID|AT_MODE))


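zfs_setattr, by contrast, does not abort the tx at the assign site: on failure it calls dmu_tx_wait(tx) for the ERESTART case and jumps to out:, whose common cleanup code (not shown in this hunk) is responsible for aborting the tx and, on ERESTART, retrying the operation. The ordering rule is the same, though: the wait happens before the eventual abort.
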
4123         if (off >= zp->z_size) {
4124                 /* ignore all pages */
4125                 err = 0;
4126                 goto out;
4127         } else if (off + len > zp->z_size) {
4128                 int npages = btopr(zp->z_size - off);
4129                 page_t *trunc;
4130 
4131                 page_list_break(&pp, &trunc, npages);
4132                 /* ignore pages past end of file */
4133                 if (trunc)
4134                         pvn_write_done(trunc, flags);
4135                 len = zp->z_size - off;
4136         }
4137 
4138         if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4139             zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4140                 err = SET_ERROR(EDQUOT);
4141                 goto out;
4142         }
4143 top:
4144         tx = dmu_tx_create(zfsvfs->z_os);
4145         dmu_tx_hold_write(tx, zp->z_id, off, len);
4146 
4147         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4148         zfs_sa_upgrade_txholds(tx, zp);
4149         err = dmu_tx_assign(tx, TXG_NOWAIT);
4150         if (err != 0) {
4151                 if (err == ERESTART) {
4152                         dmu_tx_wait(tx);
4153                         dmu_tx_abort(tx);
4154                         goto top;
4155                 }
4156                 dmu_tx_abort(tx);
4157                 goto out;
4158         }
4159 
4160         if (zp->z_blksz <= PAGESIZE) {
4161                 caddr_t va = zfs_map_page(pp, S_READ);
4162                 ASSERT3U(len, <=, PAGESIZE);
4163                 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4164                 zfs_unmap_page(pp, va);
4165         } else {
4166                 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4167         }
4168 
4169         if (err == 0) {
4170                 uint64_t mtime[2], ctime[2];
4171                 sa_bulk_attr_t bulk[3];
4172                 int count = 0;
4173 
4174                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4175                     &mtime, 16);
4176                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,




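------- usr/src/uts/common/fs/zfs/zfs_vnops.c (new) -------
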
  94  *
  95  *  (1) A check must be made in each zfs thread for a mounted file system.
  96  *      This is done, avoiding races, using ZFS_ENTER(zfsvfs).
  97  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  98  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  99  *      can return EIO from the calling function.
 100  *
 101  *  (2) VN_RELE() should always be the last thing except for zil_commit()
 102  *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
 103  *      First, if it's the last reference, the vnode/znode
 104  *      can be freed, so the zp may point to freed memory.  Second, the last
 105  *      reference will call zfs_zinactive(), which may induce a lot of work --
 106  *      pushing cached pages (which acquires range locks) and syncing out
 107  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 108  *      which could deadlock the system if you were already holding one.
 109  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 110  *
 111  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 112  *      as they can span dmu_tx_assign() calls.
 113  *
 114  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 115  *      dmu_tx_assign().  This is critical because we don't want to block
 116  *      while holding locks.
 117  *
 118  *      If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 119  *      reduces lock contention and CPU usage when we must wait (note that if
 120  *      throughput is constrained by the storage, nearly every transaction
 121  *      must wait).
 122  *
 123  *      Note, in particular, that if a lock is sometimes acquired before
 124  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 125  *      to use a non-blocking assign can deadlock the system.  The scenario:
 126  *
 127  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 128  *      Thread B is in an already-assigned tx, and blocks for this lock.
 129  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 130  *      forever, because the previous txg can't quiesce until B's tx commits.
 131  *
 132  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 133  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 134  *      calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 135  *      to indicate that this operation has already called dmu_tx_wait().
 136  *      This will ensure that we don't retry forever, waiting a short bit
 137  *      each time.
 138  *
 139  *  (5) If the operation succeeded, generate the intent log entry for it
 140  *      before dropping locks.  This ensures that the ordering of events
 141  *      in the intent log matches the order in which they actually occurred.
 142  *      During ZIL replay the zfs_log_* functions will update the sequence
  143  *      number to indicate the zil transaction has been replayed.
 144  *
 145  *  (6) At the end of each vnode op, the DMU tx must always commit,


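Under the revised rule (4), an op that reaches dmu_tx_assign() holding nothing but ZFS_ENTER() can hand the waiting off to the DMU, which is exactly the change made in each hunk below. A minimal sketch of the resulting pattern, reusing the hypothetical names from the sketch above:

        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        error = dmu_tx_assign(tx, TXG_WAIT);    /* may sleep in txg_wait_open() */
        if (error) {
                /*
                 * TXG_WAIT never returns ERESTART; any failure here is a
                 * hard error such as ENOSPC, so the dmu_tx_wait()/retry
                 * dance and its "again:"/"top:" labels disappear.
                 */
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }
        error = do_the_work(zp, tx);
        if (error == 0)
                zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, ioflag);
        dmu_tx_commit(tx);
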
 718                 ZFS_EXIT(zfsvfs);
 719                 return (SET_ERROR(EFBIG));
 720         }
 721 
 722         if ((woff + n) > limit || woff > (limit - n))
 723                 n = limit - woff;
 724 
 725         /* Will this write extend the file length? */
 726         write_eof = (woff + n > zp->z_size);
 727 
 728         end_size = MAX(zp->z_size, woff + n);
 729 
 730         /*
 731          * Write the file in reasonable size chunks.  Each chunk is written
 732          * in a separate transaction; this keeps the intent log records small
 733          * and allows us to do more fine-grained space accounting.
 734          */
 735         while (n > 0) {
 736                 abuf = NULL;
 737                 woff = uio->uio_loffset;
 738                 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 739                     zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 740                         if (abuf != NULL)
 741                                 dmu_return_arcbuf(abuf);
 742                         error = SET_ERROR(EDQUOT);
 743                         break;
 744                 }
 745 
 746                 if (xuio && abuf == NULL) {
 747                         ASSERT(i_iov < iovcnt);
 748                         aiov = &iovp[i_iov];
 749                         abuf = dmu_xuio_arcbuf(xuio, i_iov);
 750                         dmu_xuio_clear(xuio, i_iov);
 751                         DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 752                             iovec_t *, aiov, arc_buf_t *, abuf);
 753                         ASSERT((aiov->iov_base == abuf->b_data) ||
 754                             ((char *)aiov->iov_base - (char *)abuf->b_data +
 755                             aiov->iov_len == arc_buf_size(abuf)));
 756                         i_iov++;
 757                 } else if (abuf == NULL && n >= max_blksz &&


 769 
 770                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 771                             max_blksz);
 772                         ASSERT(abuf != NULL);
 773                         ASSERT(arc_buf_size(abuf) == max_blksz);
 774                         if (error = uiocopy(abuf->b_data, max_blksz,
 775                             UIO_WRITE, uio, &cbytes)) {
 776                                 dmu_return_arcbuf(abuf);
 777                                 break;
 778                         }
 779                         ASSERT(cbytes == max_blksz);
 780                 }
 781 
 782                 /*
 783                  * Start a transaction.
 784                  */
 785                 tx = dmu_tx_create(zfsvfs->z_os);
 786                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 787                 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 788                 zfs_sa_upgrade_txholds(tx, zp);
 789                 error = dmu_tx_assign(tx, TXG_WAIT);
 790                 if (error) {
 791                         dmu_tx_abort(tx);
 792                         if (abuf != NULL)
 793                                 dmu_return_arcbuf(abuf);
 794                         break;
 795                 }
 796 
 797                 /*
 798                  * If zfs_range_lock() over-locked we grow the blocksize
 799                  * and then reduce the lock range.  This will only happen
 800                  * on the first iteration since zfs_range_reduce() will
 801                  * shrink down r_len to the appropriate size.
 802                  */
 803                 if (rl->r_len == UINT64_MAX) {
 804                         uint64_t new_blksz;
 805 
 806                         if (zp->z_blksz > max_blksz) {
 807                                 ASSERT(!ISP2(zp->z_blksz));
 808                                 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 809                         } else {
 810                                 new_blksz = MIN(end_size, max_blksz);
 811                         }


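With TXG_WAIT, zfs_write's error path collapses: dmu_tx_assign() now sleeps internally until a txg is open, so the only failures that reach this branch are hard errors, and cleanup reduces to dmu_tx_abort() plus returning the borrowed arc buffer. The again: label, which existed only to service the ERESTART retry, is gone as well.
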
3027                 mutex_exit(&zp->z_lock);
3028                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3029         } else {
3030                 if ((mask & AT_XVATTR) &&
3031                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3032                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3033                 else
3034                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3035         }
3036 
3037         if (attrzp) {
3038                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3039         }
3040 
3041         fuid_dirtied = zfsvfs->z_fuid_dirty;
3042         if (fuid_dirtied)
3043                 zfs_fuid_txhold(zfsvfs, tx);
3044 
3045         zfs_sa_upgrade_txholds(tx, zp);
3046 
3047         err = dmu_tx_assign(tx, TXG_WAIT);
3048         if (err)
3049                 goto out;
3050 
3051         count = 0;
3052         /*
3053          * Set each attribute requested.
3054          * We group settings according to the locks they need to acquire.
3055          *
3056          * Note: you cannot set ctime directly, although it will be
3057          * updated as a side-effect of calling this function.
3058          */
3059 
3060 
3061         if (mask & (AT_UID|AT_GID|AT_MODE))
3062                 mutex_enter(&zp->z_acl_lock);
3063         mutex_enter(&zp->z_lock);
3064 
3065         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3066             &zp->z_pflags, sizeof (zp->z_pflags));
3067 
3068         if (attrzp) {
3069                 if (mask & (AT_UID|AT_GID|AT_MODE))


4121         if (off >= zp->z_size) {
4122                 /* ignore all pages */
4123                 err = 0;
4124                 goto out;
4125         } else if (off + len > zp->z_size) {
4126                 int npages = btopr(zp->z_size - off);
4127                 page_t *trunc;
4128 
4129                 page_list_break(&pp, &trunc, npages);
4130                 /* ignore pages past end of file */
4131                 if (trunc)
4132                         pvn_write_done(trunc, flags);
4133                 len = zp->z_size - off;
4134         }
4135 
4136         if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4137             zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4138                 err = SET_ERROR(EDQUOT);
4139                 goto out;
4140         }
4141         tx = dmu_tx_create(zfsvfs->z_os);
4142         dmu_tx_hold_write(tx, zp->z_id, off, len);
4143 
4144         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4145         zfs_sa_upgrade_txholds(tx, zp);
4146         err = dmu_tx_assign(tx, TXG_WAIT);
4147         if (err != 0) {
4148                 dmu_tx_abort(tx);
4149                 goto out;
4150         }
4151 
4152         if (zp->z_blksz <= PAGESIZE) {
4153                 caddr_t va = zfs_map_page(pp, S_READ);
4154                 ASSERT3U(len, <=, PAGESIZE);
4155                 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4156                 zfs_unmap_page(pp, va);
4157         } else {
4158                 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4159         }
4160 
4161         if (err == 0) {
4162                 uint64_t mtime[2], ctime[2];
4163                 sa_bulk_attr_t bulk[3];
4164                 int count = 0;
4165 
4166                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4167                     &mtime, 16);
4168                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,