94 *
95 * (1) A check must be made in each zfs thread for a mounted file system.
96  *      This is done, while avoiding races, by using ZFS_ENTER(zfsvfs).
97 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
98 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
99 * can return EIO from the calling function.
100 *
101 * (2) VN_RELE() should always be the last thing except for zil_commit()
102 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
103 * First, if it's the last reference, the vnode/znode
104 * can be freed, so the zp may point to freed memory. Second, the last
105 * reference will call zfs_zinactive(), which may induce a lot of work --
106 * pushing cached pages (which acquires range locks) and syncing out
107 * cached atime changes. Third, zfs_zinactive() may require a new tx,
108 * which could deadlock the system if you were already holding one.
109 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
110 *
111 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
112 * as they can span dmu_tx_assign() calls.
113 *
114 * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
115 * This is critical because we don't want to block while holding locks.
116 * Note, in particular, that if a lock is sometimes acquired before
117 * the tx assigns, and sometimes after (e.g. z_lock), then failing to
118 * use a non-blocking assign can deadlock the system. The scenario:
119 *
120 * Thread A has grabbed a lock before calling dmu_tx_assign().
121 * Thread B is in an already-assigned tx, and blocks for this lock.
122 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
123 * forever, because the previous txg can't quiesce until B's tx commits.
124 *
125 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
126 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
127 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
128 * to indicate that this operation has already called dmu_tx_wait().
129 * This will ensure that we don't retry forever, waiting a short bit
130 * each time.
131 *
132 * (5) If the operation succeeded, generate the intent log entry for it
133 * before dropping locks. This ensures that the ordering of events
134 * in the intent log matches the order in which they actually occurred.
135 * During ZIL replay the zfs_log_* functions will update the sequence
136 * number to indicate the zil transaction has replayed.
137 *
138 * (6) At the end of each vnode op, the DMU tx must always commit,
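
Taken together, rules (1) through (6) describe the shape every ZPL vnode op is expected to follow. The following is only a rough sketch of that ordering under the TXG_NOWAIT discipline used by the hunks below; it is not lifted from the file, zfs_foo() and zfs_log_foo() are placeholder names, and the real locking is reduced to comments:

	static int
	zfs_foo(vnode_t *vp)
	{
		znode_t *zp = VTOZ(vp);
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;
		dmu_tx_t *tx;
		int error;

		ZFS_ENTER(zfsvfs);			/* (1) returns EIO if unmounted */
		ZFS_VERIFY_ZP(zp);			/* (1) returns EIO if the znode is no longer valid */
	top:
		/* (3) take range locks and any other ZPL locks before assigning */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_NOWAIT);	/* (4) never block while holding locks */
		if (error) {
			/* drop the locks taken above before waiting */
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;	/* per (4), a retry may pass TXG_WAITED */
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		/* ... do the real work under the tx; may set error ... */
		if (error == 0)
			zfs_log_foo(zfsvfs->z_log, tx, zp);	/* (5) log before dropping locks */
		dmu_tx_commit(tx);			/* (6) the tx must always commit */
		/* drop locks; per (2), VN_RELE() goes last, before zil_commit()/ZFS_EXIT() */
		ZFS_EXIT(zfsvfs);
		return (error);
	}
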
711 ZFS_EXIT(zfsvfs);
712 return (SET_ERROR(EFBIG));
713 }
714
715 if ((woff + n) > limit || woff > (limit - n))
716 n = limit - woff;
717
718 /* Will this write extend the file length? */
719 write_eof = (woff + n > zp->z_size);
720
721 end_size = MAX(zp->z_size, woff + n);
722
723 /*
724 * Write the file in reasonable size chunks. Each chunk is written
725 * in a separate transaction; this keeps the intent log records small
726 * and allows us to do more fine-grained space accounting.
727 */
728 while (n > 0) {
729 abuf = NULL;
730 woff = uio->uio_loffset;
731 again:
732 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
733 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
734 if (abuf != NULL)
735 dmu_return_arcbuf(abuf);
736 error = SET_ERROR(EDQUOT);
737 break;
738 }
739
740 if (xuio && abuf == NULL) {
741 ASSERT(i_iov < iovcnt);
742 aiov = &iovp[i_iov];
743 abuf = dmu_xuio_arcbuf(xuio, i_iov);
744 dmu_xuio_clear(xuio, i_iov);
745 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
746 iovec_t *, aiov, arc_buf_t *, abuf);
747 ASSERT((aiov->iov_base == abuf->b_data) ||
748 ((char *)aiov->iov_base - (char *)abuf->b_data +
749 aiov->iov_len == arc_buf_size(abuf)));
750 i_iov++;
751 } else if (abuf == NULL && n >= max_blksz &&
763
764 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
765 max_blksz);
766 ASSERT(abuf != NULL);
767 ASSERT(arc_buf_size(abuf) == max_blksz);
768 if (error = uiocopy(abuf->b_data, max_blksz,
769 UIO_WRITE, uio, &cbytes)) {
770 dmu_return_arcbuf(abuf);
771 break;
772 }
773 ASSERT(cbytes == max_blksz);
774 }
775
776 /*
777 * Start a transaction.
778 */
779 tx = dmu_tx_create(zfsvfs->z_os);
780 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
781 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
782 zfs_sa_upgrade_txholds(tx, zp);
783 error = dmu_tx_assign(tx, TXG_NOWAIT);
784 if (error) {
785 if (error == ERESTART) {
786 dmu_tx_wait(tx);
787 dmu_tx_abort(tx);
788 goto again;
789 }
790 dmu_tx_abort(tx);
791 if (abuf != NULL)
792 dmu_return_arcbuf(abuf);
793 break;
794 }
795
796 /*
797 * If zfs_range_lock() over-locked we grow the blocksize
798 * and then reduce the lock range. This will only happen
799 * on the first iteration since zfs_range_reduce() will
800 * shrink down r_len to the appropriate size.
801 */
802 if (rl->r_len == UINT64_MAX) {
803 uint64_t new_blksz;
804
805 if (zp->z_blksz > max_blksz) {
806 ASSERT(!ISP2(zp->z_blksz));
807 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
808 } else {
809 new_blksz = MIN(end_size, max_blksz);
810 }
3026 mutex_exit(&zp->z_lock);
3027 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3028 } else {
3029 if ((mask & AT_XVATTR) &&
3030 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3031 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3032 else
3033 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3034 }
3035
3036 if (attrzp) {
3037 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3038 }
3039
3040 fuid_dirtied = zfsvfs->z_fuid_dirty;
3041 if (fuid_dirtied)
3042 zfs_fuid_txhold(zfsvfs, tx);
3043
3044 zfs_sa_upgrade_txholds(tx, zp);
3045
3046 err = dmu_tx_assign(tx, TXG_NOWAIT);
3047 if (err) {
3048 if (err == ERESTART)
3049 dmu_tx_wait(tx);
3050 goto out;
3051 }
3052
3053 count = 0;
3054 /*
3055 * Set each attribute requested.
3056 * We group settings according to the locks they need to acquire.
3057 *
3058 * Note: you cannot set ctime directly, although it will be
3059 * updated as a side-effect of calling this function.
3060 */
3061
3062
3063 if (mask & (AT_UID|AT_GID|AT_MODE))
3064 mutex_enter(&zp->z_acl_lock);
3065 mutex_enter(&zp->z_lock);
3066
3067 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3068 &zp->z_pflags, sizeof (zp->z_pflags));
3069
3070 if (attrzp) {
3071 if (mask & (AT_UID|AT_GID|AT_MODE))
4123 if (off >= zp->z_size) {
4124 /* ignore all pages */
4125 err = 0;
4126 goto out;
4127 } else if (off + len > zp->z_size) {
4128 int npages = btopr(zp->z_size - off);
4129 page_t *trunc;
4130
4131 page_list_break(&pp, &trunc, npages);
4132 /* ignore pages past end of file */
4133 if (trunc)
4134 pvn_write_done(trunc, flags);
4135 len = zp->z_size - off;
4136 }
4137
4138 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4139 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4140 err = SET_ERROR(EDQUOT);
4141 goto out;
4142 }
4143 top:
4144 tx = dmu_tx_create(zfsvfs->z_os);
4145 dmu_tx_hold_write(tx, zp->z_id, off, len);
4146
4147 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4148 zfs_sa_upgrade_txholds(tx, zp);
4149 err = dmu_tx_assign(tx, TXG_NOWAIT);
4150 if (err != 0) {
4151 if (err == ERESTART) {
4152 dmu_tx_wait(tx);
4153 dmu_tx_abort(tx);
4154 goto top;
4155 }
4156 dmu_tx_abort(tx);
4157 goto out;
4158 }
4159
4160 if (zp->z_blksz <= PAGESIZE) {
4161 caddr_t va = zfs_map_page(pp, S_READ);
4162 ASSERT3U(len, <=, PAGESIZE);
4163 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4164 zfs_unmap_page(pp, va);
4165 } else {
4166 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4167 }
4168
4169 if (err == 0) {
4170 uint64_t mtime[2], ctime[2];
4171 sa_bulk_attr_t bulk[3];
4172 int count = 0;
4173
4174 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4175 &mtime, 16);
4176 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
94 *
95 * (1) A check must be made in each zfs thread for a mounted file system.
96  *      This is done, while avoiding races, by using ZFS_ENTER(zfsvfs).
97 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
98 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
99 * can return EIO from the calling function.
100 *
101 * (2) VN_RELE() should always be the last thing except for zil_commit()
102 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
103 * First, if it's the last reference, the vnode/znode
104 * can be freed, so the zp may point to freed memory. Second, the last
105 * reference will call zfs_zinactive(), which may induce a lot of work --
106 * pushing cached pages (which acquires range locks) and syncing out
107 * cached atime changes. Third, zfs_zinactive() may require a new tx,
108 * which could deadlock the system if you were already holding one.
109 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
110 *
111 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
112 * as they can span dmu_tx_assign() calls.
113 *
114 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
115 * dmu_tx_assign(). This is critical because we don't want to block
116 * while holding locks.
117 *
118 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
119 * reduces lock contention and CPU usage when we must wait (note that if
120 * throughput is constrained by the storage, nearly every transaction
121 * must wait).
122 *
123 * Note, in particular, that if a lock is sometimes acquired before
124 * the tx assigns, and sometimes after (e.g. z_lock), then failing
125 * to use a non-blocking assign can deadlock the system. The scenario:
126 *
127 * Thread A has grabbed a lock before calling dmu_tx_assign().
128 * Thread B is in an already-assigned tx, and blocks for this lock.
129 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
130 * forever, because the previous txg can't quiesce until B's tx commits.
131 *
132 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
133 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
134 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
135 * to indicate that this operation has already called dmu_tx_wait().
136 * This will ensure that we don't retry forever, waiting a short bit
137 * each time.
138 *
139 * (5) If the operation succeeded, generate the intent log entry for it
140 * before dropping locks. This ensures that the ordering of events
141 * in the intent log matches the order in which they actually occurred.
142 * During ZIL replay the zfs_log_* functions will update the sequence
143 * number to indicate the zil transaction has replayed.
144 *
145 * (6) At the end of each vnode op, the DMU tx must always commit,
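
Under the revised rule (4), call sites that can tolerate sleeping in dmu_tx_assign() switch to TXG_WAIT, which waits for an open txg internally and only returns a hard error such as ENOSPC, so the drop-and-retry loop disappears. A minimal sketch of that path, mirroring the TXG_WAIT assigns in the updated hunks below (the holds shown are illustrative, borrowed from the write path, not a fixed recipe):

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);	/* may sleep waiting for an open txg */
	if (error) {
		dmu_tx_abort(tx);	/* ENOSPC, EDQUOT, etc.; no ERESTART to retry */
		break;			/* as in the chunked-write loop; elsewhere, ZFS_EXIT() and return */
	}
	/* ... write under the assigned tx ... */
	dmu_tx_commit(tx);
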
718 ZFS_EXIT(zfsvfs);
719 return (SET_ERROR(EFBIG));
720 }
721
722 if ((woff + n) > limit || woff > (limit - n))
723 n = limit - woff;
724
725 /* Will this write extend the file length? */
726 write_eof = (woff + n > zp->z_size);
727
728 end_size = MAX(zp->z_size, woff + n);
729
730 /*
731 * Write the file in reasonable size chunks. Each chunk is written
732 * in a separate transaction; this keeps the intent log records small
733 * and allows us to do more fine-grained space accounting.
734 */
735 while (n > 0) {
736 abuf = NULL;
737 woff = uio->uio_loffset;
738 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
739 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
740 if (abuf != NULL)
741 dmu_return_arcbuf(abuf);
742 error = SET_ERROR(EDQUOT);
743 break;
744 }
745
746 if (xuio && abuf == NULL) {
747 ASSERT(i_iov < iovcnt);
748 aiov = &iovp[i_iov];
749 abuf = dmu_xuio_arcbuf(xuio, i_iov);
750 dmu_xuio_clear(xuio, i_iov);
751 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
752 iovec_t *, aiov, arc_buf_t *, abuf);
753 ASSERT((aiov->iov_base == abuf->b_data) ||
754 ((char *)aiov->iov_base - (char *)abuf->b_data +
755 aiov->iov_len == arc_buf_size(abuf)));
756 i_iov++;
757 } else if (abuf == NULL && n >= max_blksz &&
769
770 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
771 max_blksz);
772 ASSERT(abuf != NULL);
773 ASSERT(arc_buf_size(abuf) == max_blksz);
774 if (error = uiocopy(abuf->b_data, max_blksz,
775 UIO_WRITE, uio, &cbytes)) {
776 dmu_return_arcbuf(abuf);
777 break;
778 }
779 ASSERT(cbytes == max_blksz);
780 }
781
782 /*
783 * Start a transaction.
784 */
785 tx = dmu_tx_create(zfsvfs->z_os);
786 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
787 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
788 zfs_sa_upgrade_txholds(tx, zp);
789 error = dmu_tx_assign(tx, TXG_WAIT);
790 if (error) {
791 dmu_tx_abort(tx);
792 if (abuf != NULL)
793 dmu_return_arcbuf(abuf);
794 break;
795 }
796
797 /*
798 * If zfs_range_lock() over-locked we grow the blocksize
799 * and then reduce the lock range. This will only happen
800 * on the first iteration since zfs_range_reduce() will
801 * shrink down r_len to the appropriate size.
802 */
803 if (rl->r_len == UINT64_MAX) {
804 uint64_t new_blksz;
805
806 if (zp->z_blksz > max_blksz) {
807 ASSERT(!ISP2(zp->z_blksz));
808 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
809 } else {
810 new_blksz = MIN(end_size, max_blksz);
811 }
3027 mutex_exit(&zp->z_lock);
3028 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3029 } else {
3030 if ((mask & AT_XVATTR) &&
3031 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3032 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3033 else
3034 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3035 }
3036
3037 if (attrzp) {
3038 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3039 }
3040
3041 fuid_dirtied = zfsvfs->z_fuid_dirty;
3042 if (fuid_dirtied)
3043 zfs_fuid_txhold(zfsvfs, tx);
3044
3045 zfs_sa_upgrade_txholds(tx, zp);
3046
3047 err = dmu_tx_assign(tx, TXG_WAIT);
3048 if (err)
3049 goto out;
3050
3051 count = 0;
3052 /*
3053 * Set each attribute requested.
3054 * We group settings according to the locks they need to acquire.
3055 *
3056 * Note: you cannot set ctime directly, although it will be
3057 * updated as a side-effect of calling this function.
3058 */
3059
3060
3061 if (mask & (AT_UID|AT_GID|AT_MODE))
3062 mutex_enter(&zp->z_acl_lock);
3063 mutex_enter(&zp->z_lock);
3064
3065 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3066 &zp->z_pflags, sizeof (zp->z_pflags));
3067
3068 if (attrzp) {
3069 if (mask & (AT_UID|AT_GID|AT_MODE))
4121 if (off >= zp->z_size) {
4122 /* ignore all pages */
4123 err = 0;
4124 goto out;
4125 } else if (off + len > zp->z_size) {
4126 int npages = btopr(zp->z_size - off);
4127 page_t *trunc;
4128
4129 page_list_break(&pp, &trunc, npages);
4130 /* ignore pages past end of file */
4131 if (trunc)
4132 pvn_write_done(trunc, flags);
4133 len = zp->z_size - off;
4134 }
4135
4136 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4137 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4138 err = SET_ERROR(EDQUOT);
4139 goto out;
4140 }
4141 tx = dmu_tx_create(zfsvfs->z_os);
4142 dmu_tx_hold_write(tx, zp->z_id, off, len);
4143
4144 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4145 zfs_sa_upgrade_txholds(tx, zp);
4146 err = dmu_tx_assign(tx, TXG_WAIT);
4147 if (err != 0) {
4148 dmu_tx_abort(tx);
4149 goto out;
4150 }
4151
4152 if (zp->z_blksz <= PAGESIZE) {
4153 caddr_t va = zfs_map_page(pp, S_READ);
4154 ASSERT3U(len, <=, PAGESIZE);
4155 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4156 zfs_unmap_page(pp, va);
4157 } else {
4158 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4159 }
4160
4161 if (err == 0) {
4162 uint64_t mtime[2], ctime[2];
4163 sa_bulk_attr_t bulk[3];
4164 int count = 0;
4165
4166 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4167 &mtime, 16);
4168 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,