#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/zfeature.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred.h>
#include <sys/attr.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work. To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory. The example below illustrates the following Big Rules:
 *
 *  (1) A check must be made in each zfs thread for a mounted file system.
 *      This is done avoiding races using ZFS_ENTER(zfsvfs).
 * ...
 */
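
/*
 * A condensed, illustrative sketch of that ordering for a typical ZPL
 * vnode op.  Angle-bracketed steps are placeholders, not real functions;
 * this is not the complete rule list, which is elided from this excerpt.
 *
 *      ZFS_ENTER(zfsvfs);              // bail if the fs is unmounted
 * top:
 *      <grab range/dir locks>;         // lock the in-core state you need
 *      tx = dmu_tx_create(os);         // start a DMU transaction
 *      dmu_tx_hold_*(tx, ...);         // declare what you might modify
 *      error = dmu_tx_assign(tx, TXG_NOWAIT);
 *      if (error) {
 *              <drop locks>;
 *              if (error == ERESTART) {
 *                      dmu_tx_wait(tx);
 *                      dmu_tx_abort(tx);
 *                      goto top;       // retry against the next txg
 *              }
 *              dmu_tx_abort(tx);
 *              ZFS_EXIT(zfsvfs);
 *              return (error);
 *      }
 *      <do the real work>;
 *      zfs_log_*(...);                 // record the change in the ZIL
 *      dmu_tx_commit(tx);
 *      <drop locks>;
 *      zil_commit(zilog, foid);        // only for synchronous semantics
 *      ZFS_EXIT(zfsvfs);
 *      return (error);
 */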

/* ... (the remaining rules and most of zfs_holey() are elided here) ... */
                return (SET_ERROR(ENXIO));

        /*
         * We could find a hole that begins after the logical end-of-file,
         * because dmu_offset_next() only works on whole blocks. If the
         * EOF falls mid-block, then indicate that the "virtual hole"
         * at the end of the file begins at the logical EOF, rather than
         * at the end of the last block.
         */
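        /*
         * For example (illustrative numbers): with a 128K record size and a
         * 200K file, the last block ends at 256K, so the DMU may report a
         * hole starting at 256K even though the logical EOF is 200K; the
         * check below clamps the reported offset back to 200K.
         */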
        if (noff > file_sz) {
                ASSERT(hole);
                noff = file_sz;
        }

        if (noff < *off)
                return (error);
        *off = noff;
        return (error);
}

static int zfs_zero_write(vnode_t *vp, uint64_t size, cred_t *cr,
    caller_context_t *ct);

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
        offset_t off;
        int error;
        zfsvfs_t *zfsvfs;
        znode_t *zp;
        uint64_t size;

        switch (com) {
        case _FIOFFS:
                return (zfs_sync(vp->v_vfsp, 0, cred));

        /*
         * The following two ioctls are used by bfu.  Fake them out to
         * avoid bfu errors.
         */
        case _FIOGDIO:
        case _FIOSDIO:
                return (0);

        case _FIO_SEEK_DATA:
        case _FIO_SEEK_HOLE:
                if (ddi_copyin((void *)data, &off, sizeof (off), flag))
                        return (SET_ERROR(EFAULT));

                zp = VTOZ(vp);
                zfsvfs = zp->z_zfsvfs;
                ZFS_ENTER(zfsvfs);
                ZFS_VERIFY_ZP(zp);

                /* offset parameter is in/out */
                error = zfs_holey(vp, com, &off);
                ZFS_EXIT(zfsvfs);
                if (error)
                        return (error);
                if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
                        return (SET_ERROR(EFAULT));
                return (0);
        case _FIO_RESERVE_SPACE:
                if (ddi_copyin((void *)data, &size, sizeof (size), flag))
                        return (SET_ERROR(EFAULT));
                error = zfs_zero_write(vp, size, cred, ct);
                return (error);
        }
        return (SET_ERROR(ENOTTY));
}
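
/*
 * A sketch of how these ioctls are typically reached from user context
 * (illustrative only; the actual consumers are not part of this file):
 *
 *      offset_t off = 0;
 *      (void) ioctl(fd, _FIO_SEEK_HOLE, &off);         // off is in/out
 *
 *      uint64_t size = 1ULL << 30;
 *      (void) ioctl(fd, _FIO_RESERVE_SPACE, &size);    // zero-fill 1 GB
 *
 * _FIO_SEEK_DATA and _FIO_SEEK_HOLE also back the SEEK_DATA and SEEK_HOLE
 * whence values of lseek(2).
 */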

/*
 * Utility functions to map and unmap a single physical page. These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
        if (kpm_enable)
                return (hat_kpm_mapin(pp, 0));
        ASSERT(rw == S_READ || rw == S_WRITE);
        return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
            (caddr_t)-1));
}
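
/*
 * Typical usage pairs this with its unmapping counterpart, zfs_unmap_page()
 * (a sketch; the counterpart's body is not shown in this excerpt):
 *
 *      caddr_t va = zfs_map_page(pp, S_WRITE);
 *      (void) dmu_read(os, object, off, bytes, va + off_in_page,
 *          DMU_READ_PREFETCH);
 *      zfs_unmap_page(pp, va);
 */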

/* ... (intervening code, including zfs_unmap_page() and most of zfs_write(), is elided here) ... */

        zfs_range_unlock(rl);

        /*
         * If we're in replay mode, or we made no progress, return error.
         * Otherwise, it's at least a partial write, so it's successful.
         */
        if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

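        /*
         * Push the change to stable storage now if the caller asked for
         * synchronous semantics (O_SYNC/O_DSYNC) or the dataset is set to
         * sync=always; otherwise the ZIL entry waits for a later commit.
         */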
        if (ioflag & (FSYNC | FDSYNC) ||
            zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, zp->z_id);

        ZFS_EXIT(zfsvfs);
        return (0);
}

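/*
 * Zero-fill reservations are written in chunks of this size so that each
 * chunk fits comfortably in a single DMU transaction.
 */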
#define ZFS_RESERVE_CHUNK       (2 * 1024 * 1024)
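
/*
 * Reserve space for a file by writing zeroes over the range [0, size).
 * Only empty files are eligible; the work is done one ZFS_RESERVE_CHUNK
 * at a time, each chunk in its own transaction, and requires the pool's
 * SPA_FEATURE_SPACE_RESERVATION feature to be enabled.
 */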
/* ARGSUSED */
static int
zfs_zero_write(vnode_t *vp, uint64_t size, cred_t *cr, caller_context_t *ct)
{
        znode_t *zp = VTOZ(vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        int count = 0;
        sa_bulk_attr_t bulk[4];
        uint64_t mtime[2], ctime[2];
        rl_t *rl;
        int error = 0;
        dmu_tx_t *tx = NULL;
        uint64_t end_size;
        uint64_t pos = 0;

        if (zp->z_size > 0)
                return (SET_ERROR(EFBIG));
        if (size == 0)
                return (0);

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);

        if (!spa_feature_is_enabled(zfsvfs->z_os->os_spa,
            SPA_FEATURE_SPACE_RESERVATION)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(ENOTSUP));
        }

        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
            &zp->z_size, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
            &zp->z_pflags, 8);

        /*
         * If the file has been marked immutable or read-only, fail with
         * EPERM.
         */
        if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY))) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EPERM));
        }

        rl = zfs_range_lock(zp, 0, size, RL_WRITER);

        if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
            zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
                error = SET_ERROR(EDQUOT);
                goto out;
        }

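        /*
         * Zero-fill the range one ZFS_RESERVE_CHUNK at a time.  Each chunk
         * gets its own transaction so a large reservation never has to fit
         * into a single tx; the file size and timestamps are updated as the
         * chunks land.
         */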
        while (pos < size) {
                uint64_t length = size - pos;
                length = MIN(length, ZFS_RESERVE_CHUNK);
again:
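                /*
                 * Assign the transaction without blocking.  If the current
                 * open txg is full (ERESTART), wait for the next txg and
                 * retry from the label above; any other error aborts the
                 * reservation.
                 */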
                tx = dmu_tx_create(zfsvfs->z_os);
                dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
                dmu_tx_hold_write(tx, zp->z_id, pos, length);
                zfs_sa_upgrade_txholds(tx, zp);
                error = dmu_tx_assign(tx, TXG_NOWAIT);
                if (error) {
                        if (error == ERESTART) {
                                dmu_tx_wait(tx);
                                dmu_tx_abort(tx);
                                goto again;
                        }
                        dmu_tx_abort(tx);
                        goto out;
                }

                if (pos == 0) {
                        zfs_grow_blocksize(zp,
                            MIN(size, zfsvfs->z_max_blksz), tx);
                }
                dmu_write_zero(zfsvfs->z_os, zp->z_id, pos, length, tx);

                zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime,
                    ctime, B_TRUE);

                pos += length;
                while ((end_size = zp->z_size) < pos)
                        (void) atomic_cas_64(&zp->z_size, end_size, pos);

                error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

                dmu_tx_commit(tx);
                if (error)
                        goto out;
        }
out:
        zfs_range_unlock(rl);
        ZFS_EXIT(zfsvfs);

        return (error);
}

void
zfs_get_done(zgd_t *zgd, int error)
{
        znode_t *zp = zgd->zgd_private;
        objset_t *os = zp->z_zfsvfs->z_os;

        if (zgd->zgd_db)
                dmu_buf_rele(zgd->zgd_db, zgd);

        zfs_range_unlock(zgd->zgd_rl);

        /*
         * Release the vnode asynchronously as we currently have the
         * txg stopped from syncing.
         */
        VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

        if (error == 0 && zgd->zgd_bp)
                zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
