3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
--- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
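The hunks below are part of a tree-wide cleanup: checks of the form ASSERT3U(x, ==, 0) and VERIFY3U(x, ==, 0) are replaced with dedicated ASSERT0()/VERIFY0() macros. A minimal sketch of the idea, layered on the existing three-way comparison macros (the real definitions in sys/debug.h share a common implementation macro and differ in detail):

    /* Hedged sketch -- not the verbatim sys/debug.h definitions. */
    #define VERIFY0(x)  VERIFY3U((x), ==, 0)
    #define ASSERT0(x)  ASSERT3U((x), ==, 0)

With these in place, a call such as ASSERT3U(error, ==, 0) in zfs_remove() becomes simply ASSERT0(error), as the changed lines in this file show.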
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright (c) 2012 by Delphix. All rights reserved.
23 24 */
24 25
26 +
27 +
28 +
25 29 /* Portions Copyright 2007 Jeremy Teo */
26 30 /* Portions Copyright 2010 Robert Milkowski */
27 31
28 32 #include <sys/types.h>
29 33 #include <sys/param.h>
30 34 #include <sys/time.h>
31 35 #include <sys/systm.h>
32 36 #include <sys/sysmacros.h>
33 37 #include <sys/resource.h>
34 38 #include <sys/vfs.h>
35 39 #include <sys/vfs_opreg.h>
36 40 #include <sys/vnode.h>
37 41 #include <sys/file.h>
38 42 #include <sys/stat.h>
39 43 #include <sys/kmem.h>
40 44 #include <sys/taskq.h>
41 45 #include <sys/uio.h>
42 46 #include <sys/vmsystm.h>
43 47 #include <sys/atomic.h>
44 48 #include <sys/vm.h>
45 49 #include <vm/seg_vn.h>
46 50 #include <vm/pvn.h>
47 51 #include <vm/as.h>
48 52 #include <vm/kpm.h>
49 53 #include <vm/seg_kpm.h>
50 54 #include <sys/mman.h>
51 55 #include <sys/pathname.h>
52 56 #include <sys/cmn_err.h>
53 57 #include <sys/errno.h>
54 58 #include <sys/unistd.h>
55 59 #include <sys/zfs_dir.h>
56 60 #include <sys/zfs_acl.h>
57 61 #include <sys/zfs_ioctl.h>
58 62 #include <sys/fs/zfs.h>
59 63 #include <sys/dmu.h>
60 64 #include <sys/dmu_objset.h>
61 65 #include <sys/spa.h>
62 66 #include <sys/txg.h>
63 67 #include <sys/dbuf.h>
64 68 #include <sys/zap.h>
65 69 #include <sys/sa.h>
66 70 #include <sys/dirent.h>
67 71 #include <sys/policy.h>
68 72 #include <sys/sunddi.h>
69 73 #include <sys/filio.h>
70 74 #include <sys/sid.h>
71 75 #include "fs/fs_subr.h"
72 76 #include <sys/zfs_ctldir.h>
73 77 #include <sys/zfs_fuid.h>
74 78 #include <sys/zfs_sa.h>
75 79 #include <sys/dnlc.h>
76 80 #include <sys/zfs_rlock.h>
77 81 #include <sys/extdirent.h>
78 82 #include <sys/kidmap.h>
79 83 #include <sys/cred.h>
80 84 #include <sys/attr.h>
81 85
82 86 /*
83 87 * Programming rules.
84 88 *
85 89 * Each vnode op performs some logical unit of work. To do this, the ZPL must
86 90 * properly lock its in-core state, create a DMU transaction, do the work,
87 91 * record this work in the intent log (ZIL), commit the DMU transaction,
88 92 * and wait for the intent log to commit if it is a synchronous operation.
89 93 * Moreover, the vnode ops must work in both normal and log replay context.
90 94 * The ordering of events is important to avoid deadlocks and references
91 95 * to freed memory. The example below illustrates the following Big Rules:
92 96 *
93 97 * (1) A check must be made in each zfs thread for a mounted file system.
  94   98  *      This is done, avoiding races, using ZFS_ENTER(zfsvfs).
95 99 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
96 100 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
97 101 * can return EIO from the calling function.
98 102 *
99 103 * (2) VN_RELE() should always be the last thing except for zil_commit()
100 104 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
101 105 * First, if it's the last reference, the vnode/znode
102 106 * can be freed, so the zp may point to freed memory. Second, the last
103 107 * reference will call zfs_zinactive(), which may induce a lot of work --
104 108 * pushing cached pages (which acquires range locks) and syncing out
105 109 * cached atime changes. Third, zfs_zinactive() may require a new tx,
106 110 * which could deadlock the system if you were already holding one.
107 111 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
108 112 *
109 113 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
110 114 * as they can span dmu_tx_assign() calls.
111 115 *
112 116 * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
113 117 * This is critical because we don't want to block while holding locks.
114 118 * Note, in particular, that if a lock is sometimes acquired before
115 119 * the tx assigns, and sometimes after (e.g. z_lock), then failing to
116 120 * use a non-blocking assign can deadlock the system. The scenario:
117 121 *
118 122 * Thread A has grabbed a lock before calling dmu_tx_assign().
119 123 * Thread B is in an already-assigned tx, and blocks for this lock.
120 124 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
121 125 * forever, because the previous txg can't quiesce until B's tx commits.
122 126 *
123 127 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
124 128 * then drop all locks, call dmu_tx_wait(), and try again.
125 129 *
126 130 * (5) If the operation succeeded, generate the intent log entry for it
127 131 * before dropping locks. This ensures that the ordering of events
128 132 * in the intent log matches the order in which they actually occurred.
129 133 * During ZIL replay the zfs_log_* functions will update the sequence
130 134 * number to indicate the zil transaction has replayed.
131 135 *
132 136 * (6) At the end of each vnode op, the DMU tx must always commit,
133 137 * regardless of whether there were any errors.
134 138 *
135 139 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
136 140 * to ensure that synchronous semantics are provided when necessary.
137 141 *
138 142 * In general, this is how things should be ordered in each vnode op:
139 143 *
140 144 * ZFS_ENTER(zfsvfs); // exit if unmounted
141 145 * top:
142 146 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
143 147 * rw_enter(...); // grab any other locks you need
144 148 * tx = dmu_tx_create(...); // get DMU tx
145 149 * dmu_tx_hold_*(); // hold each object you might modify
146 150 * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign
147 151 * if (error) {
148 152 * rw_exit(...); // drop locks
149 153 * zfs_dirent_unlock(dl); // unlock directory entry
150 154 * VN_RELE(...); // release held vnodes
151 155 * if (error == ERESTART) {
152 156 * dmu_tx_wait(tx);
153 157 * dmu_tx_abort(tx);
154 158 * goto top;
155 159 * }
156 160 * dmu_tx_abort(tx); // abort DMU tx
157 161 * ZFS_EXIT(zfsvfs); // finished in zfs
158 162 * return (error); // really out of space
159 163 * }
160 164 * error = do_real_work(); // do whatever this VOP does
161 165 * if (error == 0)
162 166 * zfs_log_*(...); // on success, make ZIL entry
163 167 * dmu_tx_commit(tx); // commit DMU tx -- error or not
164 168 * rw_exit(...); // drop locks
165 169 * zfs_dirent_unlock(dl); // unlock directory entry
166 170 * VN_RELE(...); // release held vnodes
167 171 * zil_commit(zilog, foid); // synchronous when necessary
168 172 * ZFS_EXIT(zfsvfs); // finished in zfs
169 173 * return (error); // done, report error
170 174 */
171 175
172 176 /* ARGSUSED */
173 177 static int
174 178 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
175 179 {
176 180 znode_t *zp = VTOZ(*vpp);
177 181 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
178 182
179 183 ZFS_ENTER(zfsvfs);
180 184 ZFS_VERIFY_ZP(zp);
181 185
182 186 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
183 187 ((flag & FAPPEND) == 0)) {
184 188 ZFS_EXIT(zfsvfs);
185 189 return (EPERM);
186 190 }
187 191
188 192 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
189 193 ZTOV(zp)->v_type == VREG &&
190 194 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
191 195 if (fs_vscan(*vpp, cr, 0) != 0) {
192 196 ZFS_EXIT(zfsvfs);
193 197 return (EACCES);
194 198 }
195 199 }
196 200
197 201 /* Keep a count of the synchronous opens in the znode */
198 202 if (flag & (FSYNC | FDSYNC))
199 203 atomic_inc_32(&zp->z_sync_cnt);
200 204
201 205 ZFS_EXIT(zfsvfs);
202 206 return (0);
203 207 }
204 208
205 209 /* ARGSUSED */
206 210 static int
207 211 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
208 212 caller_context_t *ct)
209 213 {
210 214 znode_t *zp = VTOZ(vp);
211 215 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
212 216
213 217 /*
214 218 * Clean up any locks held by this process on the vp.
215 219 */
216 220 cleanlocks(vp, ddi_get_pid(), 0);
217 221 cleanshares(vp, ddi_get_pid());
218 222
219 223 ZFS_ENTER(zfsvfs);
220 224 ZFS_VERIFY_ZP(zp);
221 225
222 226 /* Decrement the synchronous opens in the znode */
223 227 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
224 228 atomic_dec_32(&zp->z_sync_cnt);
225 229
226 230 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
227 231 ZTOV(zp)->v_type == VREG &&
228 232 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
229 233 VERIFY(fs_vscan(vp, cr, 1) == 0);
230 234
231 235 ZFS_EXIT(zfsvfs);
232 236 return (0);
233 237 }
234 238
235 239 /*
236 240 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
237 241 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
238 242 */
239 243 static int
240 244 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
241 245 {
242 246 znode_t *zp = VTOZ(vp);
243 247 uint64_t noff = (uint64_t)*off; /* new offset */
244 248 uint64_t file_sz;
245 249 int error;
246 250 boolean_t hole;
247 251
248 252 file_sz = zp->z_size;
249 253 if (noff >= file_sz) {
250 254 return (ENXIO);
251 255 }
252 256
253 257 if (cmd == _FIO_SEEK_HOLE)
254 258 hole = B_TRUE;
255 259 else
256 260 hole = B_FALSE;
257 261
258 262 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
259 263
260 264 /* end of file? */
261 265 if ((error == ESRCH) || (noff > file_sz)) {
262 266 /*
263 267 * Handle the virtual hole at the end of file.
264 268 */
265 269 if (hole) {
266 270 *off = file_sz;
267 271 return (0);
268 272 }
269 273 return (ENXIO);
270 274 }
271 275
272 276 if (noff < *off)
273 277 return (error);
274 278 *off = noff;
275 279 return (error);
276 280 }
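zfs_holey() ultimately services the _FIO_SEEK_HOLE and _FIO_SEEK_DATA ioctls dispatched from zfs_ioctl() below, and the same semantics are exposed through lseek(2) with SEEK_HOLE/SEEK_DATA. A minimal user-level sketch (a hypothetical helper for illustration, assuming fd refers to a file on ZFS and that SEEK_HOLE/SEEK_DATA are visible from <unistd.h>, as on illumos) that enumerates the data regions of a sparse file:

    #include <sys/types.h>
    #include <unistd.h>
    #include <stdio.h>

    /* Print each data region of a sparse file as [start, end). */
    static void
    walk_data_regions(int fd)
    {
            off_t data = 0, hole;

            for (;;) {
                    /* Find the next region containing data. */
                    if ((data = lseek(fd, data, SEEK_DATA)) < 0)
                            break;  /* ENXIO: no data past this offset */
                    /* Find where that data region ends. */
                    if ((hole = lseek(fd, data, SEEK_HOLE)) < 0)
                            break;
                    (void) printf("data: [%lld, %lld)\n",
                        (long long)data, (long long)hole);
                    data = hole;
            }
    }

Note that zfs_holey() reports a virtual hole at end-of-file, so a SEEK_HOLE issued from inside the last data region always succeeds; the loop terminates when SEEK_DATA runs past EOF and fails with ENXIO.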
277 281
278 282 /* ARGSUSED */
279 283 static int
280 284 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
281 285 int *rvalp, caller_context_t *ct)
282 286 {
283 287 offset_t off;
284 288 int error;
285 289 zfsvfs_t *zfsvfs;
286 290 znode_t *zp;
287 291
288 292 switch (com) {
289 293 case _FIOFFS:
290 294 return (zfs_sync(vp->v_vfsp, 0, cred));
291 295
292 296 /*
 293  297          * The following two ioctls are used by bfu.  Faking them
 294  298          * out is necessary to avoid bfu errors.
295 299 */
296 300 case _FIOGDIO:
297 301 case _FIOSDIO:
298 302 return (0);
299 303
300 304 case _FIO_SEEK_DATA:
301 305 case _FIO_SEEK_HOLE:
302 306 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
303 307 return (EFAULT);
304 308
305 309 zp = VTOZ(vp);
306 310 zfsvfs = zp->z_zfsvfs;
307 311 ZFS_ENTER(zfsvfs);
308 312 ZFS_VERIFY_ZP(zp);
309 313
310 314 /* offset parameter is in/out */
311 315 error = zfs_holey(vp, com, &off);
312 316 ZFS_EXIT(zfsvfs);
313 317 if (error)
314 318 return (error);
315 319 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
316 320 return (EFAULT);
317 321 return (0);
318 322 }
319 323 return (ENOTTY);
320 324 }
321 325
322 326 /*
323 327 * Utility functions to map and unmap a single physical page. These
324 328 * are used to manage the mappable copies of ZFS file data, and therefore
325 329 * do not update ref/mod bits.
326 330 */
327 331 caddr_t
328 332 zfs_map_page(page_t *pp, enum seg_rw rw)
329 333 {
330 334 if (kpm_enable)
331 335 return (hat_kpm_mapin(pp, 0));
332 336 ASSERT(rw == S_READ || rw == S_WRITE);
333 337 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
334 338 (caddr_t)-1));
335 339 }
336 340
337 341 void
338 342 zfs_unmap_page(page_t *pp, caddr_t addr)
339 343 {
340 344 if (kpm_enable) {
341 345 hat_kpm_mapout(pp, 0, addr);
342 346 } else {
343 347 ppmapout(addr);
344 348 }
345 349 }
346 350
347 351 /*
348 352 * When a file is memory mapped, we must keep the IO data synchronized
349 353 * between the DMU cache and the memory mapped pages. What this means:
350 354 *
351 355 * On Write: If we find a memory mapped page, we write to *both*
352 356 * the page and the dmu buffer.
353 357 */
354 358 static void
355 359 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
356 360 {
357 361 int64_t off;
358 362
359 363 off = start & PAGEOFFSET;
360 364 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
361 365 page_t *pp;
362 366 uint64_t nbytes = MIN(PAGESIZE - off, len);
363 367
364 368 if (pp = page_lookup(vp, start, SE_SHARED)) {
365 369 caddr_t va;
366 370
367 371 va = zfs_map_page(pp, S_WRITE);
368 372 (void) dmu_read(os, oid, start+off, nbytes, va+off,
369 373 DMU_READ_PREFETCH);
370 374 zfs_unmap_page(pp, va);
371 375 page_unlock(pp);
372 376 }
373 377 len -= nbytes;
374 378 off = 0;
375 379 }
376 380 }
377 381
378 382 /*
379 383 * When a file is memory mapped, we must keep the IO data synchronized
380 384 * between the DMU cache and the memory mapped pages. What this means:
381 385 *
382 386 * On Read: We "read" preferentially from memory mapped pages,
 383  387  *      else we fall back to the dmu buffer.
384 388 *
385 389 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
386 390 * the file is memory mapped.
387 391 */
388 392 static int
389 393 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
390 394 {
391 395 znode_t *zp = VTOZ(vp);
392 396 objset_t *os = zp->z_zfsvfs->z_os;
393 397 int64_t start, off;
394 398 int len = nbytes;
395 399 int error = 0;
396 400
397 401 start = uio->uio_loffset;
398 402 off = start & PAGEOFFSET;
399 403 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
400 404 page_t *pp;
401 405 uint64_t bytes = MIN(PAGESIZE - off, len);
402 406
403 407 if (pp = page_lookup(vp, start, SE_SHARED)) {
404 408 caddr_t va;
405 409
406 410 va = zfs_map_page(pp, S_READ);
407 411 error = uiomove(va + off, bytes, UIO_READ, uio);
408 412 zfs_unmap_page(pp, va);
409 413 page_unlock(pp);
410 414 } else {
411 415 error = dmu_read_uio(os, zp->z_id, uio, bytes);
412 416 }
413 417 len -= bytes;
414 418 off = 0;
415 419 if (error)
416 420 break;
417 421 }
418 422 return (error);
419 423 }
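Both update_pages() above and mappedread() rely on the same page-chunking arithmetic: the first piece of an unaligned request covers PAGESIZE - off bytes, and every later piece starts page-aligned. A standalone sketch of just that arithmetic (assuming a 4K page; show_chunks() and the MY_* macros are hypothetical, introduced only for illustration):

    #include <stdio.h>

    #define MY_PAGESIZE    4096LL
    #define MY_PAGEOFFSET  (MY_PAGESIZE - 1)
    #define MY_PAGEMASK    (~MY_PAGEOFFSET)

    /* Split [start, start + len) into page-bounded pieces. */
    static void
    show_chunks(long long start, long long len)
    {
            long long off = start & MY_PAGEOFFSET;

            for (start &= MY_PAGEMASK; len > 0; start += MY_PAGESIZE) {
                    long long bytes = MY_PAGESIZE - off < len ?
                        MY_PAGESIZE - off : len;
                    (void) printf("page %lld: %lld bytes at offset %lld\n",
                        start, bytes, off);
                    len -= bytes;
                    off = 0;        /* later pieces are page-aligned */
            }
    }

For example, show_chunks(6000, 5000) yields 2192 bytes at offset 1904 within page 4096, then 2808 bytes at offset 0 within page 8192, mirroring how mappedread() breaks a read into at most PAGESIZE-sized uiomoves.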
420 424
421 425 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
422 426
423 427 /*
424 428 * Read bytes from specified file into supplied buffer.
425 429 *
426 430 * IN: vp - vnode of file to be read from.
427 431 * uio - structure supplying read location, range info,
428 432 * and return buffer.
429 433 * ioflag - SYNC flags; used to provide FRSYNC semantics.
430 434 * cr - credentials of caller.
431 435 * ct - caller context
432 436 *
433 437 * OUT: uio - updated offset and range, buffer filled.
434 438 *
435 439 * RETURN: 0 if success
436 440 * error code if failure
437 441 *
438 442 * Side Effects:
439 443 * vp - atime updated if byte count > 0
440 444 */
441 445 /* ARGSUSED */
442 446 static int
443 447 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
444 448 {
445 449 znode_t *zp = VTOZ(vp);
446 450 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
447 451 objset_t *os;
448 452 ssize_t n, nbytes;
449 453 int error;
450 454 rl_t *rl;
451 455 xuio_t *xuio = NULL;
452 456
453 457 ZFS_ENTER(zfsvfs);
454 458 ZFS_VERIFY_ZP(zp);
455 459 os = zfsvfs->z_os;
456 460
457 461 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
458 462 ZFS_EXIT(zfsvfs);
459 463 return (EACCES);
460 464 }
461 465
462 466 /*
463 467 * Validate file offset
464 468 */
465 469 if (uio->uio_loffset < (offset_t)0) {
466 470 ZFS_EXIT(zfsvfs);
467 471 return (EINVAL);
468 472 }
469 473
470 474 /*
471 475 * Fasttrack empty reads
472 476 */
473 477 if (uio->uio_resid == 0) {
474 478 ZFS_EXIT(zfsvfs);
475 479 return (0);
476 480 }
477 481
478 482 /*
479 483 * Check for mandatory locks
480 484 */
481 485 if (MANDMODE(zp->z_mode)) {
482 486 if (error = chklock(vp, FREAD,
483 487 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
484 488 ZFS_EXIT(zfsvfs);
485 489 return (error);
486 490 }
487 491 }
488 492
489 493 /*
490 494 * If we're in FRSYNC mode, sync out this znode before reading it.
491 495 */
492 496 if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
493 497 zil_commit(zfsvfs->z_log, zp->z_id);
494 498
495 499 /*
496 500 * Lock the range against changes.
497 501 */
498 502 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
499 503
500 504 /*
501 505 * If we are reading past end-of-file we can skip
502 506 * to the end; but we might still need to set atime.
503 507 */
504 508 if (uio->uio_loffset >= zp->z_size) {
505 509 error = 0;
506 510 goto out;
507 511 }
508 512
509 513 ASSERT(uio->uio_loffset < zp->z_size);
510 514 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
511 515
512 516 if ((uio->uio_extflg == UIO_XUIO) &&
513 517 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
514 518 int nblk;
515 519 int blksz = zp->z_blksz;
516 520 uint64_t offset = uio->uio_loffset;
517 521
518 522 xuio = (xuio_t *)uio;
519 523 if ((ISP2(blksz))) {
520 524 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
521 525 blksz)) / blksz;
522 526 } else {
523 527 ASSERT(offset + n <= blksz);
524 528 nblk = 1;
525 529 }
526 530 (void) dmu_xuio_init(xuio, nblk);
527 531
528 532 if (vn_has_cached_data(vp)) {
529 533 /*
530 534 * For simplicity, we always allocate a full buffer
531 535 * even if we only expect to read a portion of a block.
532 536 */
533 537 while (--nblk >= 0) {
534 538 (void) dmu_xuio_add(xuio,
535 539 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
536 540 blksz), 0, blksz);
537 541 }
538 542 }
539 543 }
540 544
541 545 while (n > 0) {
542 546 nbytes = MIN(n, zfs_read_chunk_size -
543 547 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
544 548
545 549 if (vn_has_cached_data(vp))
546 550 error = mappedread(vp, nbytes, uio);
547 551 else
548 552 error = dmu_read_uio(os, zp->z_id, uio, nbytes);
549 553 if (error) {
550 554 /* convert checksum errors into IO errors */
551 555 if (error == ECKSUM)
552 556 error = EIO;
553 557 break;
554 558 }
555 559
556 560 n -= nbytes;
557 561 }
558 562 out:
559 563 zfs_range_unlock(rl);
560 564
561 565 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
562 566 ZFS_EXIT(zfsvfs);
563 567 return (error);
564 568 }
565 569
566 570 /*
567 571 * Write the bytes to a file.
568 572 *
569 573 * IN: vp - vnode of file to be written to.
570 574 * uio - structure supplying write location, range info,
571 575 * and data buffer.
572 576 * ioflag - FAPPEND flag set if in append mode.
573 577 * cr - credentials of caller.
574 578 * ct - caller context (NFS/CIFS fem monitor only)
575 579 *
576 580 * OUT: uio - updated offset and range.
577 581 *
578 582 * RETURN: 0 if success
579 583 * error code if failure
580 584 *
581 585 * Timestamps:
582 586 * vp - ctime|mtime updated if byte count > 0
583 587 */
584 588
585 589 /* ARGSUSED */
586 590 static int
587 591 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
588 592 {
589 593 znode_t *zp = VTOZ(vp);
590 594 rlim64_t limit = uio->uio_llimit;
591 595 ssize_t start_resid = uio->uio_resid;
592 596 ssize_t tx_bytes;
593 597 uint64_t end_size;
594 598 dmu_tx_t *tx;
595 599 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
596 600 zilog_t *zilog;
597 601 offset_t woff;
598 602 ssize_t n, nbytes;
599 603 rl_t *rl;
600 604 int max_blksz = zfsvfs->z_max_blksz;
601 605 int error;
602 606 arc_buf_t *abuf;
603 607 iovec_t *aiov;
604 608 xuio_t *xuio = NULL;
605 609 int i_iov = 0;
606 610 int iovcnt = uio->uio_iovcnt;
607 611 iovec_t *iovp = uio->uio_iov;
608 612 int write_eof;
609 613 int count = 0;
610 614 sa_bulk_attr_t bulk[4];
611 615 uint64_t mtime[2], ctime[2];
612 616
613 617 /*
614 618 * Fasttrack empty write
615 619 */
616 620 n = start_resid;
617 621 if (n == 0)
618 622 return (0);
619 623
620 624 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
621 625 limit = MAXOFFSET_T;
622 626
623 627 ZFS_ENTER(zfsvfs);
624 628 ZFS_VERIFY_ZP(zp);
625 629
626 630 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
627 631 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
628 632 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
629 633 &zp->z_size, 8);
630 634 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
631 635 &zp->z_pflags, 8);
632 636
633 637 /*
634 638 * If immutable or not appending then return EPERM
635 639 */
636 640 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
637 641 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
638 642 (uio->uio_loffset < zp->z_size))) {
639 643 ZFS_EXIT(zfsvfs);
640 644 return (EPERM);
641 645 }
642 646
643 647 zilog = zfsvfs->z_log;
644 648
645 649 /*
646 650 * Validate file offset
647 651 */
648 652 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
649 653 if (woff < 0) {
650 654 ZFS_EXIT(zfsvfs);
651 655 return (EINVAL);
652 656 }
653 657
654 658 /*
655 659 * Check for mandatory locks before calling zfs_range_lock()
656 660 * in order to prevent a deadlock with locks set via fcntl().
657 661 */
658 662 if (MANDMODE((mode_t)zp->z_mode) &&
659 663 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
660 664 ZFS_EXIT(zfsvfs);
661 665 return (error);
662 666 }
663 667
664 668 /*
 665  669          * Pre-fault the pages to ensure slow (e.g., NFS) pages
666 670 * don't hold up txg.
667 671 * Skip this if uio contains loaned arc_buf.
668 672 */
669 673 if ((uio->uio_extflg == UIO_XUIO) &&
670 674 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
671 675 xuio = (xuio_t *)uio;
672 676 else
673 677 uio_prefaultpages(MIN(n, max_blksz), uio);
674 678
675 679 /*
676 680 * If in append mode, set the io offset pointer to eof.
677 681 */
678 682 if (ioflag & FAPPEND) {
679 683 /*
680 684 * Obtain an appending range lock to guarantee file append
681 685 * semantics. We reset the write offset once we have the lock.
682 686 */
683 687 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
684 688 woff = rl->r_off;
685 689 if (rl->r_len == UINT64_MAX) {
686 690 /*
687 691 * We overlocked the file because this write will cause
688 692 * the file block size to increase.
689 693 * Note that zp_size cannot change with this lock held.
690 694 */
691 695 woff = zp->z_size;
692 696 }
693 697 uio->uio_loffset = woff;
694 698 } else {
695 699 /*
696 700 * Note that if the file block size will change as a result of
697 701 * this write, then this range lock will lock the entire file
698 702 * so that we can re-write the block safely.
699 703 */
700 704 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
701 705 }
702 706
703 707 if (woff >= limit) {
704 708 zfs_range_unlock(rl);
705 709 ZFS_EXIT(zfsvfs);
706 710 return (EFBIG);
707 711 }
708 712
709 713 if ((woff + n) > limit || woff > (limit - n))
710 714 n = limit - woff;
711 715
712 716 /* Will this write extend the file length? */
713 717 write_eof = (woff + n > zp->z_size);
714 718
715 719 end_size = MAX(zp->z_size, woff + n);
716 720
717 721 /*
718 722 * Write the file in reasonable size chunks. Each chunk is written
719 723 * in a separate transaction; this keeps the intent log records small
720 724 * and allows us to do more fine-grained space accounting.
721 725 */
722 726 while (n > 0) {
723 727 abuf = NULL;
724 728 woff = uio->uio_loffset;
725 729 again:
726 730 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
727 731 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
728 732 if (abuf != NULL)
729 733 dmu_return_arcbuf(abuf);
730 734 error = EDQUOT;
731 735 break;
732 736 }
733 737
734 738 if (xuio && abuf == NULL) {
735 739 ASSERT(i_iov < iovcnt);
736 740 aiov = &iovp[i_iov];
737 741 abuf = dmu_xuio_arcbuf(xuio, i_iov);
738 742 dmu_xuio_clear(xuio, i_iov);
739 743 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
740 744 iovec_t *, aiov, arc_buf_t *, abuf);
741 745 ASSERT((aiov->iov_base == abuf->b_data) ||
742 746 ((char *)aiov->iov_base - (char *)abuf->b_data +
743 747 aiov->iov_len == arc_buf_size(abuf)));
744 748 i_iov++;
745 749 } else if (abuf == NULL && n >= max_blksz &&
746 750 woff >= zp->z_size &&
747 751 P2PHASE(woff, max_blksz) == 0 &&
748 752 zp->z_blksz == max_blksz) {
749 753 /*
750 754 * This write covers a full block. "Borrow" a buffer
751 755 * from the dmu so that we can fill it before we enter
752 756 * a transaction. This avoids the possibility of
753 757 * holding up the transaction if the data copy hangs
754 758 * up on a pagefault (e.g., from an NFS server mapping).
755 759 */
756 760 size_t cbytes;
757 761
758 762 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
759 763 max_blksz);
760 764 ASSERT(abuf != NULL);
761 765 ASSERT(arc_buf_size(abuf) == max_blksz);
762 766 if (error = uiocopy(abuf->b_data, max_blksz,
763 767 UIO_WRITE, uio, &cbytes)) {
764 768 dmu_return_arcbuf(abuf);
765 769 break;
766 770 }
767 771 ASSERT(cbytes == max_blksz);
768 772 }
769 773
770 774 /*
771 775 * Start a transaction.
772 776 */
773 777 tx = dmu_tx_create(zfsvfs->z_os);
774 778 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
775 779 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
776 780 zfs_sa_upgrade_txholds(tx, zp);
777 781 error = dmu_tx_assign(tx, TXG_NOWAIT);
778 782 if (error) {
779 783 if (error == ERESTART) {
780 784 dmu_tx_wait(tx);
781 785 dmu_tx_abort(tx);
782 786 goto again;
783 787 }
784 788 dmu_tx_abort(tx);
785 789 if (abuf != NULL)
786 790 dmu_return_arcbuf(abuf);
787 791 break;
788 792 }
789 793
790 794 /*
791 795 * If zfs_range_lock() over-locked we grow the blocksize
792 796 * and then reduce the lock range. This will only happen
793 797 * on the first iteration since zfs_range_reduce() will
794 798 * shrink down r_len to the appropriate size.
795 799 */
796 800 if (rl->r_len == UINT64_MAX) {
797 801 uint64_t new_blksz;
798 802
799 803 if (zp->z_blksz > max_blksz) {
800 804 ASSERT(!ISP2(zp->z_blksz));
801 805 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
802 806 } else {
803 807 new_blksz = MIN(end_size, max_blksz);
804 808 }
805 809 zfs_grow_blocksize(zp, new_blksz, tx);
806 810 zfs_range_reduce(rl, woff, n);
807 811 }
808 812
809 813 /*
810 814 * XXX - should we really limit each write to z_max_blksz?
811 815 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
812 816 */
813 817 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
814 818
815 819 if (abuf == NULL) {
816 820 tx_bytes = uio->uio_resid;
817 821 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
818 822 uio, nbytes, tx);
819 823 tx_bytes -= uio->uio_resid;
820 824 } else {
821 825 tx_bytes = nbytes;
822 826 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
823 827 /*
824 828 * If this is not a full block write, but we are
825 829 * extending the file past EOF and this data starts
826 830 * block-aligned, use assign_arcbuf(). Otherwise,
827 831 * write via dmu_write().
828 832 */
829 833 if (tx_bytes < max_blksz && (!write_eof ||
830 834 aiov->iov_base != abuf->b_data)) {
831 835 ASSERT(xuio);
832 836 dmu_write(zfsvfs->z_os, zp->z_id, woff,
833 837 aiov->iov_len, aiov->iov_base, tx);
834 838 dmu_return_arcbuf(abuf);
835 839 xuio_stat_wbuf_copied();
836 840 } else {
837 841 ASSERT(xuio || tx_bytes == max_blksz);
838 842 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
839 843 woff, abuf, tx);
840 844 }
841 845 ASSERT(tx_bytes <= uio->uio_resid);
842 846 uioskip(uio, tx_bytes);
843 847 }
844 848 if (tx_bytes && vn_has_cached_data(vp)) {
845 849 update_pages(vp, woff,
846 850 tx_bytes, zfsvfs->z_os, zp->z_id);
847 851 }
848 852
849 853 /*
850 854 * If we made no progress, we're done. If we made even
851 855 * partial progress, update the znode and ZIL accordingly.
852 856 */
853 857 if (tx_bytes == 0) {
854 858 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
855 859 (void *)&zp->z_size, sizeof (uint64_t), tx);
856 860 dmu_tx_commit(tx);
857 861 ASSERT(error != 0);
858 862 break;
859 863 }
860 864
861 865 /*
862 866 * Clear Set-UID/Set-GID bits on successful write if not
 863  867                  * privileged and at least one of the execute bits is set.
864 868 *
 865  869                  * It would be nice to do this after all writes have
866 870 * been done, but that would still expose the ISUID/ISGID
867 871 * to another app after the partial write is committed.
868 872 *
869 873 * Note: we don't call zfs_fuid_map_id() here because
870 874 * user 0 is not an ephemeral uid.
871 875 */
872 876 mutex_enter(&zp->z_acl_lock);
873 877 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
874 878 (S_IXUSR >> 6))) != 0 &&
875 879 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
876 880 secpolicy_vnode_setid_retain(cr,
877 881 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
878 882 uint64_t newmode;
879 883 zp->z_mode &= ~(S_ISUID | S_ISGID);
880 884 newmode = zp->z_mode;
881 885 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
882 886 (void *)&newmode, sizeof (uint64_t), tx);
883 887 }
884 888 mutex_exit(&zp->z_acl_lock);
885 889
886 890 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
887 891 B_TRUE);
888 892
889 893 /*
890 894 * Update the file size (zp_size) if it has changed;
891 895 * account for possible concurrent updates.
892 896 */
893 897 while ((end_size = zp->z_size) < uio->uio_loffset) {
894 898 (void) atomic_cas_64(&zp->z_size, end_size,
895 899 uio->uio_loffset);
896 900 ASSERT(error == 0);
897 901 }
898 902 /*
 899  903                  * If we are replaying and eof is non-zero then force
900 904 * the file size to the specified eof. Note, there's no
901 905 * concurrency during replay.
902 906 */
903 907 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
904 908 zp->z_size = zfsvfs->z_replay_eof;
905 909
906 910 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
907 911
908 912 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
909 913 dmu_tx_commit(tx);
910 914
911 915 if (error != 0)
912 916 break;
913 917 ASSERT(tx_bytes == nbytes);
914 918 n -= nbytes;
915 919
916 920 if (!xuio && n > 0)
917 921 uio_prefaultpages(MIN(n, max_blksz), uio);
918 922 }
919 923
920 924 zfs_range_unlock(rl);
921 925
922 926 /*
923 927 * If we're in replay mode, or we made no progress, return error.
924 928 * Otherwise, it's at least a partial write, so it's successful.
925 929 */
926 930 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
927 931 ZFS_EXIT(zfsvfs);
928 932 return (error);
929 933 }
930 934
931 935 if (ioflag & (FSYNC | FDSYNC) ||
932 936 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
933 937 zil_commit(zilog, zp->z_id);
934 938
935 939 ZFS_EXIT(zfsvfs);
936 940 return (0);
937 941 }
938 942
939 943 void
940 944 zfs_get_done(zgd_t *zgd, int error)
941 945 {
942 946 znode_t *zp = zgd->zgd_private;
943 947 objset_t *os = zp->z_zfsvfs->z_os;
944 948
945 949 if (zgd->zgd_db)
946 950 dmu_buf_rele(zgd->zgd_db, zgd);
947 951
948 952 zfs_range_unlock(zgd->zgd_rl);
949 953
950 954 /*
951 955 * Release the vnode asynchronously as we currently have the
952 956 * txg stopped from syncing.
953 957 */
954 958 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
955 959
956 960 if (error == 0 && zgd->zgd_bp)
957 961 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
958 962
959 963 kmem_free(zgd, sizeof (zgd_t));
960 964 }
961 965
962 966 #ifdef DEBUG
963 967 static int zil_fault_io = 0;
964 968 #endif
965 969
966 970 /*
967 971 * Get data to generate a TX_WRITE intent log record.
968 972 */
969 973 int
970 974 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
971 975 {
972 976 zfsvfs_t *zfsvfs = arg;
973 977 objset_t *os = zfsvfs->z_os;
974 978 znode_t *zp;
975 979 uint64_t object = lr->lr_foid;
976 980 uint64_t offset = lr->lr_offset;
977 981 uint64_t size = lr->lr_length;
978 982 blkptr_t *bp = &lr->lr_blkptr;
979 983 dmu_buf_t *db;
980 984 zgd_t *zgd;
981 985 int error = 0;
982 986
983 987 ASSERT(zio != NULL);
984 988 ASSERT(size != 0);
985 989
986 990 /*
987 991 * Nothing to do if the file has been removed
988 992 */
989 993 if (zfs_zget(zfsvfs, object, &zp) != 0)
990 994 return (ENOENT);
991 995 if (zp->z_unlinked) {
992 996 /*
993 997 * Release the vnode asynchronously as we currently have the
994 998 * txg stopped from syncing.
995 999 */
996 1000 VN_RELE_ASYNC(ZTOV(zp),
997 1001 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
998 1002 return (ENOENT);
999 1003 }
1000 1004
1001 1005 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1002 1006 zgd->zgd_zilog = zfsvfs->z_log;
1003 1007 zgd->zgd_private = zp;
1004 1008
1005 1009 /*
1006 1010 * Write records come in two flavors: immediate and indirect.
1007 1011 * For small writes it's cheaper to store the data with the
1008 1012 * log record (immediate); for large writes it's cheaper to
1009 1013 * sync the data and get a pointer to it (indirect) so that
1010 1014 * we don't have to write the data twice.
1011 1015 */
1012 1016 if (buf != NULL) { /* immediate write */
1013 1017 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1014 1018 /* test for truncation needs to be done while range locked */
1015 1019 if (offset >= zp->z_size) {
1016 1020 error = ENOENT;
1017 1021 } else {
1018 1022 error = dmu_read(os, object, offset, size, buf,
1019 1023 DMU_READ_NO_PREFETCH);
1020 1024 }
1021 1025 ASSERT(error == 0 || error == ENOENT);
1022 1026 } else { /* indirect write */
1023 1027 /*
1024 1028 * Have to lock the whole block to ensure when it's
1025 1029                  * written out and its checksum is being calculated
1026 1030 * that no one can change the data. We need to re-check
1027 1031 * blocksize after we get the lock in case it's changed!
1028 1032 */
1029 1033 for (;;) {
1030 1034 uint64_t blkoff;
1031 1035 size = zp->z_blksz;
1032 1036 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1033 1037 offset -= blkoff;
1034 1038 zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1035 1039 RL_READER);
1036 1040 if (zp->z_blksz == size)
1037 1041 break;
1038 1042 offset += blkoff;
1039 1043 zfs_range_unlock(zgd->zgd_rl);
1040 1044 }
1041 1045 /* test for truncation needs to be done while range locked */
1042 1046 if (lr->lr_offset >= zp->z_size)
1043 1047 error = ENOENT;
1044 1048 #ifdef DEBUG
1045 1049 if (zil_fault_io) {
1046 1050 error = EIO;
1047 1051 zil_fault_io = 0;
1048 1052 }
1049 1053 #endif
1050 1054 if (error == 0)
1051 1055 error = dmu_buf_hold(os, object, offset, zgd, &db,
1052 1056 DMU_READ_NO_PREFETCH);
1053 1057
1054 1058 if (error == 0) {
1055 1059 zgd->zgd_db = db;
1056 1060 zgd->zgd_bp = bp;
1057 1061
1058 1062 ASSERT(db->db_offset == offset);
1059 1063 ASSERT(db->db_size == size);
1060 1064
1061 1065 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1062 1066 zfs_get_done, zgd);
1063 1067 ASSERT(error || lr->lr_length <= zp->z_blksz);
1064 1068
1065 1069 /*
1066 1070 * On success, we need to wait for the write I/O
1067 1071 * initiated by dmu_sync() to complete before we can
1068 1072 * release this dbuf. We will finish everything up
1069 1073 * in the zfs_get_done() callback.
1070 1074 */
1071 1075 if (error == 0)
1072 1076 return (0);
1073 1077
1074 1078 if (error == EALREADY) {
1075 1079 lr->lr_common.lrc_txtype = TX_WRITE2;
1076 1080 error = 0;
1077 1081 }
1078 1082 }
1079 1083 }
1080 1084
1081 1085 zfs_get_done(zgd, error);
1082 1086
1083 1087 return (error);
1084 1088 }
1085 1089
1086 1090 /*ARGSUSED*/
1087 1091 static int
1088 1092 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1089 1093 caller_context_t *ct)
1090 1094 {
1091 1095 znode_t *zp = VTOZ(vp);
1092 1096 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1093 1097 int error;
1094 1098
1095 1099 ZFS_ENTER(zfsvfs);
1096 1100 ZFS_VERIFY_ZP(zp);
1097 1101
1098 1102 if (flag & V_ACE_MASK)
1099 1103 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1100 1104 else
1101 1105 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1102 1106
1103 1107 ZFS_EXIT(zfsvfs);
1104 1108 return (error);
1105 1109 }
1106 1110
1107 1111 /*
1108 1112 * If vnode is for a device return a specfs vnode instead.
1109 1113 */
1110 1114 static int
1111 1115 specvp_check(vnode_t **vpp, cred_t *cr)
1112 1116 {
1113 1117 int error = 0;
1114 1118
1115 1119 if (IS_DEVVP(*vpp)) {
1116 1120 struct vnode *svp;
1117 1121
1118 1122 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1119 1123 VN_RELE(*vpp);
1120 1124 if (svp == NULL)
1121 1125 error = ENOSYS;
1122 1126 *vpp = svp;
1123 1127 }
1124 1128 return (error);
1125 1129 }
1126 1130
1127 1131
1128 1132 /*
1129 1133 * Lookup an entry in a directory, or an extended attribute directory.
1130 1134 * If it exists, return a held vnode reference for it.
1131 1135 *
1132 1136 * IN: dvp - vnode of directory to search.
1133 1137 * nm - name of entry to lookup.
1134 1138 * pnp - full pathname to lookup [UNUSED].
1135 1139 * flags - LOOKUP_XATTR set if looking for an attribute.
1136 1140 * rdir - root directory vnode [UNUSED].
1137 1141 * cr - credentials of caller.
1138 1142 * ct - caller context
1139 1143 * direntflags - directory lookup flags
1140 1144 * realpnp - returned pathname.
1141 1145 *
1142 1146 * OUT: vpp - vnode of located entry, NULL if not found.
1143 1147 *
1144 1148 * RETURN: 0 if success
1145 1149 * error code if failure
1146 1150 *
1147 1151 * Timestamps:
1148 1152 * NA
1149 1153 */
1150 1154 /* ARGSUSED */
1151 1155 static int
1152 1156 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1153 1157 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1154 1158 int *direntflags, pathname_t *realpnp)
1155 1159 {
1156 1160 znode_t *zdp = VTOZ(dvp);
1157 1161 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1158 1162 int error = 0;
1159 1163
1160 1164 /* fast path */
1161 1165 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1162 1166
1163 1167 if (dvp->v_type != VDIR) {
1164 1168 return (ENOTDIR);
1165 1169 } else if (zdp->z_sa_hdl == NULL) {
1166 1170 return (EIO);
1167 1171 }
1168 1172
1169 1173 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1170 1174 error = zfs_fastaccesschk_execute(zdp, cr);
1171 1175 if (!error) {
1172 1176 *vpp = dvp;
1173 1177 VN_HOLD(*vpp);
1174 1178 return (0);
1175 1179 }
1176 1180 return (error);
1177 1181 } else {
1178 1182 vnode_t *tvp = dnlc_lookup(dvp, nm);
1179 1183
1180 1184 if (tvp) {
1181 1185 error = zfs_fastaccesschk_execute(zdp, cr);
1182 1186 if (error) {
1183 1187 VN_RELE(tvp);
1184 1188 return (error);
1185 1189 }
1186 1190 if (tvp == DNLC_NO_VNODE) {
1187 1191 VN_RELE(tvp);
1188 1192 return (ENOENT);
1189 1193 } else {
1190 1194 *vpp = tvp;
1191 1195 return (specvp_check(vpp, cr));
1192 1196 }
1193 1197 }
1194 1198 }
1195 1199 }
1196 1200
1197 1201 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1198 1202
1199 1203 ZFS_ENTER(zfsvfs);
1200 1204 ZFS_VERIFY_ZP(zdp);
1201 1205
1202 1206 *vpp = NULL;
1203 1207
1204 1208 if (flags & LOOKUP_XATTR) {
1205 1209 /*
1206 1210 * If the xattr property is off, refuse the lookup request.
1207 1211 */
1208 1212 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1209 1213 ZFS_EXIT(zfsvfs);
1210 1214 return (EINVAL);
1211 1215 }
1212 1216
1213 1217 /*
1214 1218                  * We don't allow recursive attributes ...
1215 1219 * Maybe someday we will.
1216 1220 */
1217 1221 if (zdp->z_pflags & ZFS_XATTR) {
1218 1222 ZFS_EXIT(zfsvfs);
1219 1223 return (EINVAL);
1220 1224 }
1221 1225
1222 1226 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1223 1227 ZFS_EXIT(zfsvfs);
1224 1228 return (error);
1225 1229 }
1226 1230
1227 1231 /*
1228 1232 * Do we have permission to get into attribute directory?
1229 1233 */
1230 1234
1231 1235 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1232 1236 B_FALSE, cr)) {
1233 1237 VN_RELE(*vpp);
1234 1238 *vpp = NULL;
1235 1239 }
1236 1240
1237 1241 ZFS_EXIT(zfsvfs);
1238 1242 return (error);
1239 1243 }
1240 1244
1241 1245 if (dvp->v_type != VDIR) {
1242 1246 ZFS_EXIT(zfsvfs);
1243 1247 return (ENOTDIR);
1244 1248 }
1245 1249
1246 1250 /*
1247 1251 * Check accessibility of directory.
1248 1252 */
1249 1253
1250 1254 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1251 1255 ZFS_EXIT(zfsvfs);
1252 1256 return (error);
1253 1257 }
1254 1258
1255 1259 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1256 1260 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1257 1261 ZFS_EXIT(zfsvfs);
1258 1262 return (EILSEQ);
1259 1263 }
1260 1264
1261 1265 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1262 1266 if (error == 0)
1263 1267 error = specvp_check(vpp, cr);
1264 1268
1265 1269 ZFS_EXIT(zfsvfs);
1266 1270 return (error);
1267 1271 }
1268 1272
1269 1273 /*
1270 1274 * Attempt to create a new entry in a directory. If the entry
1271 1275 * already exists, truncate the file if permissible, else return
1272 1276 * an error. Return the vp of the created or trunc'd file.
1273 1277 *
1274 1278 * IN: dvp - vnode of directory to put new file entry in.
1275 1279 * name - name of new file entry.
1276 1280 * vap - attributes of new file.
1277 1281 * excl - flag indicating exclusive or non-exclusive mode.
1278 1282 * mode - mode to open file with.
1279 1283 * cr - credentials of caller.
1280 1284 * flag - large file flag [UNUSED].
1281 1285 * ct - caller context
1282 1286 * vsecp - ACL to be set
1283 1287 *
1284 1288 * OUT: vpp - vnode of created or trunc'd entry.
1285 1289 *
1286 1290 * RETURN: 0 if success
1287 1291 * error code if failure
1288 1292 *
1289 1293 * Timestamps:
1290 1294 * dvp - ctime|mtime updated if new entry created
1291 1295 * vp - ctime|mtime always, atime if new
1292 1296 */
1293 1297
1294 1298 /* ARGSUSED */
1295 1299 static int
1296 1300 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1297 1301 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1298 1302 vsecattr_t *vsecp)
1299 1303 {
1300 1304 znode_t *zp, *dzp = VTOZ(dvp);
1301 1305 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1302 1306 zilog_t *zilog;
1303 1307 objset_t *os;
1304 1308 zfs_dirlock_t *dl;
1305 1309 dmu_tx_t *tx;
1306 1310 int error;
1307 1311 ksid_t *ksid;
1308 1312 uid_t uid;
1309 1313 gid_t gid = crgetgid(cr);
1310 1314 zfs_acl_ids_t acl_ids;
1311 1315 boolean_t fuid_dirtied;
1312 1316 boolean_t have_acl = B_FALSE;
1313 1317
1314 1318 /*
1315 1319 * If we have an ephemeral id, ACL, or XVATTR then
1316 1320 * make sure file system is at proper version
1317 1321 */
1318 1322
1319 1323 ksid = crgetsid(cr, KSID_OWNER);
1320 1324 if (ksid)
1321 1325 uid = ksid_getid(ksid);
1322 1326 else
1323 1327 uid = crgetuid(cr);
1324 1328
1325 1329 if (zfsvfs->z_use_fuids == B_FALSE &&
1326 1330 (vsecp || (vap->va_mask & AT_XVATTR) ||
1327 1331 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1328 1332 return (EINVAL);
1329 1333
1330 1334 ZFS_ENTER(zfsvfs);
1331 1335 ZFS_VERIFY_ZP(dzp);
1332 1336 os = zfsvfs->z_os;
1333 1337 zilog = zfsvfs->z_log;
1334 1338
1335 1339 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1336 1340 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1337 1341 ZFS_EXIT(zfsvfs);
1338 1342 return (EILSEQ);
1339 1343 }
1340 1344
1341 1345 if (vap->va_mask & AT_XVATTR) {
1342 1346 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1343 1347 crgetuid(cr), cr, vap->va_type)) != 0) {
1344 1348 ZFS_EXIT(zfsvfs);
1345 1349 return (error);
1346 1350 }
1347 1351 }
1348 1352 top:
1349 1353 *vpp = NULL;
1350 1354
1351 1355 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1352 1356 vap->va_mode &= ~VSVTX;
1353 1357
1354 1358 if (*name == '\0') {
1355 1359 /*
1356 1360 * Null component name refers to the directory itself.
1357 1361 */
1358 1362 VN_HOLD(dvp);
1359 1363 zp = dzp;
1360 1364 dl = NULL;
1361 1365 error = 0;
1362 1366 } else {
1363 1367 /* possible VN_HOLD(zp) */
1364 1368 int zflg = 0;
1365 1369
1366 1370 if (flag & FIGNORECASE)
1367 1371 zflg |= ZCILOOK;
1368 1372
1369 1373 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1370 1374 NULL, NULL);
1371 1375 if (error) {
1372 1376 if (have_acl)
1373 1377 zfs_acl_ids_free(&acl_ids);
1374 1378 if (strcmp(name, "..") == 0)
1375 1379 error = EISDIR;
1376 1380 ZFS_EXIT(zfsvfs);
1377 1381 return (error);
1378 1382 }
1379 1383 }
1380 1384
1381 1385 if (zp == NULL) {
1382 1386 uint64_t txtype;
1383 1387
1384 1388 /*
1385 1389 * Create a new file object and update the directory
1386 1390 * to reference it.
1387 1391 */
1388 1392 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1389 1393 if (have_acl)
1390 1394 zfs_acl_ids_free(&acl_ids);
1391 1395 goto out;
1392 1396 }
1393 1397
1394 1398 /*
1395 1399 * We only support the creation of regular files in
1396 1400 * extended attribute directories.
1397 1401 */
1398 1402
1399 1403 if ((dzp->z_pflags & ZFS_XATTR) &&
1400 1404 (vap->va_type != VREG)) {
1401 1405 if (have_acl)
1402 1406 zfs_acl_ids_free(&acl_ids);
1403 1407 error = EINVAL;
1404 1408 goto out;
1405 1409 }
1406 1410
1407 1411 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1408 1412 cr, vsecp, &acl_ids)) != 0)
1409 1413 goto out;
1410 1414 have_acl = B_TRUE;
1411 1415
1412 1416 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1413 1417 zfs_acl_ids_free(&acl_ids);
1414 1418 error = EDQUOT;
1415 1419 goto out;
1416 1420 }
1417 1421
1418 1422 tx = dmu_tx_create(os);
1419 1423
1420 1424 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1421 1425 ZFS_SA_BASE_ATTR_SIZE);
1422 1426
1423 1427 fuid_dirtied = zfsvfs->z_fuid_dirty;
1424 1428 if (fuid_dirtied)
1425 1429 zfs_fuid_txhold(zfsvfs, tx);
1426 1430 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1427 1431 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1428 1432 if (!zfsvfs->z_use_sa &&
1429 1433 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1430 1434 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1431 1435 0, acl_ids.z_aclp->z_acl_bytes);
1432 1436 }
1433 1437 error = dmu_tx_assign(tx, TXG_NOWAIT);
1434 1438 if (error) {
1435 1439 zfs_dirent_unlock(dl);
1436 1440 if (error == ERESTART) {
1437 1441 dmu_tx_wait(tx);
1438 1442 dmu_tx_abort(tx);
1439 1443 goto top;
1440 1444 }
1441 1445 zfs_acl_ids_free(&acl_ids);
1442 1446 dmu_tx_abort(tx);
1443 1447 ZFS_EXIT(zfsvfs);
1444 1448 return (error);
1445 1449 }
1446 1450 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1447 1451
1448 1452 if (fuid_dirtied)
1449 1453 zfs_fuid_sync(zfsvfs, tx);
1450 1454
1451 1455 (void) zfs_link_create(dl, zp, tx, ZNEW);
1452 1456 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1453 1457 if (flag & FIGNORECASE)
1454 1458 txtype |= TX_CI;
1455 1459 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1456 1460 vsecp, acl_ids.z_fuidp, vap);
1457 1461 zfs_acl_ids_free(&acl_ids);
1458 1462 dmu_tx_commit(tx);
1459 1463 } else {
1460 1464 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1461 1465
1462 1466 if (have_acl)
1463 1467 zfs_acl_ids_free(&acl_ids);
1464 1468 have_acl = B_FALSE;
1465 1469
1466 1470 /*
1467 1471 * A directory entry already exists for this name.
1468 1472 */
1469 1473 /*
1470 1474 * Can't truncate an existing file if in exclusive mode.
1471 1475 */
1472 1476 if (excl == EXCL) {
1473 1477 error = EEXIST;
1474 1478 goto out;
1475 1479 }
1476 1480 /*
1477 1481 * Can't open a directory for writing.
1478 1482 */
1479 1483 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1480 1484 error = EISDIR;
1481 1485 goto out;
1482 1486 }
1483 1487 /*
1484 1488 * Verify requested access to file.
1485 1489 */
1486 1490 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1487 1491 goto out;
1488 1492 }
1489 1493
1490 1494 mutex_enter(&dzp->z_lock);
1491 1495 dzp->z_seq++;
1492 1496 mutex_exit(&dzp->z_lock);
1493 1497
1494 1498 /*
1495 1499 * Truncate regular files if requested.
1496 1500 */
1497 1501 if ((ZTOV(zp)->v_type == VREG) &&
1498 1502 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1499 1503 /* we can't hold any locks when calling zfs_freesp() */
1500 1504 zfs_dirent_unlock(dl);
1501 1505 dl = NULL;
1502 1506 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1503 1507 if (error == 0) {
1504 1508 vnevent_create(ZTOV(zp), ct);
1505 1509 }
1506 1510 }
1507 1511 }
1508 1512 out:
1509 1513
1510 1514 if (dl)
1511 1515 zfs_dirent_unlock(dl);
1512 1516
1513 1517 if (error) {
1514 1518 if (zp)
1515 1519 VN_RELE(ZTOV(zp));
1516 1520 } else {
1517 1521 *vpp = ZTOV(zp);
1518 1522 error = specvp_check(vpp, cr);
1519 1523 }
1520 1524
1521 1525 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1522 1526 zil_commit(zilog, 0);
1523 1527
1524 1528 ZFS_EXIT(zfsvfs);
1525 1529 return (error);
1526 1530 }
1527 1531
1528 1532 /*
1529 1533 * Remove an entry from a directory.
1530 1534 *
1531 1535 * IN: dvp - vnode of directory to remove entry from.
1532 1536 * name - name of entry to remove.
1533 1537 * cr - credentials of caller.
1534 1538 * ct - caller context
1535 1539 * flags - case flags
1536 1540 *
1537 1541 * RETURN: 0 if success
1538 1542 * error code if failure
1539 1543 *
1540 1544 * Timestamps:
1541 1545 * dvp - ctime|mtime
1542 1546 * vp - ctime (if nlink > 0)
1543 1547 */
1544 1548
1545 1549 uint64_t null_xattr = 0;
1546 1550
1547 1551 /*ARGSUSED*/
1548 1552 static int
1549 1553 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1550 1554 int flags)
1551 1555 {
1552 1556 znode_t *zp, *dzp = VTOZ(dvp);
1553 1557 znode_t *xzp;
1554 1558 vnode_t *vp;
1555 1559 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1556 1560 zilog_t *zilog;
1557 1561 uint64_t acl_obj, xattr_obj;
1558 1562 uint64_t xattr_obj_unlinked = 0;
1559 1563 uint64_t obj = 0;
1560 1564 zfs_dirlock_t *dl;
1561 1565 dmu_tx_t *tx;
1562 1566 boolean_t may_delete_now, delete_now = FALSE;
1563 1567 boolean_t unlinked, toobig = FALSE;
1564 1568 uint64_t txtype;
1565 1569 pathname_t *realnmp = NULL;
1566 1570 pathname_t realnm;
1567 1571 int error;
1568 1572 int zflg = ZEXISTS;
1569 1573
1570 1574 ZFS_ENTER(zfsvfs);
1571 1575 ZFS_VERIFY_ZP(dzp);
1572 1576 zilog = zfsvfs->z_log;
1573 1577
1574 1578 if (flags & FIGNORECASE) {
1575 1579 zflg |= ZCILOOK;
1576 1580 pn_alloc(&realnm);
1577 1581 realnmp = &realnm;
1578 1582 }
1579 1583
1580 1584 top:
1581 1585 xattr_obj = 0;
1582 1586 xzp = NULL;
1583 1587 /*
1584 1588 * Attempt to lock directory; fail if entry doesn't exist.
1585 1589 */
1586 1590 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1587 1591 NULL, realnmp)) {
1588 1592 if (realnmp)
1589 1593 pn_free(realnmp);
1590 1594 ZFS_EXIT(zfsvfs);
1591 1595 return (error);
1592 1596 }
1593 1597
1594 1598 vp = ZTOV(zp);
1595 1599
1596 1600 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1597 1601 goto out;
1598 1602 }
1599 1603
1600 1604 /*
1601 1605 * Need to use rmdir for removing directories.
1602 1606 */
1603 1607 if (vp->v_type == VDIR) {
1604 1608 error = EPERM;
1605 1609 goto out;
1606 1610 }
1607 1611
1608 1612 vnevent_remove(vp, dvp, name, ct);
1609 1613
1610 1614 if (realnmp)
1611 1615 dnlc_remove(dvp, realnmp->pn_buf);
1612 1616 else
1613 1617 dnlc_remove(dvp, name);
1614 1618
1615 1619 mutex_enter(&vp->v_lock);
1616 1620 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1617 1621 mutex_exit(&vp->v_lock);
1618 1622
1619 1623 /*
1620 1624 * We may delete the znode now, or we may put it in the unlinked set;
1621 1625 * it depends on whether we're the last link, and on whether there are
1622 1626 * other holds on the vnode. So we dmu_tx_hold() the right things to
1623 1627 * allow for either case.
1624 1628 */
1625 1629 obj = zp->z_id;
1626 1630 tx = dmu_tx_create(zfsvfs->z_os);
1627 1631 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1628 1632 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1629 1633 zfs_sa_upgrade_txholds(tx, zp);
1630 1634 zfs_sa_upgrade_txholds(tx, dzp);
1631 1635 if (may_delete_now) {
1632 1636 toobig =
1633 1637 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1634 1638 /* if the file is too big, only hold_free a token amount */
1635 1639 dmu_tx_hold_free(tx, zp->z_id, 0,
1636 1640 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1637 1641 }
1638 1642
1639 1643 /* are there any extended attributes? */
1640 1644 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1641 1645 &xattr_obj, sizeof (xattr_obj));
1642 1646 if (error == 0 && xattr_obj) {
1643 1647 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1644 - ASSERT3U(error, ==, 0);
1648 + ASSERT0(error);
1645 1649 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1646 1650 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1647 1651 }
1648 1652
1649 1653 mutex_enter(&zp->z_lock);
1650 1654 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1651 1655 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1652 1656 mutex_exit(&zp->z_lock);
1653 1657
1654 1658 /* charge as an update -- would be nice not to charge at all */
1655 1659 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1656 1660
1657 1661 error = dmu_tx_assign(tx, TXG_NOWAIT);
1658 1662 if (error) {
1659 1663 zfs_dirent_unlock(dl);
1660 1664 VN_RELE(vp);
1661 1665 if (xzp)
1662 1666 VN_RELE(ZTOV(xzp));
1663 1667 if (error == ERESTART) {
1664 1668 dmu_tx_wait(tx);
1665 1669 dmu_tx_abort(tx);
1666 1670 goto top;
1667 1671 }
1668 1672 if (realnmp)
1669 1673 pn_free(realnmp);
1670 1674 dmu_tx_abort(tx);
1671 1675 ZFS_EXIT(zfsvfs);
1672 1676 return (error);
1673 1677 }
1674 1678
1675 1679 /*
1676 1680 * Remove the directory entry.
1677 1681 */
1678 1682 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1679 1683
1680 1684 if (error) {
1681 1685 dmu_tx_commit(tx);
1682 1686 goto out;
1683 1687 }
1684 1688
1685 1689 if (unlinked) {
1686 1690
1687 1691 /*
1688 1692 * Hold z_lock so that we can make sure that the ACL obj
1689 1693 * hasn't changed. Could have been deleted due to
1690 1694 * zfs_sa_upgrade().
1691 1695 */
1692 1696 mutex_enter(&zp->z_lock);
1693 1697 mutex_enter(&vp->v_lock);
1694 1698 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1695 1699 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1696 1700 delete_now = may_delete_now && !toobig &&
1697 1701 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1698 1702 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1699 1703 acl_obj;
1700 1704 mutex_exit(&vp->v_lock);
1701 1705 }
1702 1706
1703 1707 if (delete_now) {
1704 1708 if (xattr_obj_unlinked) {
1705 1709 ASSERT3U(xzp->z_links, ==, 2);
1706 1710 mutex_enter(&xzp->z_lock);
1707 1711 xzp->z_unlinked = 1;
1708 1712 xzp->z_links = 0;
1709 1713 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1710 1714 &xzp->z_links, sizeof (xzp->z_links), tx);
1711 1715 ASSERT3U(error, ==, 0);
1712 1716 mutex_exit(&xzp->z_lock);
1713 1717 zfs_unlinked_add(xzp, tx);
1714 1718
1715 1719 if (zp->z_is_sa)
1716 1720 error = sa_remove(zp->z_sa_hdl,
1717 1721 SA_ZPL_XATTR(zfsvfs), tx);
1718 1722 else
1719 1723 error = sa_update(zp->z_sa_hdl,
1720 1724 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1721 1725 sizeof (uint64_t), tx);
1722 - ASSERT3U(error, ==, 0);
1726 + ASSERT0(error);
1723 1727 }
1724 1728 mutex_enter(&vp->v_lock);
1725 1729 vp->v_count--;
1726 - ASSERT3U(vp->v_count, ==, 0);
1730 + ASSERT0(vp->v_count);
1727 1731 mutex_exit(&vp->v_lock);
1728 1732 mutex_exit(&zp->z_lock);
1729 1733 zfs_znode_delete(zp, tx);
1730 1734 } else if (unlinked) {
1731 1735 mutex_exit(&zp->z_lock);
1732 1736 zfs_unlinked_add(zp, tx);
1733 1737 }
1734 1738
1735 1739 txtype = TX_REMOVE;
1736 1740 if (flags & FIGNORECASE)
1737 1741 txtype |= TX_CI;
1738 1742 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1739 1743
1740 1744 dmu_tx_commit(tx);
1741 1745 out:
1742 1746 if (realnmp)
1743 1747 pn_free(realnmp);
1744 1748
1745 1749 zfs_dirent_unlock(dl);
1746 1750
1747 1751 if (!delete_now)
1748 1752 VN_RELE(vp);
1749 1753 if (xzp)
1750 1754 VN_RELE(ZTOV(xzp));
1751 1755
1752 1756 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1753 1757 zil_commit(zilog, 0);
1754 1758
1755 1759 ZFS_EXIT(zfsvfs);
1756 1760 return (error);
1757 1761 }
1758 1762
1759 1763 /*
1760 1764 * Create a new directory and insert it into dvp using the name
1761 1765 * provided. Return a pointer to the inserted directory.
1762 1766 *
1763 1767 * IN: dvp - vnode of directory to add subdir to.
1764 1768 * dirname - name of new directory.
1765 1769 * vap - attributes of new directory.
1766 1770 * cr - credentials of caller.
1767 1771 * ct - caller context
1768 1772 * vsecp - ACL to be set
1769 1773 *
1770 1774 * OUT: vpp - vnode of created directory.
1771 1775 *
1772 1776 * RETURN: 0 if success
1773 1777 * error code if failure
1774 1778 *
1775 1779 * Timestamps:
1776 1780 * dvp - ctime|mtime updated
1777 1781 * vp - ctime|mtime|atime updated
1778 1782 */
1779 1783 /*ARGSUSED*/
1780 1784 static int
1781 1785 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1782 1786 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1783 1787 {
1784 1788 znode_t *zp, *dzp = VTOZ(dvp);
1785 1789 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1786 1790 zilog_t *zilog;
1787 1791 zfs_dirlock_t *dl;
1788 1792 uint64_t txtype;
1789 1793 dmu_tx_t *tx;
1790 1794 int error;
1791 1795 int zf = ZNEW;
1792 1796 ksid_t *ksid;
1793 1797 uid_t uid;
1794 1798 gid_t gid = crgetgid(cr);
1795 1799 zfs_acl_ids_t acl_ids;
1796 1800 boolean_t fuid_dirtied;
1797 1801
1798 1802 ASSERT(vap->va_type == VDIR);
1799 1803
1800 1804 /*
1801 1805	 * If we have an ephemeral id, ACL, or XVATTR, then
1802 1806	 * make sure the file system is at the proper version.
1803 1807 */
1804 1808
1805 1809 ksid = crgetsid(cr, KSID_OWNER);
1806 1810 if (ksid)
1807 1811 uid = ksid_getid(ksid);
1808 1812 else
1809 1813 uid = crgetuid(cr);
1810 1814 if (zfsvfs->z_use_fuids == B_FALSE &&
1811 1815 (vsecp || (vap->va_mask & AT_XVATTR) ||
1812 1816 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1813 1817 return (EINVAL);
1814 1818
1815 1819 ZFS_ENTER(zfsvfs);
1816 1820 ZFS_VERIFY_ZP(dzp);
1817 1821 zilog = zfsvfs->z_log;
1818 1822
1819 1823 if (dzp->z_pflags & ZFS_XATTR) {
1820 1824 ZFS_EXIT(zfsvfs);
1821 1825 return (EINVAL);
1822 1826 }
1823 1827
1824 1828 if (zfsvfs->z_utf8 && u8_validate(dirname,
1825 1829 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1826 1830 ZFS_EXIT(zfsvfs);
1827 1831 return (EILSEQ);
1828 1832 }
1829 1833 if (flags & FIGNORECASE)
1830 1834 zf |= ZCILOOK;
1831 1835
1832 1836 if (vap->va_mask & AT_XVATTR) {
1833 1837 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1834 1838 crgetuid(cr), cr, vap->va_type)) != 0) {
1835 1839 ZFS_EXIT(zfsvfs);
1836 1840 return (error);
1837 1841 }
1838 1842 }
1839 1843
1840 1844 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1841 1845 vsecp, &acl_ids)) != 0) {
1842 1846 ZFS_EXIT(zfsvfs);
1843 1847 return (error);
1844 1848 }
1845 1849 /*
1846 1850 * First make sure the new directory doesn't exist.
1847 1851 *
1848 1852 * Existence is checked first to make sure we don't return
1849 1853	 * EACCES instead of EEXIST, which can cause some applications
1850 1854 * to fail.
1851 1855 */
1852 1856 top:
1853 1857 *vpp = NULL;
1854 1858
1855 1859 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1856 1860 NULL, NULL)) {
1857 1861 zfs_acl_ids_free(&acl_ids);
1858 1862 ZFS_EXIT(zfsvfs);
1859 1863 return (error);
1860 1864 }
1861 1865
1862 1866 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1863 1867 zfs_acl_ids_free(&acl_ids);
1864 1868 zfs_dirent_unlock(dl);
1865 1869 ZFS_EXIT(zfsvfs);
1866 1870 return (error);
1867 1871 }
1868 1872
1869 1873 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1870 1874 zfs_acl_ids_free(&acl_ids);
1871 1875 zfs_dirent_unlock(dl);
1872 1876 ZFS_EXIT(zfsvfs);
1873 1877 return (EDQUOT);
1874 1878 }
1875 1879
1876 1880 /*
1877 1881 * Add a new entry to the directory.
1878 1882 */
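	/*
	 * Hold the parent directory's ZAP for the entry insert and a
	 * ZAP for the new directory object itself.  The FUID tables
	 * are held if dirty, and when SAs are not in use a large ACL
	 * needs its own write hold.
	 */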
1879 1883 tx = dmu_tx_create(zfsvfs->z_os);
1880 1884 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1881 1885 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1882 1886 fuid_dirtied = zfsvfs->z_fuid_dirty;
1883 1887 if (fuid_dirtied)
1884 1888 zfs_fuid_txhold(zfsvfs, tx);
1885 1889 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1886 1890 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1887 1891 acl_ids.z_aclp->z_acl_bytes);
1888 1892 }
1889 1893
1890 1894 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1891 1895 ZFS_SA_BASE_ATTR_SIZE);
1892 1896
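	/*
	 * TXG_NOWAIT fails with ERESTART instead of blocking when the
	 * transaction cannot join the open txg.  In that case, drop the
	 * dirent lock, wait for the next txg, and retry from "top:" so
	 * the lock ordering is preserved.
	 */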
1893 1897 error = dmu_tx_assign(tx, TXG_NOWAIT);
1894 1898 if (error) {
1895 1899 zfs_dirent_unlock(dl);
1896 1900 if (error == ERESTART) {
1897 1901 dmu_tx_wait(tx);
1898 1902 dmu_tx_abort(tx);
1899 1903 goto top;
1900 1904 }
1901 1905 zfs_acl_ids_free(&acl_ids);
1902 1906 dmu_tx_abort(tx);
1903 1907 ZFS_EXIT(zfsvfs);
1904 1908 return (error);
1905 1909 }
1906 1910
1907 1911 /*
1908 1912 * Create new node.
1909 1913 */
1910 1914 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1911 1915
1912 1916 if (fuid_dirtied)
1913 1917 zfs_fuid_sync(zfsvfs, tx);
1914 1918
1915 1919 /*
1916 1920 * Now put new name in parent dir.
1917 1921 */
1918 1922 (void) zfs_link_create(dl, zp, tx, ZNEW);
1919 1923
1920 1924 *vpp = ZTOV(zp);
1921 1925
1922 1926 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1923 1927 if (flags & FIGNORECASE)
1924 1928 txtype |= TX_CI;
1925 1929 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1926 1930 acl_ids.z_fuidp, vap);
1927 1931
1928 1932 zfs_acl_ids_free(&acl_ids);
1929 1933
1930 1934 dmu_tx_commit(tx);
1931 1935
1932 1936 zfs_dirent_unlock(dl);
1933 1937
1934 1938 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1935 1939 zil_commit(zilog, 0);
1936 1940
1937 1941 ZFS_EXIT(zfsvfs);
1938 1942 return (0);
1939 1943 }
1940 1944
1941 1945 /*
1942 1946 * Remove a directory subdir entry. If the current working
1943 1947 * directory is the same as the subdir to be removed, the
1944 1948 * remove will fail.
1945 1949 *
1946 1950 * IN: dvp - vnode of directory to remove from.
1947 1951 * name - name of directory to be removed.
1948 1952 * cwd - vnode of current working directory.
1949 1953 * cr - credentials of caller.
1950 1954 * ct - caller context
1951 1955 * flags - case flags
1952 1956 *
1953 1957 * RETURN: 0 if success
1954 1958 * error code if failure
1955 1959 *
1956 1960 * Timestamps:
1957 1961 * dvp - ctime|mtime updated
1958 1962 */
1959 1963 /*ARGSUSED*/
1960 1964 static int
1961 1965 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1962 1966 caller_context_t *ct, int flags)
1963 1967 {
1964 1968 znode_t *dzp = VTOZ(dvp);
1965 1969 znode_t *zp;
1966 1970 vnode_t *vp;
1967 1971 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1968 1972 zilog_t *zilog;
1969 1973 zfs_dirlock_t *dl;
1970 1974 dmu_tx_t *tx;
1971 1975 int error;
1972 1976 int zflg = ZEXISTS;
1973 1977
1974 1978 ZFS_ENTER(zfsvfs);
1975 1979 ZFS_VERIFY_ZP(dzp);
1976 1980 zilog = zfsvfs->z_log;
1977 1981
1978 1982 if (flags & FIGNORECASE)
1979 1983 zflg |= ZCILOOK;
1980 1984 top:
1981 1985 zp = NULL;
1982 1986
1983 1987 /*
1984 1988 * Attempt to lock directory; fail if entry doesn't exist.
1985 1989 */
1986 1990 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1987 1991 NULL, NULL)) {
1988 1992 ZFS_EXIT(zfsvfs);
1989 1993 return (error);
1990 1994 }
1991 1995
1992 1996 vp = ZTOV(zp);
1993 1997
1994 1998 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1995 1999 goto out;
1996 2000 }
1997 2001
1998 2002 if (vp->v_type != VDIR) {
1999 2003 error = ENOTDIR;
2000 2004 goto out;
2001 2005 }
2002 2006
2003 2007 if (vp == cwd) {
2004 2008 error = EINVAL;
2005 2009 goto out;
2006 2010 }
2007 2011
2008 2012 vnevent_rmdir(vp, dvp, name, ct);
2009 2013
2010 2014 /*
2011 2015	 * Grab a lock on the directory to make sure that no one is
2012 2016 * trying to add (or lookup) entries while we are removing it.
2013 2017 */
2014 2018 rw_enter(&zp->z_name_lock, RW_WRITER);
2015 2019
2016 2020 /*
2017 2021 * Grab a lock on the parent pointer to make sure we play well
2018 2022 * with the treewalk and directory rename code.
2019 2023 */
2020 2024 rw_enter(&zp->z_parent_lock, RW_WRITER);
2021 2025
2022 2026 tx = dmu_tx_create(zfsvfs->z_os);
2023 2027 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2024 2028 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2025 2029 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2026 2030 zfs_sa_upgrade_txholds(tx, zp);
2027 2031 zfs_sa_upgrade_txholds(tx, dzp);
2028 2032 error = dmu_tx_assign(tx, TXG_NOWAIT);
2029 2033 if (error) {
2030 2034 rw_exit(&zp->z_parent_lock);
2031 2035 rw_exit(&zp->z_name_lock);
2032 2036 zfs_dirent_unlock(dl);
2033 2037 VN_RELE(vp);
2034 2038 if (error == ERESTART) {
2035 2039 dmu_tx_wait(tx);
2036 2040 dmu_tx_abort(tx);
2037 2041 goto top;
2038 2042 }
2039 2043 dmu_tx_abort(tx);
2040 2044 ZFS_EXIT(zfsvfs);
2041 2045 return (error);
2042 2046 }
2043 2047
2044 2048 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2045 2049
2046 2050 if (error == 0) {
2047 2051 uint64_t txtype = TX_RMDIR;
2048 2052 if (flags & FIGNORECASE)
2049 2053 txtype |= TX_CI;
2050 2054 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2051 2055 }
2052 2056
2053 2057 dmu_tx_commit(tx);
2054 2058
2055 2059 rw_exit(&zp->z_parent_lock);
2056 2060 rw_exit(&zp->z_name_lock);
2057 2061 out:
2058 2062 zfs_dirent_unlock(dl);
2059 2063
2060 2064 VN_RELE(vp);
2061 2065
2062 2066 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2063 2067 zil_commit(zilog, 0);
2064 2068
2065 2069 ZFS_EXIT(zfsvfs);
2066 2070 return (error);
2067 2071 }
2068 2072
2069 2073 /*
2070 2074 * Read as many directory entries as will fit into the provided
2071 2075 * buffer from the given directory cursor position (specified in
2072 2076	 * the uio structure).
2073 2077 *
2074 2078 * IN: vp - vnode of directory to read.
2075 2079 * uio - structure supplying read location, range info,
2076 2080 * and return buffer.
2077 2081 * cr - credentials of caller.
2078 2082 * ct - caller context
2079 2083 * flags - case flags
2080 2084 *
2081 2085 * OUT: uio - updated offset and range, buffer filled.
2082 2086 * eofp - set to true if end-of-file detected.
2083 2087 *
2084 2088 * RETURN: 0 if success
2085 2089 * error code if failure
2086 2090 *
2087 2091 * Timestamps:
2088 2092 * vp - atime updated
2089 2093 *
2090 2094	 * Note that the low 4 bits of the cookie returned by zap are always zero.
2091 2095 * This allows us to use the low range for "special" directory entries:
2092 2096 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2093 2097 * we use the offset 2 for the '.zfs' directory.
2094 2098 */
2095 2099 /* ARGSUSED */
2096 2100 static int
2097 2101 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2098 2102 caller_context_t *ct, int flags)
2099 2103 {
2100 2104 znode_t *zp = VTOZ(vp);
2101 2105 iovec_t *iovp;
2102 2106 edirent_t *eodp;
2103 2107 dirent64_t *odp;
2104 2108 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2105 2109 objset_t *os;
2106 2110 caddr_t outbuf;
2107 2111 size_t bufsize;
2108 2112 zap_cursor_t zc;
2109 2113 zap_attribute_t zap;
2110 2114 uint_t bytes_wanted;
2111 2115 uint64_t offset; /* must be unsigned; checks for < 1 */
2112 2116 uint64_t parent;
2113 2117 int local_eof;
2114 2118 int outcount;
2115 2119 int error;
2116 2120 uint8_t prefetch;
2117 2121 boolean_t check_sysattrs;
2118 2122
2119 2123 ZFS_ENTER(zfsvfs);
2120 2124 ZFS_VERIFY_ZP(zp);
2121 2125
2122 2126 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2123 2127 &parent, sizeof (parent))) != 0) {
2124 2128 ZFS_EXIT(zfsvfs);
2125 2129 return (error);
2126 2130 }
2127 2131
2128 2132 /*
2129 2133 * If we are not given an eof variable,
2130 2134 * use a local one.
2131 2135 */
2132 2136 if (eofp == NULL)
2133 2137 eofp = &local_eof;
2134 2138
2135 2139 /*
2136 2140 * Check for valid iov_len.
2137 2141 */
2138 2142 if (uio->uio_iov->iov_len <= 0) {
2139 2143 ZFS_EXIT(zfsvfs);
2140 2144 return (EINVAL);
2141 2145 }
2142 2146
2143 2147 /*
2144 2148	 * Quit if the directory has been removed (POSIX).
2145 2149 */
2146 2150 if ((*eofp = zp->z_unlinked) != 0) {
2147 2151 ZFS_EXIT(zfsvfs);
2148 2152 return (0);
2149 2153 }
2150 2154
2151 2155 error = 0;
2152 2156 os = zfsvfs->z_os;
2153 2157 offset = uio->uio_loffset;
2154 2158 prefetch = zp->z_zn_prefetch;
2155 2159
2156 2160 /*
2157 2161 * Initialize the iterator cursor.
2158 2162 */
2159 2163 if (offset <= 3) {
2160 2164 /*
2161 2165 * Start iteration from the beginning of the directory.
2162 2166 */
2163 2167 zap_cursor_init(&zc, os, zp->z_id);
2164 2168 } else {
2165 2169 /*
2166 2170 * The offset is a serialized cursor.
2167 2171 */
2168 2172 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2169 2173 }
2170 2174
2171 2175 /*
2172 2176	 * Get space to convert directory entries into an fs-independent format.
2173 2177 */
2174 2178 iovp = uio->uio_iov;
2175 2179 bytes_wanted = iovp->iov_len;
2176 2180 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2177 2181 bufsize = bytes_wanted;
2178 2182 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2179 2183 odp = (struct dirent64 *)outbuf;
2180 2184 } else {
2181 2185 bufsize = bytes_wanted;
2182 2186 odp = (struct dirent64 *)iovp->iov_base;
2183 2187 }
2184 2188 eodp = (struct edirent *)odp;
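	/*
	 * eodp and odp alias the same output buffer; it is filled with
	 * edirent_t records when V_RDDIR_ENTFLAGS is set, and with
	 * plain dirent64_t records otherwise.
	 */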
2185 2189
2186 2190 /*
2187 2191 * If this VFS supports the system attribute view interface; and
2188 2192 * we're looking at an extended attribute directory; and we care
2189 2193 * about normalization conflicts on this vfs; then we must check
2190 2194 * for normalization conflicts with the sysattr name space.
2191 2195 */
2192 2196 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2193 2197 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2194 2198 (flags & V_RDDIR_ENTFLAGS);
2195 2199
2196 2200 /*
2197 2201 * Transform to file-system independent format
2198 2202 */
2199 2203 outcount = 0;
2200 2204 while (outcount < bytes_wanted) {
2201 2205 ino64_t objnum;
2202 2206 ushort_t reclen;
2203 2207 off64_t *next = NULL;
2204 2208
2205 2209 /*
2206 2210 * Special case `.', `..', and `.zfs'.
2207 2211 */
2208 2212 if (offset == 0) {
2209 2213 (void) strcpy(zap.za_name, ".");
2210 2214 zap.za_normalization_conflict = 0;
2211 2215 objnum = zp->z_id;
2212 2216 } else if (offset == 1) {
2213 2217 (void) strcpy(zap.za_name, "..");
2214 2218 zap.za_normalization_conflict = 0;
2215 2219 objnum = parent;
2216 2220 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2217 2221 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2218 2222 zap.za_normalization_conflict = 0;
2219 2223 objnum = ZFSCTL_INO_ROOT;
2220 2224 } else {
2221 2225 /*
2222 2226 * Grab next entry.
2223 2227 */
2224 2228 if (error = zap_cursor_retrieve(&zc, &zap)) {
2225 2229 if ((*eofp = (error == ENOENT)) != 0)
2226 2230 break;
2227 2231 else
2228 2232 goto update;
2229 2233 }
2230 2234
2231 2235 if (zap.za_integer_length != 8 ||
2232 2236 zap.za_num_integers != 1) {
2233 2237 cmn_err(CE_WARN, "zap_readdir: bad directory "
2234 2238 "entry, obj = %lld, offset = %lld\n",
2235 2239 (u_longlong_t)zp->z_id,
2236 2240 (u_longlong_t)offset);
2237 2241 error = ENXIO;
2238 2242 goto update;
2239 2243 }
2240 2244
2241 2245 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2242 2246 /*
2243 2247	 * Mac OS X can extract the object type here, e.g.:
2244 2248 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2245 2249 */
2246 2250
2247 2251 if (check_sysattrs && !zap.za_normalization_conflict) {
2248 2252 zap.za_normalization_conflict =
2249 2253 xattr_sysattr_casechk(zap.za_name);
2250 2254 }
2251 2255 }
2252 2256
2253 2257 if (flags & V_RDDIR_ACCFILTER) {
2254 2258 /*
2255 2259 * If we have no access at all, don't include
2256 2260 * this entry in the returned information
2257 2261 */
2258 2262 znode_t *ezp;
2259 2263 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2260 2264 goto skip_entry;
2261 2265 if (!zfs_has_access(ezp, cr)) {
2262 2266 VN_RELE(ZTOV(ezp));
2263 2267 goto skip_entry;
2264 2268 }
2265 2269 VN_RELE(ZTOV(ezp));
2266 2270 }
2267 2271
2268 2272 if (flags & V_RDDIR_ENTFLAGS)
2269 2273 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2270 2274 else
2271 2275 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2272 2276
2273 2277 /*
2274 2278 * Will this entry fit in the buffer?
2275 2279 */
2276 2280 if (outcount + reclen > bufsize) {
2277 2281 /*
2278 2282 * Did we manage to fit anything in the buffer?
2279 2283 */
2280 2284 if (!outcount) {
2281 2285 error = EINVAL;
2282 2286 goto update;
2283 2287 }
2284 2288 break;
2285 2289 }
2286 2290 if (flags & V_RDDIR_ENTFLAGS) {
2287 2291 /*
2288 2292 * Add extended flag entry:
2289 2293 */
2290 2294 eodp->ed_ino = objnum;
2291 2295 eodp->ed_reclen = reclen;
2292 2296 /* NOTE: ed_off is the offset for the *next* entry */
2293 2297 next = &(eodp->ed_off);
2294 2298 eodp->ed_eflags = zap.za_normalization_conflict ?
2295 2299 ED_CASE_CONFLICT : 0;
2296 2300 (void) strncpy(eodp->ed_name, zap.za_name,
2297 2301 EDIRENT_NAMELEN(reclen));
2298 2302 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2299 2303 } else {
2300 2304 /*
2301 2305 * Add normal entry:
2302 2306 */
2303 2307 odp->d_ino = objnum;
2304 2308 odp->d_reclen = reclen;
2305 2309 /* NOTE: d_off is the offset for the *next* entry */
2306 2310 next = &(odp->d_off);
2307 2311 (void) strncpy(odp->d_name, zap.za_name,
2308 2312 DIRENT64_NAMELEN(reclen));
2309 2313 odp = (dirent64_t *)((intptr_t)odp + reclen);
2310 2314 }
2311 2315 outcount += reclen;
2312 2316
2313 2317 ASSERT(outcount <= bufsize);
2314 2318
2315 2319 /* Prefetch znode */
2316 2320 if (prefetch)
2317 2321 dmu_prefetch(os, objnum, 0, 0);
2318 2322
2319 2323 skip_entry:
2320 2324 /*
2321 2325 * Move to the next entry, fill in the previous offset.
2322 2326 */
2323 2327 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2324 2328 zap_cursor_advance(&zc);
2325 2329 offset = zap_cursor_serialize(&zc);
2326 2330 } else {
2327 2331 offset += 1;
2328 2332 }
2329 2333 if (next)
2330 2334 *next = offset;
2331 2335 }
2332 2336 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2333 2337
2334 2338 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2335 2339 iovp->iov_base += outcount;
2336 2340 iovp->iov_len -= outcount;
2337 2341 uio->uio_resid -= outcount;
2338 2342 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2339 2343 /*
2340 2344 * Reset the pointer.
2341 2345 */
2342 2346 offset = uio->uio_loffset;
2343 2347 }
2344 2348
2345 2349 update:
2346 2350 zap_cursor_fini(&zc);
2347 2351 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2348 2352 kmem_free(outbuf, bufsize);
2349 2353
2350 2354 if (error == ENOENT)
2351 2355 error = 0;
2352 2356
2353 2357 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2354 2358
2355 2359 uio->uio_loffset = offset;
2356 2360 ZFS_EXIT(zfsvfs);
2357 2361 return (error);
2358 2362 }
2359 2363
2360 2364 ulong_t zfs_fsync_sync_cnt = 4;
2361 2365
2362 2366 static int
2363 2367 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2364 2368 {
2365 2369 znode_t *zp = VTOZ(vp);
2366 2370 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2367 2371
2368 2372 /*
2369 2373 * Regardless of whether this is required for standards conformance,
2370 2374 * this is the logical behavior when fsync() is called on a file with
2371 2375 * dirty pages. We use B_ASYNC since the ZIL transactions are already
2372 2376 * going to be pushed out as part of the zil_commit().
2373 2377 */
2374 2378 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2375 2379 (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2376 2380 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2377 2381
2378 2382 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2379 2383
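	/*
	 * Unless synchronous semantics are disabled for this dataset,
	 * push the intent log records for this file out to stable
	 * storage before returning.
	 */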
2380 2384 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2381 2385 ZFS_ENTER(zfsvfs);
2382 2386 ZFS_VERIFY_ZP(zp);
2383 2387 zil_commit(zfsvfs->z_log, zp->z_id);
2384 2388 ZFS_EXIT(zfsvfs);
2385 2389 }
2386 2390 return (0);
2387 2391 }
2388 2392
2389 2393
2390 2394 /*
2391 2395 * Get the requested file attributes and place them in the provided
2392 2396 * vattr structure.
2393 2397 *
2394 2398 * IN: vp - vnode of file.
2395 2399 * vap - va_mask identifies requested attributes.
2396 2400 * If AT_XVATTR set, then optional attrs are requested
2397 2401 * flags - ATTR_NOACLCHECK (CIFS server context)
2398 2402 * cr - credentials of caller.
2399 2403 * ct - caller context
2400 2404 *
2401 2405 * OUT: vap - attribute values.
2402 2406 *
2403 2407 * RETURN: 0 (always succeeds)
2404 2408 */
2405 2409 /* ARGSUSED */
2406 2410 static int
2407 2411 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2408 2412 caller_context_t *ct)
2409 2413 {
2410 2414 znode_t *zp = VTOZ(vp);
2411 2415 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2412 2416 int error = 0;
2413 2417 uint64_t links;
2414 2418 uint64_t mtime[2], ctime[2];
2415 2419 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2416 2420 xoptattr_t *xoap = NULL;
2417 2421 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2418 2422 sa_bulk_attr_t bulk[2];
2419 2423 int count = 0;
2420 2424
2421 2425 ZFS_ENTER(zfsvfs);
2422 2426 ZFS_VERIFY_ZP(zp);
2423 2427
2424 2428 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2425 2429
2426 2430 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2427 2431 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2428 2432
2429 2433 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2430 2434 ZFS_EXIT(zfsvfs);
2431 2435 return (error);
2432 2436 }
2433 2437
2434 2438 /*
2435 2439	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2436 2440	 * Also, if we are the owner, don't bother, since the owner should
2437 2441	 * always be allowed to read the basic attributes of the file.
2438 2442 */
2439 2443 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2440 2444 (vap->va_uid != crgetuid(cr))) {
2441 2445 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2442 2446 skipaclchk, cr)) {
2443 2447 ZFS_EXIT(zfsvfs);
2444 2448 return (error);
2445 2449 }
2446 2450 }
2447 2451
2448 2452 /*
2449 2453 * Return all attributes. It's cheaper to provide the answer
2450 2454 * than to determine whether we were asked the question.
2451 2455 */
2452 2456
2453 2457 mutex_enter(&zp->z_lock);
2454 2458 vap->va_type = vp->v_type;
2455 2459 vap->va_mode = zp->z_mode & MODEMASK;
2456 2460 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2457 2461 vap->va_nodeid = zp->z_id;
2458 2462 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2459 2463 links = zp->z_links + 1;
2460 2464 else
2461 2465 links = zp->z_links;
2462 2466 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2463 2467 vap->va_size = zp->z_size;
2464 2468 vap->va_rdev = vp->v_rdev;
2465 2469 vap->va_seq = zp->z_seq;
2466 2470
2467 2471 /*
2468 2472 * Add in any requested optional attributes and the create time.
2469 2473 * Also set the corresponding bits in the returned attribute bitmap.
2470 2474 */
2471 2475 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2472 2476 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2473 2477 xoap->xoa_archive =
2474 2478 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2475 2479 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2476 2480 }
2477 2481
2478 2482 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2479 2483 xoap->xoa_readonly =
2480 2484 ((zp->z_pflags & ZFS_READONLY) != 0);
2481 2485 XVA_SET_RTN(xvap, XAT_READONLY);
2482 2486 }
2483 2487
2484 2488 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2485 2489 xoap->xoa_system =
2486 2490 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2487 2491 XVA_SET_RTN(xvap, XAT_SYSTEM);
2488 2492 }
2489 2493
2490 2494 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2491 2495 xoap->xoa_hidden =
2492 2496 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2493 2497 XVA_SET_RTN(xvap, XAT_HIDDEN);
2494 2498 }
2495 2499
2496 2500 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2497 2501 xoap->xoa_nounlink =
2498 2502 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2499 2503 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2500 2504 }
2501 2505
2502 2506 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2503 2507 xoap->xoa_immutable =
2504 2508 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2505 2509 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2506 2510 }
2507 2511
2508 2512 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2509 2513 xoap->xoa_appendonly =
2510 2514 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2511 2515 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2512 2516 }
2513 2517
2514 2518 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2515 2519 xoap->xoa_nodump =
2516 2520 ((zp->z_pflags & ZFS_NODUMP) != 0);
2517 2521 XVA_SET_RTN(xvap, XAT_NODUMP);
2518 2522 }
2519 2523
2520 2524 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2521 2525 xoap->xoa_opaque =
2522 2526 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2523 2527 XVA_SET_RTN(xvap, XAT_OPAQUE);
2524 2528 }
2525 2529
2526 2530 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2527 2531 xoap->xoa_av_quarantined =
2528 2532 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2529 2533 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2530 2534 }
2531 2535
2532 2536 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2533 2537 xoap->xoa_av_modified =
2534 2538 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2535 2539 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2536 2540 }
2537 2541
2538 2542 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2539 2543 vp->v_type == VREG) {
2540 2544 zfs_sa_get_scanstamp(zp, xvap);
2541 2545 }
2542 2546
2543 2547 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2544 2548 uint64_t times[2];
2545 2549
2546 2550 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2547 2551 times, sizeof (times));
2548 2552 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2549 2553 XVA_SET_RTN(xvap, XAT_CREATETIME);
2550 2554 }
2551 2555
2552 2556 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2553 2557 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2554 2558 XVA_SET_RTN(xvap, XAT_REPARSE);
2555 2559 }
2556 2560 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2557 2561 xoap->xoa_generation = zp->z_gen;
2558 2562 XVA_SET_RTN(xvap, XAT_GEN);
2559 2563 }
2560 2564
2561 2565 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2562 2566 xoap->xoa_offline =
2563 2567 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2564 2568 XVA_SET_RTN(xvap, XAT_OFFLINE);
2565 2569 }
2566 2570
2567 2571 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2568 2572 xoap->xoa_sparse =
2569 2573 ((zp->z_pflags & ZFS_SPARSE) != 0);
2570 2574 XVA_SET_RTN(xvap, XAT_SPARSE);
2571 2575 }
2572 2576 }
2573 2577
2574 2578 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2575 2579 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2576 2580 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2577 2581
2578 2582 mutex_exit(&zp->z_lock);
2579 2583
2580 2584 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2581 2585
2582 2586 if (zp->z_blksz == 0) {
2583 2587 /*
2584 2588 * Block size hasn't been set; suggest maximal I/O transfers.
2585 2589 */
2586 2590 vap->va_blksize = zfsvfs->z_max_blksz;
2587 2591 }
2588 2592
2589 2593 ZFS_EXIT(zfsvfs);
2590 2594 return (0);
2591 2595 }
2592 2596
2593 2597 /*
2594 2598 * Set the file attributes to the values contained in the
2595 2599 * vattr structure.
2596 2600 *
2597 2601 * IN: vp - vnode of file to be modified.
2598 2602 * vap - new attribute values.
2599 2603 * If AT_XVATTR set, then optional attrs are being set
2600 2604 * flags - ATTR_UTIME set if non-default time values provided.
2601 2605 * - ATTR_NOACLCHECK (CIFS context only).
2602 2606 * cr - credentials of caller.
2603 2607 * ct - caller context
2604 2608 *
2605 2609 * RETURN: 0 if success
2606 2610 * error code if failure
2607 2611 *
2608 2612 * Timestamps:
2609 2613 * vp - ctime updated, mtime updated if size changed.
2610 2614 */
2611 2615 /* ARGSUSED */
2612 2616 static int
2613 2617 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2614 2618 caller_context_t *ct)
2615 2619 {
2616 2620 znode_t *zp = VTOZ(vp);
2617 2621 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2618 2622 zilog_t *zilog;
2619 2623 dmu_tx_t *tx;
2620 2624 vattr_t oldva;
2621 2625 xvattr_t tmpxvattr;
2622 2626 uint_t mask = vap->va_mask;
2623 2627 uint_t saved_mask;
2624 2628 int trim_mask = 0;
2625 2629 uint64_t new_mode;
2626 2630 uint64_t new_uid, new_gid;
2627 2631 uint64_t xattr_obj;
2628 2632 uint64_t mtime[2], ctime[2];
2629 2633 znode_t *attrzp;
2630 2634 int need_policy = FALSE;
2631 2635 int err, err2;
2632 2636 zfs_fuid_info_t *fuidp = NULL;
2633 2637 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2634 2638 xoptattr_t *xoap;
2635 2639 zfs_acl_t *aclp;
2636 2640 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2637 2641 boolean_t fuid_dirtied = B_FALSE;
2638 2642 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2639 2643 int count = 0, xattr_count = 0;
2640 2644
2641 2645 if (mask == 0)
2642 2646 return (0);
2643 2647
2644 2648 if (mask & AT_NOSET)
2645 2649 return (EINVAL);
2646 2650
2647 2651 ZFS_ENTER(zfsvfs);
2648 2652 ZFS_VERIFY_ZP(zp);
2649 2653
2650 2654 zilog = zfsvfs->z_log;
2651 2655
2652 2656 /*
2653 2657	 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
2654 2658	 * the file system is at the proper version level.
2655 2659 */
2656 2660
2657 2661 if (zfsvfs->z_use_fuids == B_FALSE &&
2658 2662 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2659 2663 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2660 2664 (mask & AT_XVATTR))) {
2661 2665 ZFS_EXIT(zfsvfs);
2662 2666 return (EINVAL);
2663 2667 }
2664 2668
2665 2669 if (mask & AT_SIZE && vp->v_type == VDIR) {
2666 2670 ZFS_EXIT(zfsvfs);
2667 2671 return (EISDIR);
2668 2672 }
2669 2673
2670 2674 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2671 2675 ZFS_EXIT(zfsvfs);
2672 2676 return (EINVAL);
2673 2677 }
2674 2678
2675 2679 /*
2676 2680 * If this is an xvattr_t, then get a pointer to the structure of
2677 2681 * optional attributes. If this is NULL, then we have a vattr_t.
2678 2682 */
2679 2683 xoap = xva_getxoptattr(xvap);
2680 2684
2681 2685 xva_init(&tmpxvattr);
2682 2686
2683 2687 /*
2684 2688	 * For immutable files, only the immutable bit and atime may be altered.
2685 2689 */
2686 2690 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2687 2691 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2688 2692 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2689 2693 ZFS_EXIT(zfsvfs);
2690 2694 return (EPERM);
2691 2695 }
2692 2696
2693 2697 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2694 2698 ZFS_EXIT(zfsvfs);
2695 2699 return (EPERM);
2696 2700 }
2697 2701
2698 2702 /*
2699 2703	 * Verify the timestamps don't overflow 32 bits.
2700 2704	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2701 2705 * handle times greater than 2039. This check should be removed
2702 2706 * once large timestamps are fully supported.
2703 2707 */
2704 2708 if (mask & (AT_ATIME | AT_MTIME)) {
2705 2709 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2706 2710 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2707 2711 ZFS_EXIT(zfsvfs);
2708 2712 return (EOVERFLOW);
2709 2713 }
2710 2714 }
2711 2715
2712 2716 top:
2713 2717 attrzp = NULL;
2714 2718 aclp = NULL;
2715 2719
2716 2720 /* Can this be moved to before the top label? */
2717 2721 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2718 2722 ZFS_EXIT(zfsvfs);
2719 2723 return (EROFS);
2720 2724 }
2721 2725
2722 2726 /*
2723 2727 * First validate permissions
2724 2728 */
2725 2729
2726 2730 if (mask & AT_SIZE) {
2727 2731 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2728 2732 if (err) {
2729 2733 ZFS_EXIT(zfsvfs);
2730 2734 return (err);
2731 2735 }
2732 2736 /*
2733 2737 * XXX - Note, we are not providing any open
2734 2738 * mode flags here (like FNDELAY), so we may
2735 2739 * block if there are locks present... this
2736 2740 * should be addressed in openat().
2737 2741 */
2738 2742 /* XXX - would it be OK to generate a log record here? */
2739 2743 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2740 2744 if (err) {
2741 2745 ZFS_EXIT(zfsvfs);
2742 2746 return (err);
2743 2747 }
2744 2748 }
2745 2749
2746 2750 if (mask & (AT_ATIME|AT_MTIME) ||
2747 2751 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2748 2752 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2749 2753 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2750 2754 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2751 2755 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2752 2756 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2753 2757 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2754 2758 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2755 2759 skipaclchk, cr);
2756 2760 }
2757 2761
2758 2762 if (mask & (AT_UID|AT_GID)) {
2759 2763 int idmask = (mask & (AT_UID|AT_GID));
2760 2764 int take_owner;
2761 2765 int take_group;
2762 2766
2763 2767 /*
2764 2768 * NOTE: even if a new mode is being set,
2765 2769 * we may clear S_ISUID/S_ISGID bits.
2766 2770 */
2767 2771
2768 2772 if (!(mask & AT_MODE))
2769 2773 vap->va_mode = zp->z_mode;
2770 2774
2771 2775 /*
2772 2776	 * Take ownership or chgrp to a group we are a member of.
2773 2777 */
2774 2778
2775 2779 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2776 2780 take_group = (mask & AT_GID) &&
2777 2781 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2778 2782
2779 2783 /*
2780 2784 * If both AT_UID and AT_GID are set then take_owner and
2781 2785 * take_group must both be set in order to allow taking
2782 2786 * ownership.
2783 2787 *
2784 2788 * Otherwise, send the check through secpolicy_vnode_setattr()
2785 2789 *
2786 2790 */
2787 2791
2788 2792 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2789 2793 ((idmask == AT_UID) && take_owner) ||
2790 2794 ((idmask == AT_GID) && take_group)) {
2791 2795 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2792 2796 skipaclchk, cr) == 0) {
2793 2797 /*
2794 2798 * Remove setuid/setgid for non-privileged users
2795 2799 */
2796 2800 secpolicy_setid_clear(vap, cr);
2797 2801 trim_mask = (mask & (AT_UID|AT_GID));
2798 2802 } else {
2799 2803 need_policy = TRUE;
2800 2804 }
2801 2805 } else {
2802 2806 need_policy = TRUE;
2803 2807 }
2804 2808 }
2805 2809
2806 2810 mutex_enter(&zp->z_lock);
2807 2811 oldva.va_mode = zp->z_mode;
2808 2812 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2809 2813 if (mask & AT_XVATTR) {
2810 2814 /*
2811 2815 * Update xvattr mask to include only those attributes
2812 2816 * that are actually changing.
2813 2817 *
2814 2818	 * The bits will be restored prior to actually setting
2815 2819	 * the attributes, so the caller thinks they were set.
2816 2820 */
2817 2821 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2818 2822 if (xoap->xoa_appendonly !=
2819 2823 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2820 2824 need_policy = TRUE;
2821 2825 } else {
2822 2826 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2823 2827 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2824 2828 }
2825 2829 }
2826 2830
2827 2831 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2828 2832 if (xoap->xoa_nounlink !=
2829 2833 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2830 2834 need_policy = TRUE;
2831 2835 } else {
2832 2836 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2833 2837 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2834 2838 }
2835 2839 }
2836 2840
2837 2841 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2838 2842 if (xoap->xoa_immutable !=
2839 2843 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2840 2844 need_policy = TRUE;
2841 2845 } else {
2842 2846 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2843 2847 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2844 2848 }
2845 2849 }
2846 2850
2847 2851 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2848 2852 if (xoap->xoa_nodump !=
2849 2853 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2850 2854 need_policy = TRUE;
2851 2855 } else {
2852 2856 XVA_CLR_REQ(xvap, XAT_NODUMP);
2853 2857 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2854 2858 }
2855 2859 }
2856 2860
2857 2861 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2858 2862 if (xoap->xoa_av_modified !=
2859 2863 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2860 2864 need_policy = TRUE;
2861 2865 } else {
2862 2866 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2863 2867 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2864 2868 }
2865 2869 }
2866 2870
2867 2871 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2868 2872 if ((vp->v_type != VREG &&
2869 2873 xoap->xoa_av_quarantined) ||
2870 2874 xoap->xoa_av_quarantined !=
2871 2875 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2872 2876 need_policy = TRUE;
2873 2877 } else {
2874 2878 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2875 2879 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2876 2880 }
2877 2881 }
2878 2882
2879 2883 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2880 2884 mutex_exit(&zp->z_lock);
2881 2885 ZFS_EXIT(zfsvfs);
2882 2886 return (EPERM);
2883 2887 }
2884 2888
2885 2889 if (need_policy == FALSE &&
2886 2890 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2887 2891 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2888 2892 need_policy = TRUE;
2889 2893 }
2890 2894 }
2891 2895
2892 2896 mutex_exit(&zp->z_lock);
2893 2897
2894 2898 if (mask & AT_MODE) {
2895 2899 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2896 2900 err = secpolicy_setid_setsticky_clear(vp, vap,
2897 2901 &oldva, cr);
2898 2902 if (err) {
2899 2903 ZFS_EXIT(zfsvfs);
2900 2904 return (err);
2901 2905 }
2902 2906 trim_mask |= AT_MODE;
2903 2907 } else {
2904 2908 need_policy = TRUE;
2905 2909 }
2906 2910 }
2907 2911
2908 2912 if (need_policy) {
2909 2913 /*
2910 2914	 * If trim_mask is set, then take-ownership
2911 2915	 * has been granted or write_acl is present and the user
2912 2916	 * has the ability to modify the mode.  In that case remove
2913 2917	 * UID|GID and/or MODE from the mask so that
2914 2918	 * secpolicy_vnode_setattr() doesn't revoke it.
2915 2919 */
2916 2920
2917 2921 if (trim_mask) {
2918 2922 saved_mask = vap->va_mask;
2919 2923 vap->va_mask &= ~trim_mask;
2920 2924 }
2921 2925 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2922 2926 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2923 2927 if (err) {
2924 2928 ZFS_EXIT(zfsvfs);
2925 2929 return (err);
2926 2930 }
2927 2931
2928 2932 if (trim_mask)
2929 2933 vap->va_mask |= saved_mask;
2930 2934 }
2931 2935
2932 2936 /*
2933 2937	 * secpolicy_vnode_setattr() or take-ownership may have
2934 2938	 * changed va_mask.
2935 2939 */
2936 2940 mask = vap->va_mask;
2937 2941
2938 2942 if ((mask & (AT_UID | AT_GID))) {
2939 2943 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2940 2944 &xattr_obj, sizeof (xattr_obj));
2941 2945
2942 2946 if (err == 0 && xattr_obj) {
2943 2947 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2944 2948 if (err)
2945 2949 goto out2;
2946 2950 }
2947 2951 if (mask & AT_UID) {
2948 2952 new_uid = zfs_fuid_create(zfsvfs,
2949 2953 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2950 2954 if (new_uid != zp->z_uid &&
2951 2955 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
2952 2956 if (attrzp)
2953 2957 VN_RELE(ZTOV(attrzp));
2954 2958 err = EDQUOT;
2955 2959 goto out2;
2956 2960 }
2957 2961 }
2958 2962
2959 2963 if (mask & AT_GID) {
2960 2964 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2961 2965 cr, ZFS_GROUP, &fuidp);
2962 2966 if (new_gid != zp->z_gid &&
2963 2967 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
2964 2968 if (attrzp)
2965 2969 VN_RELE(ZTOV(attrzp));
2966 2970 err = EDQUOT;
2967 2971 goto out2;
2968 2972 }
2969 2973 }
2970 2974 }
2971 2975 tx = dmu_tx_create(zfsvfs->z_os);
2972 2976
2973 2977 if (mask & AT_MODE) {
2974 2978 uint64_t pmode = zp->z_mode;
2975 2979 uint64_t acl_obj;
2976 2980 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2977 2981
2978 2982 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
2979 2983 goto out;
2980 2984
2981 2985 mutex_enter(&zp->z_lock);
2982 2986 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2983 2987 /*
2984 2988 * Are we upgrading ACL from old V0 format
2985 2989 * to V1 format?
2986 2990 */
2987 2991 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2988 2992 zfs_znode_acl_version(zp) ==
2989 2993 ZFS_ACL_VERSION_INITIAL) {
2990 2994 dmu_tx_hold_free(tx, acl_obj, 0,
2991 2995 DMU_OBJECT_END);
2992 2996 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2993 2997 0, aclp->z_acl_bytes);
2994 2998 } else {
2995 2999 dmu_tx_hold_write(tx, acl_obj, 0,
2996 3000 aclp->z_acl_bytes);
2997 3001 }
2998 3002 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2999 3003 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3000 3004 0, aclp->z_acl_bytes);
3001 3005 }
3002 3006 mutex_exit(&zp->z_lock);
3003 3007 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3004 3008 } else {
3005 3009 if ((mask & AT_XVATTR) &&
3006 3010 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3007 3011 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3008 3012 else
3009 3013 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3010 3014 }
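	/*
	 * The last argument to dmu_tx_hold_sa() indicates whether the
	 * SA may grow: a mode or scanstamp change can add attributes
	 * and force a new SA layout, so those paths pass B_TRUE.
	 */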
3011 3015
3012 3016 if (attrzp) {
3013 3017 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3014 3018 }
3015 3019
3016 3020 fuid_dirtied = zfsvfs->z_fuid_dirty;
3017 3021 if (fuid_dirtied)
3018 3022 zfs_fuid_txhold(zfsvfs, tx);
3019 3023
3020 3024 zfs_sa_upgrade_txholds(tx, zp);
3021 3025
3022 3026 err = dmu_tx_assign(tx, TXG_NOWAIT);
3023 3027 if (err) {
3024 3028 if (err == ERESTART)
3025 3029 dmu_tx_wait(tx);
3026 3030 goto out;
3027 3031 }
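	/*
	 * Unlike other callers, on ERESTART we only wait here; the
	 * abort happens at "out:", which releases aclp and attrzp
	 * before retrying at "top:".
	 */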
3028 3032
3029 3033 count = 0;
3030 3034 /*
3031 3035 * Set each attribute requested.
3032 3036 * We group settings according to the locks they need to acquire.
3033 3037 *
3034 3038 * Note: you cannot set ctime directly, although it will be
3035 3039 * updated as a side-effect of calling this function.
3036 3040 */
3037 3041
3038 3042
3039 3043 if (mask & (AT_UID|AT_GID|AT_MODE))
3040 3044 mutex_enter(&zp->z_acl_lock);
3041 3045 mutex_enter(&zp->z_lock);
3042 3046
3043 3047 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3044 3048 &zp->z_pflags, sizeof (zp->z_pflags));
3045 3049
3046 3050 if (attrzp) {
3047 3051 if (mask & (AT_UID|AT_GID|AT_MODE))
3048 3052 mutex_enter(&attrzp->z_acl_lock);
3049 3053 mutex_enter(&attrzp->z_lock);
3050 3054 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3051 3055 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3052 3056 sizeof (attrzp->z_pflags));
3053 3057 }
3054 3058
3055 3059 if (mask & (AT_UID|AT_GID)) {
3056 3060
3057 3061 if (mask & AT_UID) {
3058 3062 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3059 3063 &new_uid, sizeof (new_uid));
3060 3064 zp->z_uid = new_uid;
3061 3065 if (attrzp) {
3062 3066 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3063 3067 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3064 3068 sizeof (new_uid));
3065 3069 attrzp->z_uid = new_uid;
3066 3070 }
3067 3071 }
3068 3072
3069 3073 if (mask & AT_GID) {
3070 3074 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3071 3075 NULL, &new_gid, sizeof (new_gid));
3072 3076 zp->z_gid = new_gid;
3073 3077 if (attrzp) {
3074 3078 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3075 3079 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3076 3080 sizeof (new_gid));
3077 3081 attrzp->z_gid = new_gid;
3078 3082 }
3079 3083 }
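		/*
		 * SA_ADD_BULK_ATTR() records a pointer, not a value; the
		 * data is copied when sa_bulk_update() runs, so assigning
		 * new_mode after registering it is safe.  The existing
		 * mode is rewritten unchanged as part of the ownership
		 * change.
		 */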
3080 3084 if (!(mask & AT_MODE)) {
3081 3085 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3082 3086 NULL, &new_mode, sizeof (new_mode));
3083 3087 new_mode = zp->z_mode;
3084 3088 }
3085 3089 err = zfs_acl_chown_setattr(zp);
3086 3090 ASSERT(err == 0);
3087 3091 if (attrzp) {
3088 3092 err = zfs_acl_chown_setattr(attrzp);
3089 3093 ASSERT(err == 0);
3090 3094 }
3091 3095 }
3092 3096
3093 3097 if (mask & AT_MODE) {
3094 3098 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3095 3099 &new_mode, sizeof (new_mode));
3096 3100 zp->z_mode = new_mode;
3097 3101 ASSERT3U((uintptr_t)aclp, !=, NULL);
3098 3102 err = zfs_aclset_common(zp, aclp, cr, tx);
3099 - ASSERT3U(err, ==, 0);
3103 + ASSERT0(err);
3100 3104 if (zp->z_acl_cached)
3101 3105 zfs_acl_free(zp->z_acl_cached);
3102 3106 zp->z_acl_cached = aclp;
3103 3107 aclp = NULL;
3104 3108 }
3105 3109
3106 3110
3107 3111 if (mask & AT_ATIME) {
3108 3112 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3109 3113 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3110 3114 &zp->z_atime, sizeof (zp->z_atime));
3111 3115 }
3112 3116
3113 3117 if (mask & AT_MTIME) {
3114 3118 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3115 3119 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3116 3120 mtime, sizeof (mtime));
3117 3121 }
3118 3122
3119 3123 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3120 3124 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3121 3125 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3122 3126 NULL, mtime, sizeof (mtime));
3123 3127 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3124 3128 &ctime, sizeof (ctime));
3125 3129 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3126 3130 B_TRUE);
3127 3131 } else if (mask != 0) {
3128 3132 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3129 3133 &ctime, sizeof (ctime));
3130 3134 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3131 3135 B_TRUE);
3132 3136 if (attrzp) {
3133 3137 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3134 3138 SA_ZPL_CTIME(zfsvfs), NULL,
3135 3139 &ctime, sizeof (ctime));
3136 3140 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3137 3141 mtime, ctime, B_TRUE);
3138 3142 }
3139 3143 }
3140 3144 /*
3141 3145 * Do this after setting timestamps to prevent timestamp
3142 3146	 * update from toggling the bit.
3143 3147 */
3144 3148
3145 3149 if (xoap && (mask & AT_XVATTR)) {
3146 3150
3147 3151 /*
3148 3152	 * Restore the trimmed-off masks
3149 3153	 * so that the return masks can be set for the caller.
3150 3154 */
3151 3155
3152 3156 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3153 3157 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3154 3158 }
3155 3159 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3156 3160 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3157 3161 }
3158 3162 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3159 3163 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3160 3164 }
3161 3165 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3162 3166 XVA_SET_REQ(xvap, XAT_NODUMP);
3163 3167 }
3164 3168 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3165 3169 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3166 3170 }
3167 3171 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3168 3172 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3169 3173 }
3170 3174
3171 3175 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3172 3176 ASSERT(vp->v_type == VREG);
3173 3177
3174 3178 zfs_xvattr_set(zp, xvap, tx);
3175 3179 }
3176 3180
3177 3181 if (fuid_dirtied)
3178 3182 zfs_fuid_sync(zfsvfs, tx);
3179 3183
3180 3184 if (mask != 0)
3181 3185 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3182 3186
3183 3187 mutex_exit(&zp->z_lock);
3184 3188 if (mask & (AT_UID|AT_GID|AT_MODE))
3185 3189 mutex_exit(&zp->z_acl_lock);
3186 3190
3187 3191 if (attrzp) {
3188 3192 if (mask & (AT_UID|AT_GID|AT_MODE))
3189 3193 mutex_exit(&attrzp->z_acl_lock);
3190 3194 mutex_exit(&attrzp->z_lock);
3191 3195 }
3192 3196 out:
3193 3197 if (err == 0 && attrzp) {
3194 3198 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3195 3199 xattr_count, tx);
3196 3200 ASSERT(err2 == 0);
3197 3201 }
3198 3202
3199 3203 if (attrzp)
3200 3204 VN_RELE(ZTOV(attrzp));
3201 3205 if (aclp)
3202 3206 zfs_acl_free(aclp);
3203 3207
3204 3208 if (fuidp) {
3205 3209 zfs_fuid_info_free(fuidp);
3206 3210 fuidp = NULL;
3207 3211 }
3208 3212
3209 3213 if (err) {
3210 3214 dmu_tx_abort(tx);
3211 3215 if (err == ERESTART)
3212 3216 goto top;
3213 3217 } else {
3214 3218 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3215 3219 dmu_tx_commit(tx);
3216 3220 }
3217 3221
3218 3222 out2:
3219 3223 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3220 3224 zil_commit(zilog, 0);
3221 3225
3222 3226 ZFS_EXIT(zfsvfs);
3223 3227 return (err);
3224 3228 }
3225 3229
3226 3230 typedef struct zfs_zlock {
3227 3231 krwlock_t *zl_rwlock; /* lock we acquired */
3228 3232 znode_t *zl_znode; /* znode we held */
3229 3233 struct zfs_zlock *zl_next; /* next in list */
3230 3234 } zfs_zlock_t;
3231 3235
3232 3236 /*
3233 3237 * Drop locks and release vnodes that were held by zfs_rename_lock().
3234 3238 */
3235 3239 static void
3236 3240 zfs_rename_unlock(zfs_zlock_t **zlpp)
3237 3241 {
3238 3242 zfs_zlock_t *zl;
3239 3243
3240 3244 while ((zl = *zlpp) != NULL) {
3241 3245 if (zl->zl_znode != NULL)
3242 3246 VN_RELE(ZTOV(zl->zl_znode));
3243 3247 rw_exit(zl->zl_rwlock);
3244 3248 *zlpp = zl->zl_next;
3245 3249 kmem_free(zl, sizeof (*zl));
3246 3250 }
3247 3251 }
3248 3252
3249 3253 /*
3250 3254 * Search back through the directory tree, using the ".." entries.
3251 3255 * Lock each directory in the chain to prevent concurrent renames.
3252 3256 * Fail any attempt to move a directory into one of its own descendants.
3253 3257 * XXX - z_parent_lock can overlap with map or grow locks
3254 3258 */
3255 3259 static int
3256 3260 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3257 3261 {
3258 3262 zfs_zlock_t *zl;
3259 3263 znode_t *zp = tdzp;
3260 3264 uint64_t rootid = zp->z_zfsvfs->z_root;
3261 3265 uint64_t oidp = zp->z_id;
3262 3266 krwlock_t *rwlp = &szp->z_parent_lock;
3263 3267 krw_t rw = RW_WRITER;
3264 3268
3265 3269 /*
3266 3270 * First pass write-locks szp and compares to zp->z_id.
3267 3271 * Later passes read-lock zp and compare to zp->z_parent.
3268 3272 */
3269 3273 do {
3270 3274 if (!rw_tryenter(rwlp, rw)) {
3271 3275 /*
3272 3276 * Another thread is renaming in this path.
3273 3277 * Note that if we are a WRITER, we don't have any
3274 3278 * parent_locks held yet.
3275 3279 */
3276 3280 if (rw == RW_READER && zp->z_id > szp->z_id) {
3277 3281 /*
3278 3282 * Drop our locks and restart
3279 3283 */
3280 3284 zfs_rename_unlock(&zl);
3281 3285 *zlpp = NULL;
3282 3286 zp = tdzp;
3283 3287 oidp = zp->z_id;
3284 3288 rwlp = &szp->z_parent_lock;
3285 3289 rw = RW_WRITER;
3286 3290 continue;
3287 3291 } else {
3288 3292 /*
3289 3293	 * Wait for the other thread to drop its locks.
3290 3294 */
3291 3295 rw_enter(rwlp, rw);
3292 3296 }
3293 3297 }
3294 3298
3295 3299 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3296 3300 zl->zl_rwlock = rwlp;
3297 3301 zl->zl_znode = NULL;
3298 3302 zl->zl_next = *zlpp;
3299 3303 *zlpp = zl;
3300 3304
3301 3305 if (oidp == szp->z_id) /* We're a descendant of szp */
3302 3306 return (EINVAL);
3303 3307
3304 3308 if (oidp == rootid) /* We've hit the top */
3305 3309 return (0);
3306 3310
3307 3311 if (rw == RW_READER) { /* i.e. not the first pass */
3308 3312 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3309 3313 if (error)
3310 3314 return (error);
3311 3315 zl->zl_znode = zp;
3312 3316 }
3313 3317 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3314 3318 &oidp, sizeof (oidp));
3315 3319 rwlp = &zp->z_parent_lock;
3316 3320 rw = RW_READER;
3317 3321
3318 3322 } while (zp->z_id != sdzp->z_id);
3319 3323
3320 3324 return (0);
3321 3325 }
3322 3326
3323 3327 /*
3324 3328 * Move an entry from the provided source directory to the target
3325 3329 * directory. Change the entry name as indicated.
3326 3330 *
3327 3331 * IN: sdvp - Source directory containing the "old entry".
3328 3332 * snm - Old entry name.
3329 3333 * tdvp - Target directory to contain the "new entry".
3330 3334 * tnm - New entry name.
3331 3335 * cr - credentials of caller.
3332 3336 * ct - caller context
3333 3337 * flags - case flags
3334 3338 *
3335 3339 * RETURN: 0 if success
3336 3340 * error code if failure
3337 3341 *
3338 3342 * Timestamps:
3339 3343 * sdvp,tdvp - ctime|mtime updated
3340 3344 */
3341 3345 /*ARGSUSED*/
3342 3346 static int
3343 3347 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3344 3348 caller_context_t *ct, int flags)
3345 3349 {
3346 3350 znode_t *tdzp, *szp, *tzp;
3347 3351 znode_t *sdzp = VTOZ(sdvp);
3348 3352 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3349 3353 zilog_t *zilog;
3350 3354 vnode_t *realvp;
3351 3355 zfs_dirlock_t *sdl, *tdl;
3352 3356 dmu_tx_t *tx;
3353 3357 zfs_zlock_t *zl;
3354 3358 int cmp, serr, terr;
3355 3359 int error = 0;
3356 3360 int zflg = 0;
3357 3361
3358 3362 ZFS_ENTER(zfsvfs);
3359 3363 ZFS_VERIFY_ZP(sdzp);
3360 3364 zilog = zfsvfs->z_log;
3361 3365
3362 3366 /*
3363 3367 * Make sure we have the real vp for the target directory.
3364 3368 */
3365 3369 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3366 3370 tdvp = realvp;
3367 3371
3368 3372 if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
3369 3373 ZFS_EXIT(zfsvfs);
3370 3374 return (EXDEV);
3371 3375 }
3372 3376
3373 3377 tdzp = VTOZ(tdvp);
3374 3378 ZFS_VERIFY_ZP(tdzp);
3375 3379 if (zfsvfs->z_utf8 && u8_validate(tnm,
3376 3380 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3377 3381 ZFS_EXIT(zfsvfs);
3378 3382 return (EILSEQ);
3379 3383 }
3380 3384
3381 3385 if (flags & FIGNORECASE)
3382 3386 zflg |= ZCILOOK;
3383 3387
3384 3388 top:
3385 3389 szp = NULL;
3386 3390 tzp = NULL;
3387 3391 zl = NULL;
3388 3392
3389 3393 /*
3390 3394 * This is to prevent the creation of links into attribute space
3391 3395	 * by renaming a linked file into/out of an attribute directory.
3392 3396 * See the comment in zfs_link() for why this is considered bad.
3393 3397 */
3394 3398 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3395 3399 ZFS_EXIT(zfsvfs);
3396 3400 return (EINVAL);
3397 3401 }
3398 3402
3399 3403 /*
3400 3404 * Lock source and target directory entries. To prevent deadlock,
3401 3405 * a lock ordering must be defined. We lock the directory with
3402 3406 * the smallest object id first, or if it's a tie, the one with
3403 3407 * the lexically first name.
3404 3408 */
3405 3409 if (sdzp->z_id < tdzp->z_id) {
3406 3410 cmp = -1;
3407 3411 } else if (sdzp->z_id > tdzp->z_id) {
3408 3412 cmp = 1;
3409 3413 } else {
3410 3414 /*
3411 3415 * First compare the two name arguments without
3412 3416 * considering any case folding.
3413 3417 */
3414 3418 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3415 3419
3416 3420 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3417 3421 ASSERT(error == 0 || !zfsvfs->z_utf8);
3418 3422 if (cmp == 0) {
3419 3423 /*
3420 3424 * POSIX: "If the old argument and the new argument
3421 3425 * both refer to links to the same existing file,
3422 3426 * the rename() function shall return successfully
3423 3427 * and perform no other action."
3424 3428 */
3425 3429 ZFS_EXIT(zfsvfs);
3426 3430 return (0);
3427 3431 }
3428 3432 /*
3429 3433 * If the file system is case-folding, then we may
3430 3434	 * system either supports mixed case sensitivity
3431 3435	 * access or is completely case-insensitive.  Note
3432 3436	 * that the file system is always case-preserving.
3433 3437 * that the file system is always case preserving.
3434 3438 *
3435 3439 * In mixed sensitivity mode case sensitive behavior
3436 3440 * is the default. FIGNORECASE must be used to
3437 3441 * explicitly request case insensitive behavior.
3438 3442 *
3439 3443 * If the source and target names provided differ only
3440 3444 * by case (e.g., a request to rename 'tim' to 'Tim'),
3441 3445 * we will treat this as a special case in the
3442 3446 * case-insensitive mode: as long as the source name
3443 3447 * is an exact match, we will allow this to proceed as
3444 3448 * a name-change request.
3445 3449 */
3446 3450 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3447 3451 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3448 3452 flags & FIGNORECASE)) &&
3449 3453 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3450 3454 &error) == 0) {
3451 3455 /*
3452 3456	 * Case-preserving rename request; require exact
3453 3457	 * name matches.
3454 3458 */
3455 3459 zflg |= ZCIEXACT;
3456 3460 zflg &= ~ZCILOOK;
3457 3461 }
3458 3462 }
3459 3463
3460 3464 /*
3461 3465 * If the source and destination directories are the same, we should
3462 3466 * grab the z_name_lock of that directory only once.
3463 3467 */
3464 3468 if (sdzp == tdzp) {
3465 3469 zflg |= ZHAVELOCK;
3466 3470 rw_enter(&sdzp->z_name_lock, RW_READER);
3467 3471 }
3468 3472
3469 3473 if (cmp < 0) {
3470 3474 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3471 3475 ZEXISTS | zflg, NULL, NULL);
3472 3476 terr = zfs_dirent_lock(&tdl,
3473 3477 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3474 3478 } else {
3475 3479 terr = zfs_dirent_lock(&tdl,
3476 3480 tdzp, tnm, &tzp, zflg, NULL, NULL);
3477 3481 serr = zfs_dirent_lock(&sdl,
3478 3482 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3479 3483 NULL, NULL);
3480 3484 }
3481 3485
3482 3486 if (serr) {
3483 3487 /*
3484 3488 * Source entry invalid or not there.
3485 3489 */
3486 3490 if (!terr) {
3487 3491 zfs_dirent_unlock(tdl);
3488 3492 if (tzp)
3489 3493 VN_RELE(ZTOV(tzp));
3490 3494 }
3491 3495
3492 3496 if (sdzp == tdzp)
3493 3497 rw_exit(&sdzp->z_name_lock);
3494 3498
3495 3499 if (strcmp(snm, "..") == 0)
3496 3500 serr = EINVAL;
3497 3501 ZFS_EXIT(zfsvfs);
3498 3502 return (serr);
3499 3503 }
3500 3504 if (terr) {
3501 3505 zfs_dirent_unlock(sdl);
3502 3506 VN_RELE(ZTOV(szp));
3503 3507
3504 3508 if (sdzp == tdzp)
3505 3509 rw_exit(&sdzp->z_name_lock);
3506 3510
3507 3511 if (strcmp(tnm, "..") == 0)
3508 3512 terr = EINVAL;
3509 3513 ZFS_EXIT(zfsvfs);
3510 3514 return (terr);
3511 3515 }
3512 3516
3513 3517 /*
3514 3518 * Must have write access at the source to remove the old entry
3515 3519 * and write access at the target to create the new entry.
3516 3520 * Note that if target and source are the same, this can be
3517 3521 * done in a single check.
3518 3522 */
3519 3523
3520 3524 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3521 3525 goto out;
3522 3526
3523 3527 if (ZTOV(szp)->v_type == VDIR) {
3524 3528 /*
3525 3529 * Check to make sure rename is valid.
3526 3530 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3527 3531 */
3528 3532 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3529 3533 goto out;
3530 3534 }
3531 3535
3532 3536 /*
3533 3537 * Does target exist?
3534 3538 */
3535 3539 if (tzp) {
3536 3540 /*
3537 3541 * Source and target must be the same type.
3538 3542 */
3539 3543 if (ZTOV(szp)->v_type == VDIR) {
3540 3544 if (ZTOV(tzp)->v_type != VDIR) {
3541 3545 error = ENOTDIR;
3542 3546 goto out;
3543 3547 }
3544 3548 } else {
3545 3549 if (ZTOV(tzp)->v_type == VDIR) {
3546 3550 error = EISDIR;
3547 3551 goto out;
3548 3552 }
3549 3553 }
3550 3554 /*
3551 3555 * POSIX dictates that when the source and target
3552 3556 * entries refer to the same file object, rename
3553 3557 * must do nothing and exit without error.
3554 3558 */
3555 3559 if (szp->z_id == tzp->z_id) {
3556 3560 error = 0;
3557 3561 goto out;
3558 3562 }
3559 3563 }
3560 3564
3561 3565 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3562 3566 if (tzp)
3563 3567 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3564 3568
3565 3569 /*
3566 3570 * notify the target directory if it is not the same
3567 3571 * as source directory.
3568 3572 */
3569 3573 if (tdvp != sdvp) {
3570 3574 vnevent_rename_dest_dir(tdvp, ct);
3571 3575 }
3572 3576
3573 3577 tx = dmu_tx_create(zfsvfs->z_os);
3574 3578 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3575 3579 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3576 3580 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3577 3581 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3578 3582 if (sdzp != tdzp) {
3579 3583 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3580 3584 zfs_sa_upgrade_txholds(tx, tdzp);
3581 3585 }
3582 3586 if (tzp) {
3583 3587 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3584 3588 zfs_sa_upgrade_txholds(tx, tzp);
3585 3589 }
3586 3590
3587 3591 zfs_sa_upgrade_txholds(tx, szp);
3588 3592 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3589 3593 error = dmu_tx_assign(tx, TXG_NOWAIT);
3590 3594 if (error) {
3591 3595 if (zl != NULL)
3592 3596 zfs_rename_unlock(&zl);
3593 3597 zfs_dirent_unlock(sdl);
3594 3598 zfs_dirent_unlock(tdl);
3595 3599
3596 3600 if (sdzp == tdzp)
3597 3601 rw_exit(&sdzp->z_name_lock);
3598 3602
3599 3603 VN_RELE(ZTOV(szp));
3600 3604 if (tzp)
3601 3605 VN_RELE(ZTOV(tzp));
3602 3606 if (error == ERESTART) {
3603 3607 dmu_tx_wait(tx);
3604 3608 dmu_tx_abort(tx);
3605 3609 goto top;
3606 3610 }
3607 3611 dmu_tx_abort(tx);
3608 3612 ZFS_EXIT(zfsvfs);
3609 3613 return (error);
3610 3614 }
3611 3615
3612 3616 if (tzp) /* Attempt to remove the existing target */
3613 3617 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3614 3618
3615 3619 if (error == 0) {
3616 3620 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3617 3621 if (error == 0) {
3618 3622 szp->z_pflags |= ZFS_AV_MODIFIED;
3619 3623
3620 3624 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3621 3625 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3622 - ASSERT3U(error, ==, 0);
3626 + ASSERT0(error);
3623 3627
3624 3628 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3625 3629 if (error == 0) {
3626 3630 zfs_log_rename(zilog, tx, TX_RENAME |
3627 3631 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3628 3632 sdl->dl_name, tdzp, tdl->dl_name, szp);
3629 3633
3630 3634 /*
3631 3635 * Update path information for the target vnode
3632 3636 */
3633 3637 vn_renamepath(tdvp, ZTOV(szp), tnm,
3634 3638 strlen(tnm));
3635 3639 } else {
3636 3640 /*
3637 3641 * At this point, we have successfully created
3638 3642 * the target name, but have failed to remove
3639 3643 * the source name. Since the create was done
3640 3644 * with the ZRENAMING flag, there are
3641 3645 * complications; for one, the link count is
3642 3646 * wrong. The easiest way to deal with this
3643 3647 * is to remove the newly created target, and
3644 3648 * return the original error. This must
3645 3649 * succeed; fortunately, it is very unlikely to
3646 3650 * fail, since we just created it.
3647 3651 */
3648 3652 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3649 3653 ZRENAMING, NULL), ==, 0);
3650 3654 }
3651 3655 }
3652 3656 }
3653 3657
3654 3658 dmu_tx_commit(tx);
3655 3659 out:
3656 3660 if (zl != NULL)
3657 3661 zfs_rename_unlock(&zl);
3658 3662
3659 3663 zfs_dirent_unlock(sdl);
3660 3664 zfs_dirent_unlock(tdl);
3661 3665
3662 3666 if (sdzp == tdzp)
3663 3667 rw_exit(&sdzp->z_name_lock);
3664 3668
3665 3669
3666 3670 VN_RELE(ZTOV(szp));
3667 3671 if (tzp)
3668 3672 VN_RELE(ZTOV(tzp));
3669 3673
3670 3674 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3671 3675 zil_commit(zilog, 0);
3672 3676
3673 3677 ZFS_EXIT(zfsvfs);
3674 3678 return (error);
3675 3679 }
3676 3680
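The hunk above collapses the open-coded zero check ASSERT3U(error, ==, 0) into the newer ASSERT0(error). As a hedged sketch (the authoritative definitions belong in <sys/debug.h> and may differ in detail), the zero-check forms can be layered directly on the existing three-argument family, so call sites shrink without losing the value-reporting panic message:

	/* Sketch only; assumed formulation, modeled on the ASSERT3/VERIFY3 family. */
	#define	VERIFY0(x)	VERIFY3U((x), ==, 0)
	#ifdef DEBUG
	#define	ASSERT0(x)	VERIFY0(x)
	#else
	#define	ASSERT0(x)	((void)0)	/* compiled out in non-DEBUG, like ASSERT3U */
	#endif

By the same reasoning, the VERIFY3U(zfs_link_destroy(...), ==, 0) call in this hunk is the natural candidate for a matching VERIFY0() form.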
3677 3681 /*
3678 3682 * Insert the indicated symbolic reference entry into the directory.
3679 3683 *
3680 3684 * IN: dvp - Directory to contain new symbolic link.
3681 3685 	 *		name - Name for new symlink entry.
3682 3686 	 *		vap - Attributes of new entry.
3683 3687 	 *		link - Target path of new symlink.
3684 3688 * cr - credentials of caller.
3685 3689 * ct - caller context
3686 3690 * flags - case flags
3687 3691 *
3688 3692 * RETURN: 0 if success
3689 3693 * error code if failure
3690 3694 *
3691 3695 * Timestamps:
3692 3696 * dvp - ctime|mtime updated
3693 3697 */
3694 3698 /*ARGSUSED*/
3695 3699 static int
3696 3700 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3697 3701 caller_context_t *ct, int flags)
3698 3702 {
3699 3703 znode_t *zp, *dzp = VTOZ(dvp);
3700 3704 zfs_dirlock_t *dl;
3701 3705 dmu_tx_t *tx;
3702 3706 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3703 3707 zilog_t *zilog;
3704 3708 uint64_t len = strlen(link);
3705 3709 int error;
3706 3710 int zflg = ZNEW;
3707 3711 zfs_acl_ids_t acl_ids;
3708 3712 boolean_t fuid_dirtied;
3709 3713 uint64_t txtype = TX_SYMLINK;
3710 3714
3711 3715 ASSERT(vap->va_type == VLNK);
3712 3716
3713 3717 ZFS_ENTER(zfsvfs);
3714 3718 ZFS_VERIFY_ZP(dzp);
3715 3719 zilog = zfsvfs->z_log;
3716 3720
3717 3721 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3718 3722 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3719 3723 ZFS_EXIT(zfsvfs);
3720 3724 return (EILSEQ);
3721 3725 }
3722 3726 if (flags & FIGNORECASE)
3723 3727 zflg |= ZCILOOK;
3724 3728
3725 3729 if (len > MAXPATHLEN) {
3726 3730 ZFS_EXIT(zfsvfs);
3727 3731 return (ENAMETOOLONG);
3728 3732 }
3729 3733
3730 3734 if ((error = zfs_acl_ids_create(dzp, 0,
3731 3735 vap, cr, NULL, &acl_ids)) != 0) {
3732 3736 ZFS_EXIT(zfsvfs);
3733 3737 return (error);
3734 3738 }
3735 3739 top:
3736 3740 /*
3737 3741 * Attempt to lock directory; fail if entry already exists.
3738 3742 */
3739 3743 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3740 3744 if (error) {
3741 3745 zfs_acl_ids_free(&acl_ids);
3742 3746 ZFS_EXIT(zfsvfs);
3743 3747 return (error);
3744 3748 }
3745 3749
3746 3750 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3747 3751 zfs_acl_ids_free(&acl_ids);
3748 3752 zfs_dirent_unlock(dl);
3749 3753 ZFS_EXIT(zfsvfs);
3750 3754 return (error);
3751 3755 }
3752 3756
3753 3757 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3754 3758 zfs_acl_ids_free(&acl_ids);
3755 3759 zfs_dirent_unlock(dl);
3756 3760 ZFS_EXIT(zfsvfs);
3757 3761 return (EDQUOT);
3758 3762 }
3759 3763 tx = dmu_tx_create(zfsvfs->z_os);
3760 3764 fuid_dirtied = zfsvfs->z_fuid_dirty;
3761 3765 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3762 3766 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3763 3767 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3764 3768 ZFS_SA_BASE_ATTR_SIZE + len);
3765 3769 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3766 3770 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3767 3771 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3768 3772 acl_ids.z_aclp->z_acl_bytes);
3769 3773 }
3770 3774 if (fuid_dirtied)
3771 3775 zfs_fuid_txhold(zfsvfs, tx);
3772 3776 error = dmu_tx_assign(tx, TXG_NOWAIT);
3773 3777 if (error) {
3774 3778 zfs_dirent_unlock(dl);
3775 3779 if (error == ERESTART) {
3776 3780 dmu_tx_wait(tx);
3777 3781 dmu_tx_abort(tx);
3778 3782 goto top;
3779 3783 }
3780 3784 zfs_acl_ids_free(&acl_ids);
3781 3785 dmu_tx_abort(tx);
3782 3786 ZFS_EXIT(zfsvfs);
3783 3787 return (error);
3784 3788 }
3785 3789
3786 3790 /*
3787 3791 * Create a new object for the symlink.
3788 3792 	 * For version 4 ZPL datasets the symlink will be an SA attribute.
3789 3793 */
3790 3794 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3791 3795
3792 3796 if (fuid_dirtied)
3793 3797 zfs_fuid_sync(zfsvfs, tx);
3794 3798
3795 3799 mutex_enter(&zp->z_lock);
3796 3800 if (zp->z_is_sa)
3797 3801 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3798 3802 link, len, tx);
3799 3803 else
3800 3804 zfs_sa_symlink(zp, link, len, tx);
3801 3805 mutex_exit(&zp->z_lock);
3802 3806
3803 3807 zp->z_size = len;
3804 3808 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3805 3809 &zp->z_size, sizeof (zp->z_size), tx);
3806 3810 /*
3807 3811 * Insert the new object into the directory.
3808 3812 */
3809 3813 (void) zfs_link_create(dl, zp, tx, ZNEW);
3810 3814
3811 3815 if (flags & FIGNORECASE)
3812 3816 txtype |= TX_CI;
3813 3817 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3814 3818
3815 3819 zfs_acl_ids_free(&acl_ids);
3816 3820
3817 3821 dmu_tx_commit(tx);
3818 3822
3819 3823 zfs_dirent_unlock(dl);
3820 3824
3821 3825 VN_RELE(ZTOV(zp));
3822 3826
3823 3827 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3824 3828 zil_commit(zilog, 0);
3825 3829
3826 3830 ZFS_EXIT(zfsvfs);
3827 3831 return (error);
3828 3832 }
3829 3833
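zfs_symlink() shows the same transaction shape used by zfs_rename() above and by most write-side vops in this file: create the tx, declare holds, assign with TXG_NOWAIT, and on ERESTART wait out the throttled txg before jumping back to top:. A distilled sketch of the idiom, with the per-vop holds and work elided:

	static int
	zfs_tx_retry_sketch(objset_t *os)	/* illustrative only */
	{
		dmu_tx_t *tx;
		int error;
	top:
		tx = dmu_tx_create(os);
		/* per-vop dmu_tx_hold_*() calls go here, before assign */
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			if (error == ERESTART) {
				dmu_tx_wait(tx);	/* stall until a txg can accept us */
				dmu_tx_abort(tx);
				goto top;		/* rebuild holds and retry */
			}
			dmu_tx_abort(tx);		/* hard failure, e.g. ENOSPC */
			return (error);
		}
		/* modify state under tx, log to the ZIL, then commit */
		dmu_tx_commit(tx);
		return (0);
	}

Note the order: dmu_tx_wait() is called on the still-live tx before dmu_tx_abort(), exactly as in the vops above, so the wait can key off the txg the assignment just failed against.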
3830 3834 /*
3831 3835 * Return, in the buffer contained in the provided uio structure,
3832 3836 * the symbolic path referred to by vp.
3833 3837 *
3834 3838 * IN: vp - vnode of symbolic link.
3835 3839 	 *		uio - structure to contain the link path.
3836 3840 * cr - credentials of caller.
3837 3841 * ct - caller context
3838 3842 *
3839 3843 * OUT: uio - structure to contain the link path.
3840 3844 *
3841 3845 * RETURN: 0 if success
3842 3846 * error code if failure
3843 3847 *
3844 3848 * Timestamps:
3845 3849 * vp - atime updated
3846 3850 */
3847 3851 /* ARGSUSED */
3848 3852 static int
3849 3853 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3850 3854 {
3851 3855 znode_t *zp = VTOZ(vp);
3852 3856 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3853 3857 int error;
3854 3858
3855 3859 ZFS_ENTER(zfsvfs);
3856 3860 ZFS_VERIFY_ZP(zp);
3857 3861
3858 3862 mutex_enter(&zp->z_lock);
3859 3863 if (zp->z_is_sa)
3860 3864 error = sa_lookup_uio(zp->z_sa_hdl,
3861 3865 SA_ZPL_SYMLINK(zfsvfs), uio);
3862 3866 else
3863 3867 error = zfs_sa_readlink(zp, uio);
3864 3868 mutex_exit(&zp->z_lock);
3865 3869
3866 3870 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3867 3871
3868 3872 ZFS_EXIT(zfsvfs);
3869 3873 return (error);
3870 3874 }
3871 3875
3872 3876 /*
3873 3877 * Insert a new entry into directory tdvp referencing svp.
3874 3878 *
3875 3879 * IN: tdvp - Directory to contain new entry.
3876 3880 * svp - vnode of new entry.
3877 3881 * name - name of new entry.
3878 3882 * cr - credentials of caller.
3879 3883 * ct - caller context
3880 3884 *
3881 3885 * RETURN: 0 if success
3882 3886 * error code if failure
3883 3887 *
3884 3888 * Timestamps:
3885 3889 * tdvp - ctime|mtime updated
3886 3890 * svp - ctime updated
3887 3891 */
3888 3892 /* ARGSUSED */
3889 3893 static int
3890 3894 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3891 3895 caller_context_t *ct, int flags)
3892 3896 {
3893 3897 znode_t *dzp = VTOZ(tdvp);
3894 3898 znode_t *tzp, *szp;
3895 3899 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3896 3900 zilog_t *zilog;
3897 3901 zfs_dirlock_t *dl;
3898 3902 dmu_tx_t *tx;
3899 3903 vnode_t *realvp;
3900 3904 int error;
3901 3905 int zf = ZNEW;
3902 3906 uint64_t parent;
3903 3907 uid_t owner;
3904 3908
3905 3909 ASSERT(tdvp->v_type == VDIR);
3906 3910
3907 3911 ZFS_ENTER(zfsvfs);
3908 3912 ZFS_VERIFY_ZP(dzp);
3909 3913 zilog = zfsvfs->z_log;
3910 3914
3911 3915 if (VOP_REALVP(svp, &realvp, ct) == 0)
3912 3916 svp = realvp;
3913 3917
3914 3918 /*
3915 3919 * POSIX dictates that we return EPERM here.
3916 3920 * Better choices include ENOTSUP or EISDIR.
3917 3921 */
3918 3922 if (svp->v_type == VDIR) {
3919 3923 ZFS_EXIT(zfsvfs);
3920 3924 return (EPERM);
3921 3925 }
3922 3926
3923 3927 if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
3924 3928 ZFS_EXIT(zfsvfs);
3925 3929 return (EXDEV);
3926 3930 }
3927 3931
3928 3932 szp = VTOZ(svp);
3929 3933 ZFS_VERIFY_ZP(szp);
3930 3934
3931 3935 /* Prevent links to .zfs/shares files */
3932 3936
3933 3937 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3934 3938 &parent, sizeof (uint64_t))) != 0) {
3935 3939 ZFS_EXIT(zfsvfs);
3936 3940 return (error);
3937 3941 }
3938 3942 if (parent == zfsvfs->z_shares_dir) {
3939 3943 ZFS_EXIT(zfsvfs);
3940 3944 return (EPERM);
3941 3945 }
3942 3946
3943 3947 if (zfsvfs->z_utf8 && u8_validate(name,
3944 3948 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3945 3949 ZFS_EXIT(zfsvfs);
3946 3950 return (EILSEQ);
3947 3951 }
3948 3952 if (flags & FIGNORECASE)
3949 3953 zf |= ZCILOOK;
3950 3954
3951 3955 /*
3952 3956 * We do not support links between attributes and non-attributes
3953 3957 * because of the potential security risk of creating links
3954 3958 * into "normal" file space in order to circumvent restrictions
3955 3959 * imposed in attribute space.
3956 3960 */
3957 3961 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
3958 3962 ZFS_EXIT(zfsvfs);
3959 3963 return (EINVAL);
3960 3964 }
3961 3965
3962 3966
3963 3967 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
3964 3968 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3965 3969 ZFS_EXIT(zfsvfs);
3966 3970 return (EPERM);
3967 3971 }
3968 3972
3969 3973 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3970 3974 ZFS_EXIT(zfsvfs);
3971 3975 return (error);
3972 3976 }
3973 3977
3974 3978 top:
3975 3979 /*
3976 3980 * Attempt to lock directory; fail if entry already exists.
3977 3981 */
3978 3982 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3979 3983 if (error) {
3980 3984 ZFS_EXIT(zfsvfs);
3981 3985 return (error);
3982 3986 }
3983 3987
3984 3988 tx = dmu_tx_create(zfsvfs->z_os);
3985 3989 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3986 3990 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3987 3991 zfs_sa_upgrade_txholds(tx, szp);
3988 3992 zfs_sa_upgrade_txholds(tx, dzp);
3989 3993 error = dmu_tx_assign(tx, TXG_NOWAIT);
3990 3994 if (error) {
3991 3995 zfs_dirent_unlock(dl);
3992 3996 if (error == ERESTART) {
3993 3997 dmu_tx_wait(tx);
3994 3998 dmu_tx_abort(tx);
3995 3999 goto top;
3996 4000 }
3997 4001 dmu_tx_abort(tx);
3998 4002 ZFS_EXIT(zfsvfs);
3999 4003 return (error);
4000 4004 }
4001 4005
4002 4006 error = zfs_link_create(dl, szp, tx, 0);
4003 4007
4004 4008 if (error == 0) {
4005 4009 uint64_t txtype = TX_LINK;
4006 4010 if (flags & FIGNORECASE)
4007 4011 txtype |= TX_CI;
4008 4012 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4009 4013 }
4010 4014
4011 4015 dmu_tx_commit(tx);
4012 4016
4013 4017 zfs_dirent_unlock(dl);
4014 4018
4015 4019 if (error == 0) {
4016 4020 vnevent_link(svp, ct);
4017 4021 }
4018 4022
4019 4023 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4020 4024 zil_commit(zilog, 0);
4021 4025
4022 4026 ZFS_EXIT(zfsvfs);
4023 4027 return (error);
4024 4028 }
4025 4029
4026 4030 /*
4027 4031 * zfs_null_putapage() is used when the file system has been force
4028 4032 * unmounted. It just drops the pages.
4029 4033 */
4030 4034 /* ARGSUSED */
4031 4035 static int
4032 4036 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4033 4037 size_t *lenp, int flags, cred_t *cr)
4034 4038 {
4035 4039 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4036 4040 return (0);
4037 4041 }
4038 4042
4039 4043 /*
4040 4044 * Push a page out to disk, klustering if possible.
4041 4045 *
4042 4046 * IN: vp - file to push page to.
4043 4047 * pp - page to push.
4044 4048 * flags - additional flags.
4045 4049 * cr - credentials of caller.
4046 4050 *
4047 4051 * OUT: offp - start of range pushed.
4048 4052 * lenp - len of range pushed.
4049 4053 *
4050 4054 * RETURN: 0 if success
4051 4055 * error code if failure
4052 4056 *
4053 4057 * NOTE: callers must have locked the page to be pushed. On
4054 4058 * exit, the page (and all other pages in the kluster) must be
4055 4059 * unlocked.
4056 4060 */
4057 4061 /* ARGSUSED */
4058 4062 static int
4059 4063 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4060 4064 size_t *lenp, int flags, cred_t *cr)
4061 4065 {
4062 4066 znode_t *zp = VTOZ(vp);
4063 4067 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4064 4068 dmu_tx_t *tx;
4065 4069 u_offset_t off, koff;
4066 4070 size_t len, klen;
4067 4071 int err;
4068 4072
4069 4073 off = pp->p_offset;
4070 4074 len = PAGESIZE;
4071 4075 /*
4072 4076 * If our blocksize is bigger than the page size, try to kluster
4073 4077 * multiple pages so that we write a full block (thus avoiding
4074 4078 * a read-modify-write).
4075 4079 */
4076 4080 if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4077 4081 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4078 4082 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4079 4083 ASSERT(koff <= zp->z_size);
4080 4084 if (koff + klen > zp->z_size)
4081 4085 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4082 4086 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4083 4087 }
4084 4088 ASSERT3U(btop(len), ==, btopr(len));
4085 4089
4086 4090 /*
4087 4091 * Can't push pages past end-of-file.
4088 4092 */
4089 4093 if (off >= zp->z_size) {
4090 4094 /* ignore all pages */
4091 4095 err = 0;
4092 4096 goto out;
4093 4097 } else if (off + len > zp->z_size) {
4094 4098 int npages = btopr(zp->z_size - off);
4095 4099 page_t *trunc;
4096 4100
4097 4101 page_list_break(&pp, &trunc, npages);
4098 4102 /* ignore pages past end of file */
4099 4103 if (trunc)
4100 4104 pvn_write_done(trunc, flags);
4101 4105 len = zp->z_size - off;
4102 4106 }
4103 4107
4104 4108 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4105 4109 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4106 4110 err = EDQUOT;
4107 4111 goto out;
4108 4112 }
4109 4113 top:
4110 4114 tx = dmu_tx_create(zfsvfs->z_os);
4111 4115 dmu_tx_hold_write(tx, zp->z_id, off, len);
4112 4116
4113 4117 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4114 4118 zfs_sa_upgrade_txholds(tx, zp);
4115 4119 err = dmu_tx_assign(tx, TXG_NOWAIT);
4116 4120 if (err != 0) {
4117 4121 if (err == ERESTART) {
4118 4122 dmu_tx_wait(tx);
4119 4123 dmu_tx_abort(tx);
4120 4124 goto top;
4121 4125 }
4122 4126 dmu_tx_abort(tx);
4123 4127 goto out;
4124 4128 }
4125 4129
4126 4130 if (zp->z_blksz <= PAGESIZE) {
4127 4131 caddr_t va = zfs_map_page(pp, S_READ);
4128 4132 ASSERT3U(len, <=, PAGESIZE);
4129 4133 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4130 4134 zfs_unmap_page(pp, va);
4131 4135 } else {
4132 4136 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4133 4137 }
4134 4138
4135 4139 if (err == 0) {
4136 4140 uint64_t mtime[2], ctime[2];
4137 4141 sa_bulk_attr_t bulk[3];
4138 4142 int count = 0;
4139 4143
4140 4144 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4141 4145 &mtime, 16);
4142 4146 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4143 4147 &ctime, 16);
4144 4148 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4145 4149 &zp->z_pflags, 8);
4146 4150 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4147 4151 B_TRUE);
4148 4152 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4149 4153 }
4150 4154 dmu_tx_commit(tx);
4151 4155
4152 4156 out:
4153 4157 pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4154 4158 if (offp)
4155 4159 *offp = off;
4156 4160 if (lenp)
4157 4161 *lenp = len;
4158 4162
4159 4163 return (err);
4160 4164 }
4161 4165
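The kluster arithmetic above (and the alignment logic in zfs_putpage() below) leans on the power-of-two helpers from <sys/sysmacros.h>. Paraphrased here for reference, on the assumption that align is a power of two:

	#define	ISP2(x)			(((x) & ((x) - 1)) == 0)
	#define	P2ALIGN(x, align)	((x) & -(align))	/* round down */
	#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* offset within chunk */
	#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* round up */

For example, with a 128K (0x20000) block size and 4K pages, a dirty page at offset 0x21000 klusters to koff = P2ALIGN(0x21000, 0x20000) = 0x20000 and klen = 0x20000, so the whole block goes out in one write and the read-modify-write is avoided.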
4162 4166 /*
4163 4167 * Copy the portion of the file indicated from pages into the file.
4164 4168 	 * The pages are stored in a page list attached to the file's vnode.
4165 4169 *
4166 4170 * IN: vp - vnode of file to push page data to.
4167 4171 * off - position in file to put data.
4168 4172 * len - amount of data to write.
4169 4173 * flags - flags to control the operation.
4170 4174 * cr - credentials of caller.
4171 4175 * ct - caller context.
4172 4176 *
4173 4177 * RETURN: 0 if success
4174 4178 * error code if failure
4175 4179 *
4176 4180 * Timestamps:
4177 4181 * vp - ctime|mtime updated
4178 4182 */
4179 4183 /*ARGSUSED*/
4180 4184 static int
4181 4185 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4182 4186 caller_context_t *ct)
4183 4187 {
4184 4188 znode_t *zp = VTOZ(vp);
4185 4189 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4186 4190 page_t *pp;
4187 4191 size_t io_len;
4188 4192 u_offset_t io_off;
4189 4193 uint_t blksz;
4190 4194 rl_t *rl;
4191 4195 int error = 0;
4192 4196
4193 4197 ZFS_ENTER(zfsvfs);
4194 4198 ZFS_VERIFY_ZP(zp);
4195 4199
4196 4200 /*
4197 4201 * There's nothing to do if no data is cached.
4198 4202 */
4199 4203 if (!vn_has_cached_data(vp)) {
4200 4204 ZFS_EXIT(zfsvfs);
4201 4205 return (0);
4202 4206 }
4203 4207
4204 4208 /*
4205 4209 * Align this request to the file block size in case we kluster.
4206 4210 	 * XXX - this can result in pretty aggressive locking, which can
4207 4211 	 * impact simultaneous read/write access. One option might be
4208 4212 * to break up long requests (len == 0) into block-by-block
4209 4213 * operations to get narrower locking.
4210 4214 */
4211 4215 blksz = zp->z_blksz;
4212 4216 if (ISP2(blksz))
4213 4217 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4214 4218 else
4215 4219 io_off = 0;
4216 4220 if (len > 0 && ISP2(blksz))
4217 4221 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4218 4222 else
4219 4223 io_len = 0;
4220 4224
4221 4225 if (io_len == 0) {
4222 4226 /*
4223 4227 * Search the entire vp list for pages >= io_off.
4224 4228 */
4225 4229 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4226 4230 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4227 4231 goto out;
4228 4232 }
4229 4233 rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4230 4234
4231 4235 if (off > zp->z_size) {
4232 4236 /* past end of file */
4233 4237 zfs_range_unlock(rl);
4234 4238 ZFS_EXIT(zfsvfs);
4235 4239 return (0);
4236 4240 }
4237 4241
4238 4242 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4239 4243
4240 4244 for (off = io_off; io_off < off + len; io_off += io_len) {
4241 4245 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4242 4246 pp = page_lookup(vp, io_off,
4243 4247 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4244 4248 } else {
4245 4249 pp = page_lookup_nowait(vp, io_off,
4246 4250 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4247 4251 }
4248 4252
4249 4253 if (pp != NULL && pvn_getdirty(pp, flags)) {
4250 4254 int err;
4251 4255
4252 4256 /*
4253 4257 * Found a dirty page to push
4254 4258 */
4255 4259 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4256 4260 if (err)
4257 4261 error = err;
4258 4262 } else {
4259 4263 io_len = PAGESIZE;
4260 4264 }
4261 4265 }
4262 4266 out:
4263 4267 zfs_range_unlock(rl);
4264 4268 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4265 4269 zil_commit(zfsvfs->z_log, zp->z_id);
4266 4270 ZFS_EXIT(zfsvfs);
4267 4271 return (error);
4268 4272 }
4269 4273
4270 4274 /*ARGSUSED*/
4271 4275 void
4272 4276 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4273 4277 {
4274 4278 znode_t *zp = VTOZ(vp);
4275 4279 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4276 4280 int error;
4277 4281
4278 4282 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4279 4283 if (zp->z_sa_hdl == NULL) {
4280 4284 /*
4281 4285 * The fs has been unmounted, or we did a
4282 4286 * suspend/resume and this file no longer exists.
4283 4287 */
4284 4288 if (vn_has_cached_data(vp)) {
4285 4289 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4286 4290 B_INVAL, cr);
4287 4291 }
4288 4292
4289 4293 mutex_enter(&zp->z_lock);
4290 4294 mutex_enter(&vp->v_lock);
4291 4295 ASSERT(vp->v_count == 1);
4292 4296 vp->v_count = 0;
4293 4297 mutex_exit(&vp->v_lock);
4294 4298 mutex_exit(&zp->z_lock);
4295 4299 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4296 4300 zfs_znode_free(zp);
4297 4301 return;
4298 4302 }
4299 4303
4300 4304 /*
4301 4305 * Attempt to push any data in the page cache. If this fails
4302 4306 * we will get kicked out later in zfs_zinactive().
4303 4307 */
4304 4308 if (vn_has_cached_data(vp)) {
4305 4309 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4306 4310 cr);
4307 4311 }
4308 4312
4309 4313 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4310 4314 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4311 4315
4312 4316 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4313 4317 zfs_sa_upgrade_txholds(tx, zp);
4314 4318 error = dmu_tx_assign(tx, TXG_WAIT);
4315 4319 if (error) {
4316 4320 dmu_tx_abort(tx);
4317 4321 } else {
4318 4322 mutex_enter(&zp->z_lock);
4319 4323 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4320 4324 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4321 4325 zp->z_atime_dirty = 0;
4322 4326 mutex_exit(&zp->z_lock);
4323 4327 dmu_tx_commit(tx);
4324 4328 }
4325 4329 }
4326 4330
4327 4331 zfs_zinactive(zp);
4328 4332 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4329 4333 }
4330 4334
4331 4335 /*
4332 4336 * Bounds-check the seek operation.
4333 4337 *
4334 4338 * IN: vp - vnode seeking within
4335 4339 * ooff - old file offset
4336 4340 * noffp - pointer to new file offset
4337 4341 * ct - caller context
4338 4342 *
4339 4343 * RETURN: 0 if success
4340 4344 * EINVAL if new offset invalid
4341 4345 */
4342 4346 /* ARGSUSED */
4343 4347 static int
4344 4348 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4345 4349 caller_context_t *ct)
4346 4350 {
4347 4351 if (vp->v_type == VDIR)
4348 4352 return (0);
4349 4353 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4350 4354 }
4351 4355
4352 4356 /*
4353 4357 * Pre-filter the generic locking function to trap attempts to place
4354 4358 * a mandatory lock on a memory mapped file.
4355 4359 */
4356 4360 static int
4357 4361 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4358 4362 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4359 4363 {
4360 4364 znode_t *zp = VTOZ(vp);
4361 4365 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4362 4366
4363 4367 ZFS_ENTER(zfsvfs);
4364 4368 ZFS_VERIFY_ZP(zp);
4365 4369
4366 4370 /*
4367 4371 * We are following the UFS semantics with respect to mapcnt
4368 4372 * here: If we see that the file is mapped already, then we will
4369 4373 * return an error, but we don't worry about races between this
4370 4374 * function and zfs_map().
4371 4375 */
4372 4376 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4373 4377 ZFS_EXIT(zfsvfs);
4374 4378 return (EAGAIN);
4375 4379 }
4376 4380 ZFS_EXIT(zfsvfs);
4377 4381 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4378 4382 }
4379 4383
4380 4384 /*
4381 4385 * If we can't find a page in the cache, we will create a new page
4382 4386 * and fill it with file data. For efficiency, we may try to fill
4383 4387 * multiple pages at once (klustering) to fill up the supplied page
4384 4388 * list. Note that the pages to be filled are held with an exclusive
4385 4389 * lock to prevent access by other threads while they are being filled.
4386 4390 */
4387 4391 static int
4388 4392 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4389 4393 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4390 4394 {
4391 4395 znode_t *zp = VTOZ(vp);
4392 4396 page_t *pp, *cur_pp;
4393 4397 objset_t *os = zp->z_zfsvfs->z_os;
4394 4398 u_offset_t io_off, total;
4395 4399 size_t io_len;
4396 4400 int err;
4397 4401
4398 4402 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4399 4403 /*
4400 4404 * We only have a single page, don't bother klustering
4401 4405 */
4402 4406 io_off = off;
4403 4407 io_len = PAGESIZE;
4404 4408 pp = page_create_va(vp, io_off, io_len,
4405 4409 PG_EXCL | PG_WAIT, seg, addr);
4406 4410 } else {
4407 4411 /*
4408 4412 * Try to find enough pages to fill the page list
4409 4413 */
4410 4414 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4411 4415 &io_len, off, plsz, 0);
4412 4416 }
4413 4417 if (pp == NULL) {
4414 4418 /*
4415 4419 * The page already exists, nothing to do here.
4416 4420 */
4417 4421 *pl = NULL;
4418 4422 return (0);
4419 4423 }
4420 4424
4421 4425 /*
4422 4426 * Fill the pages in the kluster.
4423 4427 */
4424 4428 cur_pp = pp;
4425 4429 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4426 4430 caddr_t va;
4427 4431
4428 4432 ASSERT3U(io_off, ==, cur_pp->p_offset);
4429 4433 va = zfs_map_page(cur_pp, S_WRITE);
4430 4434 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4431 4435 DMU_READ_PREFETCH);
4432 4436 zfs_unmap_page(cur_pp, va);
4433 4437 if (err) {
4434 4438 /* On error, toss the entire kluster */
4435 4439 pvn_read_done(pp, B_ERROR);
4436 4440 /* convert checksum errors into IO errors */
4437 4441 if (err == ECKSUM)
4438 4442 err = EIO;
4439 4443 return (err);
4440 4444 }
4441 4445 cur_pp = cur_pp->p_next;
4442 4446 }
4443 4447
4444 4448 /*
4445 4449 * Fill in the page list array from the kluster starting
4446 4450 * from the desired offset `off'.
4447 4451 * NOTE: the page list will always be null terminated.
4448 4452 */
4449 4453 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4450 4454 ASSERT(pl == NULL || (*pl)->p_offset == off);
4451 4455
4452 4456 return (0);
4453 4457 }
4454 4458
4455 4459 /*
4456 4460 * Return pointers to the pages for the file region [off, off + len]
4457 4461 * in the pl array. If plsz is greater than len, this function may
4458 4462 * also return page pointers from after the specified region
4459 4463 * (i.e. the region [off, off + plsz]). These additional pages are
4460 4464 * only returned if they are already in the cache, or were created as
4461 4465 * part of a klustered read.
4462 4466 *
4463 4467 * IN: vp - vnode of file to get data from.
4464 4468 * off - position in file to get data from.
4465 4469 * len - amount of data to retrieve.
4466 4470 * plsz - length of provided page list.
4467 4471 * seg - segment to obtain pages for.
4468 4472 * addr - virtual address of fault.
4469 4473 * rw - mode of created pages.
4470 4474 * cr - credentials of caller.
4471 4475 * ct - caller context.
4472 4476 *
4473 4477 * OUT: protp - protection mode of created pages.
4474 4478 * pl - list of pages created.
4475 4479 *
4476 4480 * RETURN: 0 if success
4477 4481 * error code if failure
4478 4482 *
4479 4483 * Timestamps:
4480 4484 * vp - atime updated
4481 4485 */
4482 4486 /* ARGSUSED */
4483 4487 static int
4484 4488 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4485 4489 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4486 4490 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4487 4491 {
4488 4492 znode_t *zp = VTOZ(vp);
4489 4493 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4490 4494 page_t **pl0 = pl;
4491 4495 int err = 0;
4492 4496
4493 4497 /* we do our own caching, faultahead is unnecessary */
4494 4498 if (pl == NULL)
4495 4499 return (0);
4496 4500 else if (len > plsz)
4497 4501 len = plsz;
4498 4502 else
4499 4503 len = P2ROUNDUP(len, PAGESIZE);
4500 4504 ASSERT(plsz >= len);
4501 4505
4502 4506 ZFS_ENTER(zfsvfs);
4503 4507 ZFS_VERIFY_ZP(zp);
4504 4508
4505 4509 if (protp)
4506 4510 *protp = PROT_ALL;
4507 4511
4508 4512 /*
4509 4513 * Loop through the requested range [off, off + len) looking
4510 4514 * for pages. If we don't find a page, we will need to create
4511 4515 * a new page and fill it with data from the file.
4512 4516 */
4513 4517 while (len > 0) {
4514 4518 if (*pl = page_lookup(vp, off, SE_SHARED))
4515 4519 *(pl+1) = NULL;
4516 4520 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4517 4521 goto out;
4518 4522 while (*pl) {
4519 4523 ASSERT3U((*pl)->p_offset, ==, off);
4520 4524 off += PAGESIZE;
4521 4525 addr += PAGESIZE;
4522 4526 if (len > 0) {
4523 4527 ASSERT3U(len, >=, PAGESIZE);
4524 4528 len -= PAGESIZE;
4525 4529 }
4526 4530 ASSERT3U(plsz, >=, PAGESIZE);
4527 4531 plsz -= PAGESIZE;
4528 4532 pl++;
4529 4533 }
4530 4534 }
4531 4535
4532 4536 /*
4533 4537 * Fill out the page array with any pages already in the cache.
4534 4538 */
4535 4539 while (plsz > 0 &&
4536 4540 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4537 4541 off += PAGESIZE;
4538 4542 plsz -= PAGESIZE;
4539 4543 }
4540 4544 out:
4541 4545 if (err) {
4542 4546 /*
4543 4547 * Release any pages we have previously locked.
4544 4548 */
4545 4549 while (pl > pl0)
4546 4550 page_unlock(*--pl);
4547 4551 } else {
4548 4552 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4549 4553 }
4550 4554
4551 4555 *pl = NULL;
4552 4556
4553 4557 ZFS_EXIT(zfsvfs);
4554 4558 return (err);
4555 4559 }
4556 4560
4557 4561 /*
4558 4562 * Request a memory map for a section of a file. This code interacts
4559 4563 * with common code and the VM system as follows:
4560 4564 *
4561 4565 * common code calls mmap(), which ends up in smmap_common()
4562 4566 *
4563 4567 * this calls VOP_MAP(), which takes you into (say) zfs
4564 4568 *
4565 4569 * zfs_map() calls as_map(), passing segvn_create() as the callback
4566 4570 *
4567 4571 * segvn_create() creates the new segment and calls VOP_ADDMAP()
4568 4572 *
4569 4573 * zfs_addmap() updates z_mapcnt
4570 4574 */
4571 4575 /*ARGSUSED*/
4572 4576 static int
4573 4577 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4574 4578 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4575 4579 caller_context_t *ct)
4576 4580 {
4577 4581 znode_t *zp = VTOZ(vp);
4578 4582 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4579 4583 segvn_crargs_t vn_a;
4580 4584 int error;
4581 4585
4582 4586 ZFS_ENTER(zfsvfs);
4583 4587 ZFS_VERIFY_ZP(zp);
4584 4588
4585 4589 if ((prot & PROT_WRITE) && (zp->z_pflags &
4586 4590 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4587 4591 ZFS_EXIT(zfsvfs);
4588 4592 return (EPERM);
4589 4593 }
4590 4594
4591 4595 if ((prot & (PROT_READ | PROT_EXEC)) &&
4592 4596 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4593 4597 ZFS_EXIT(zfsvfs);
4594 4598 return (EACCES);
4595 4599 }
4596 4600
4597 4601 if (vp->v_flag & VNOMAP) {
4598 4602 ZFS_EXIT(zfsvfs);
4599 4603 return (ENOSYS);
4600 4604 }
4601 4605
4602 4606 if (off < 0 || len > MAXOFFSET_T - off) {
4603 4607 ZFS_EXIT(zfsvfs);
4604 4608 return (ENXIO);
4605 4609 }
4606 4610
4607 4611 if (vp->v_type != VREG) {
4608 4612 ZFS_EXIT(zfsvfs);
4609 4613 return (ENODEV);
4610 4614 }
4611 4615
4612 4616 /*
4613 4617 * If file is locked, disallow mapping.
4614 4618 */
4615 4619 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4616 4620 ZFS_EXIT(zfsvfs);
4617 4621 return (EAGAIN);
4618 4622 }
4619 4623
4620 4624 as_rangelock(as);
4621 4625 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4622 4626 if (error != 0) {
4623 4627 as_rangeunlock(as);
4624 4628 ZFS_EXIT(zfsvfs);
4625 4629 return (error);
4626 4630 }
4627 4631
4628 4632 vn_a.vp = vp;
4629 4633 vn_a.offset = (u_offset_t)off;
4630 4634 vn_a.type = flags & MAP_TYPE;
4631 4635 vn_a.prot = prot;
4632 4636 vn_a.maxprot = maxprot;
4633 4637 vn_a.cred = cr;
4634 4638 vn_a.amp = NULL;
4635 4639 vn_a.flags = flags & ~MAP_TYPE;
4636 4640 vn_a.szc = 0;
4637 4641 vn_a.lgrp_mem_policy_flags = 0;
4638 4642
4639 4643 error = as_map(as, *addrp, len, segvn_create, &vn_a);
4640 4644
4641 4645 as_rangeunlock(as);
4642 4646 ZFS_EXIT(zfsvfs);
4643 4647 return (error);
4644 4648 }
4645 4649
4646 4650 /* ARGSUSED */
4647 4651 static int
4648 4652 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4649 4653 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4650 4654 caller_context_t *ct)
4651 4655 {
4652 4656 uint64_t pages = btopr(len);
4653 4657
4654 4658 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4655 4659 return (0);
4656 4660 }
4657 4661
4658 4662 /*
4659 4663 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4660 4664 * more accurate mtime for the associated file. Since we don't have a way of
4661 4665 * detecting when the data was actually modified, we have to resort to
4662 4666 * heuristics. If an explicit msync() is done, then we mark the mtime when the
4663 4667 * last page is pushed. The problem occurs when the msync() call is omitted,
4664 4668 	 * which is by far the most common case:
4665 4669 *
4666 4670 * open()
4667 4671 * mmap()
4668 4672 * <modify memory>
4669 4673 * munmap()
4670 4674 * close()
4671 4675 * <time lapse>
4672 4676 * putpage() via fsflush
4673 4677 *
4674 4678 * If we wait until fsflush to come along, we can have a modification time that
4675 4679 * is some arbitrary point in the future. In order to prevent this in the
4676 4680 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4677 4681 * torn down.
4678 4682 */
4679 4683 /* ARGSUSED */
4680 4684 static int
4681 4685 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4682 4686 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4683 4687 caller_context_t *ct)
4684 4688 {
4685 4689 uint64_t pages = btopr(len);
4686 4690
4687 4691 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4688 4692 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4689 4693
4690 4694 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4691 4695 vn_has_cached_data(vp))
4692 4696 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4693 4697
4694 4698 return (0);
4695 4699 }
4696 4700
4697 4701 /*
4698 4702 * Free or allocate space in a file. Currently, this function only
4699 4703 * supports the `F_FREESP' command. However, this command is somewhat
4700 4704 * misnamed, as its functionality includes the ability to allocate as
4701 4705 * well as free space.
4702 4706 *
4703 4707 * IN: vp - vnode of file to free data in.
4704 4708 * cmd - action to take (only F_FREESP supported).
4705 4709 * bfp - section of file to free/alloc.
4706 4710 * flag - current file open mode flags.
4707 4711 * offset - current file offset.
4708 4712 * cr - credentials of caller [UNUSED].
4709 4713 * ct - caller context.
4710 4714 *
4711 4715 * RETURN: 0 if success
4712 4716 * error code if failure
4713 4717 *
4714 4718 * Timestamps:
4715 4719 * vp - ctime|mtime updated
4716 4720 */
4717 4721 /* ARGSUSED */
4718 4722 static int
4719 4723 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4720 4724 offset_t offset, cred_t *cr, caller_context_t *ct)
4721 4725 {
4722 4726 znode_t *zp = VTOZ(vp);
4723 4727 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4724 4728 uint64_t off, len;
4725 4729 int error;
4726 4730
4727 4731 ZFS_ENTER(zfsvfs);
4728 4732 ZFS_VERIFY_ZP(zp);
4729 4733
4730 4734 if (cmd != F_FREESP) {
4731 4735 ZFS_EXIT(zfsvfs);
4732 4736 return (EINVAL);
4733 4737 }
4734 4738
4735 4739 if (error = convoff(vp, bfp, 0, offset)) {
4736 4740 ZFS_EXIT(zfsvfs);
4737 4741 return (error);
4738 4742 }
4739 4743
4740 4744 if (bfp->l_len < 0) {
4741 4745 ZFS_EXIT(zfsvfs);
4742 4746 return (EINVAL);
4743 4747 }
4744 4748
4745 4749 off = bfp->l_start;
4746 4750 len = bfp->l_len; /* 0 means from off to end of file */
4747 4751
4748 4752 error = zfs_freesp(zp, off, len, flag, TRUE);
4749 4753
4750 4754 ZFS_EXIT(zfsvfs);
4751 4755 return (error);
4752 4756 }
4753 4757
4754 4758 /*ARGSUSED*/
4755 4759 static int
4756 4760 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4757 4761 {
4758 4762 znode_t *zp = VTOZ(vp);
4759 4763 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4760 4764 uint32_t gen;
4761 4765 uint64_t gen64;
4762 4766 uint64_t object = zp->z_id;
4763 4767 zfid_short_t *zfid;
4764 4768 int size, i, error;
4765 4769
4766 4770 ZFS_ENTER(zfsvfs);
4767 4771 ZFS_VERIFY_ZP(zp);
4768 4772
4769 4773 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4770 4774 &gen64, sizeof (uint64_t))) != 0) {
4771 4775 ZFS_EXIT(zfsvfs);
4772 4776 return (error);
4773 4777 }
4774 4778
4775 4779 gen = (uint32_t)gen64;
4776 4780
4777 4781 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4778 4782 if (fidp->fid_len < size) {
4779 4783 fidp->fid_len = size;
4780 4784 ZFS_EXIT(zfsvfs);
4781 4785 return (ENOSPC);
4782 4786 }
4783 4787
4784 4788 zfid = (zfid_short_t *)fidp;
4785 4789
4786 4790 zfid->zf_len = size;
4787 4791
4788 4792 for (i = 0; i < sizeof (zfid->zf_object); i++)
4789 4793 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4790 4794
4791 4795 /* Must have a non-zero generation number to distinguish from .zfs */
4792 4796 if (gen == 0)
4793 4797 gen = 1;
4794 4798 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4795 4799 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4796 4800
4797 4801 if (size == LONG_FID_LEN) {
4798 4802 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4799 4803 zfid_long_t *zlfid;
4800 4804
4801 4805 zlfid = (zfid_long_t *)fidp;
4802 4806
4803 4807 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4804 4808 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4805 4809
4806 4810 /* XXX - this should be the generation number for the objset */
4807 4811 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4808 4812 zlfid->zf_setgen[i] = 0;
4809 4813 }
4810 4814
4811 4815 ZFS_EXIT(zfsvfs);
4812 4816 return (0);
4813 4817 }
4814 4818
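zfs_fid() packs the object number and generation into the fid byte arrays least-significant byte first, so the encoding is endian-neutral. A minimal sketch of the inverse transform (the helper name is hypothetical; the in-tree decode sits on the zfs_vget() path):

	static uint64_t
	zfid_object_sketch(const zfid_short_t *zfid)	/* illustrative only */
	{
		uint64_t object = 0;
		int i;

		/* reassemble the little-endian byte array written by zfs_fid() */
		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= (uint64_t)zfid->zf_object[i] << (8 * i);
		return (object);
	}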
4815 4819 static int
4816 4820 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4817 4821 caller_context_t *ct)
4818 4822 {
4819 4823 znode_t *zp, *xzp;
4820 4824 zfsvfs_t *zfsvfs;
4821 4825 zfs_dirlock_t *dl;
4822 4826 int error;
4823 4827
4824 4828 switch (cmd) {
4825 4829 case _PC_LINK_MAX:
4826 4830 *valp = ULONG_MAX;
4827 4831 return (0);
4828 4832
4829 4833 case _PC_FILESIZEBITS:
4830 4834 *valp = 64;
4831 4835 return (0);
4832 4836
4833 4837 case _PC_XATTR_EXISTS:
4834 4838 zp = VTOZ(vp);
4835 4839 zfsvfs = zp->z_zfsvfs;
4836 4840 ZFS_ENTER(zfsvfs);
4837 4841 ZFS_VERIFY_ZP(zp);
4838 4842 *valp = 0;
4839 4843 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4840 4844 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4841 4845 if (error == 0) {
4842 4846 zfs_dirent_unlock(dl);
4843 4847 if (!zfs_dirempty(xzp))
4844 4848 *valp = 1;
4845 4849 VN_RELE(ZTOV(xzp));
4846 4850 } else if (error == ENOENT) {
4847 4851 /*
4848 4852 * If there aren't extended attributes, it's the
4849 4853 * same as having zero of them.
4850 4854 */
4851 4855 error = 0;
4852 4856 }
4853 4857 ZFS_EXIT(zfsvfs);
4854 4858 return (error);
4855 4859
4856 4860 case _PC_SATTR_ENABLED:
4857 4861 case _PC_SATTR_EXISTS:
4858 4862 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4859 4863 (vp->v_type == VREG || vp->v_type == VDIR);
4860 4864 return (0);
4861 4865
4862 4866 case _PC_ACCESS_FILTERING:
4863 4867 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4864 4868 vp->v_type == VDIR;
4865 4869 return (0);
4866 4870
4867 4871 case _PC_ACL_ENABLED:
4868 4872 *valp = _ACL_ACE_ENABLED;
4869 4873 return (0);
4870 4874
4871 4875 case _PC_MIN_HOLE_SIZE:
4872 4876 *valp = (ulong_t)SPA_MINBLOCKSIZE;
4873 4877 return (0);
4874 4878
4875 4879 case _PC_TIMESTAMP_RESOLUTION:
4876 4880 /* nanosecond timestamp resolution */
4877 4881 *valp = 1L;
4878 4882 return (0);
4879 4883
4880 4884 default:
4881 4885 return (fs_pathconf(vp, cmd, valp, cr, ct));
4882 4886 }
4883 4887 }
4884 4888
4885 4889 /*ARGSUSED*/
4886 4890 static int
4887 4891 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4888 4892 caller_context_t *ct)
4889 4893 {
4890 4894 znode_t *zp = VTOZ(vp);
4891 4895 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4892 4896 int error;
4893 4897 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4894 4898
4895 4899 ZFS_ENTER(zfsvfs);
4896 4900 ZFS_VERIFY_ZP(zp);
4897 4901 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4898 4902 ZFS_EXIT(zfsvfs);
4899 4903
4900 4904 return (error);
4901 4905 }
4902 4906
4903 4907 /*ARGSUSED*/
4904 4908 static int
4905 4909 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4906 4910 caller_context_t *ct)
4907 4911 {
4908 4912 znode_t *zp = VTOZ(vp);
4909 4913 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4910 4914 int error;
4911 4915 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4912 4916 zilog_t *zilog = zfsvfs->z_log;
4913 4917
4914 4918 ZFS_ENTER(zfsvfs);
4915 4919 ZFS_VERIFY_ZP(zp);
4916 4920
4917 4921 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4918 4922
4919 4923 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4920 4924 zil_commit(zilog, 0);
4921 4925
4922 4926 ZFS_EXIT(zfsvfs);
4923 4927 return (error);
4924 4928 }
4925 4929
4926 4930 /*
4927 4931 	 * Tunables; both must be a power of 2.
4928 4932 *
4929 4933 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
4930 4934 * zcr_blksz_max: if set to less than the file block size, allow loaning out of
4931 4935 * an arcbuf for a partial block read
4932 4936 */
4933 4937 int zcr_blksz_min = (1 << 10); /* 1K */
4934 4938 int zcr_blksz_max = (1 << 17); /* 128K */
4935 4939
4936 4940 /*ARGSUSED*/
4937 4941 static int
4938 4942 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
4939 4943 caller_context_t *ct)
4940 4944 {
4941 4945 znode_t *zp = VTOZ(vp);
4942 4946 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4943 4947 int max_blksz = zfsvfs->z_max_blksz;
4944 4948 uio_t *uio = &xuio->xu_uio;
4945 4949 ssize_t size = uio->uio_resid;
4946 4950 offset_t offset = uio->uio_loffset;
4947 4951 int blksz;
4948 4952 int fullblk, i;
4949 4953 arc_buf_t *abuf;
4950 4954 ssize_t maxsize;
4951 4955 int preamble, postamble;
4952 4956
4953 4957 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4954 4958 return (EINVAL);
4955 4959
4956 4960 ZFS_ENTER(zfsvfs);
4957 4961 ZFS_VERIFY_ZP(zp);
4958 4962 switch (ioflag) {
4959 4963 case UIO_WRITE:
4960 4964 /*
4961 4965 * Loan out an arc_buf for write if write size is bigger than
4962 4966 * max_blksz, and the file's block size is also max_blksz.
4963 4967 */
4964 4968 blksz = max_blksz;
4965 4969 if (size < blksz || zp->z_blksz != blksz) {
4966 4970 ZFS_EXIT(zfsvfs);
4967 4971 return (EINVAL);
4968 4972 }
4969 4973 /*
4970 4974 * Caller requests buffers for write before knowing where the
4971 4975 * write offset might be (e.g. NFS TCP write).
4972 4976 */
4973 4977 if (offset == -1) {
4974 4978 preamble = 0;
4975 4979 } else {
4976 4980 preamble = P2PHASE(offset, blksz);
4977 4981 if (preamble) {
4978 4982 preamble = blksz - preamble;
4979 4983 size -= preamble;
4980 4984 }
4981 4985 }
4982 4986
4983 4987 postamble = P2PHASE(size, blksz);
4984 4988 size -= postamble;
4985 4989
4986 4990 fullblk = size / blksz;
4987 4991 (void) dmu_xuio_init(xuio,
4988 4992 (preamble != 0) + fullblk + (postamble != 0));
4989 4993 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
4990 4994 int, postamble, int,
4991 4995 (preamble != 0) + fullblk + (postamble != 0));
4992 4996
4993 4997 /*
4994 4998 * Have to fix iov base/len for partial buffers. They
4995 4999 * currently represent full arc_buf's.
4996 5000 */
4997 5001 if (preamble) {
4998 5002 /* data begins in the middle of the arc_buf */
4999 5003 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5000 5004 blksz);
5001 5005 ASSERT(abuf);
5002 5006 (void) dmu_xuio_add(xuio, abuf,
5003 5007 blksz - preamble, preamble);
5004 5008 }
5005 5009
5006 5010 for (i = 0; i < fullblk; i++) {
5007 5011 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5008 5012 blksz);
5009 5013 ASSERT(abuf);
5010 5014 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5011 5015 }
5012 5016
5013 5017 if (postamble) {
5014 5018 /* data ends in the middle of the arc_buf */
5015 5019 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5016 5020 blksz);
5017 5021 ASSERT(abuf);
5018 5022 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5019 5023 }
5020 5024 break;
5021 5025 case UIO_READ:
5022 5026 /*
5023 5027 * Loan out an arc_buf for read if the read size is larger than
5024 5028 * the current file block size. Block alignment is not
5025 5029 * considered. Partial arc_buf will be loaned out for read.
5026 5030 */
5027 5031 blksz = zp->z_blksz;
5028 5032 if (blksz < zcr_blksz_min)
5029 5033 blksz = zcr_blksz_min;
5030 5034 if (blksz > zcr_blksz_max)
5031 5035 blksz = zcr_blksz_max;
5032 5036 /* avoid potential complexity of dealing with it */
5033 5037 if (blksz > max_blksz) {
5034 5038 ZFS_EXIT(zfsvfs);
5035 5039 return (EINVAL);
5036 5040 }
5037 5041
5038 5042 maxsize = zp->z_size - uio->uio_loffset;
5039 5043 if (size > maxsize)
5040 5044 size = maxsize;
5041 5045
5042 5046 if (size < blksz || vn_has_cached_data(vp)) {
5043 5047 ZFS_EXIT(zfsvfs);
5044 5048 return (EINVAL);
5045 5049 }
5046 5050 break;
5047 5051 default:
5048 5052 ZFS_EXIT(zfsvfs);
5049 5053 return (EINVAL);
5050 5054 }
5051 5055
5052 5056 uio->uio_extflg = UIO_XUIO;
5053 5057 XUIO_XUZC_RW(xuio) = ioflag;
5054 5058 ZFS_EXIT(zfsvfs);
5055 5059 return (0);
5056 5060 }
5057 5061
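In the UIO_WRITE arm above, the request is carved into an optional preamble (the uncovered tail of the first block), whole blocks, and an optional postamble (the covered head of the last block). A worked example with assumed numbers:

	/*
	 * Assume blksz = 128K, offset = 20K, size = 300K:
	 *
	 *	preamble  = blksz - P2PHASE(20K, 128K) = 108K	(rest of block 0)
	 *	size	 -= 108K, leaving 192K
	 *	postamble = P2PHASE(192K, 128K)        = 64K	(head of last block)
	 *	fullblk   = (192K - 64K) / 128K        = 1
	 *
	 * dmu_xuio_init() is therefore asked for 1 + 1 + 1 = 3 loaned arc_bufs.
	 */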
5058 5062 /*ARGSUSED*/
5059 5063 static int
5060 5064 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5061 5065 {
5062 5066 int i;
5063 5067 arc_buf_t *abuf;
5064 5068 int ioflag = XUIO_XUZC_RW(xuio);
5065 5069
5066 5070 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5067 5071
5068 5072 i = dmu_xuio_cnt(xuio);
5069 5073 while (i-- > 0) {
5070 5074 abuf = dmu_xuio_arcbuf(xuio, i);
5071 5075 /*
5072 5076 * if abuf == NULL, it must be a write buffer
5073 5077 * that has been returned in zfs_write().
5074 5078 */
5075 5079 if (abuf)
5076 5080 dmu_return_arcbuf(abuf);
5077 5081 ASSERT(abuf || ioflag == UIO_WRITE);
5078 5082 }
5079 5083
5080 5084 dmu_xuio_fini(xuio);
5081 5085 return (0);
5082 5086 }
5083 5087
5084 5088 /*
5085 5089 * Predeclare these here so that the compiler assumes that
5086 5090 * this is an "old style" function declaration that does
5087 5091 * not include arguments => we won't get type mismatch errors
5088 5092 * in the initializations that follow.
5089 5093 */
5090 5094 static int zfs_inval();
5091 5095 static int zfs_isdir();
5092 5096
5093 5097 static int
5094 5098 zfs_inval()
5095 5099 {
5096 5100 return (EINVAL);
5097 5101 }
5098 5102
5099 5103 static int
5100 5104 zfs_isdir()
5101 5105 {
5102 5106 return (EISDIR);
5103 5107 }
5104 5108 /*
5105 5109 * Directory vnode operations template
5106 5110 */
5107 5111 vnodeops_t *zfs_dvnodeops;
5108 5112 const fs_operation_def_t zfs_dvnodeops_template[] = {
5109 5113 VOPNAME_OPEN, { .vop_open = zfs_open },
5110 5114 VOPNAME_CLOSE, { .vop_close = zfs_close },
5111 5115 VOPNAME_READ, { .error = zfs_isdir },
5112 5116 VOPNAME_WRITE, { .error = zfs_isdir },
5113 5117 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5114 5118 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5115 5119 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5116 5120 VOPNAME_ACCESS, { .vop_access = zfs_access },
5117 5121 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5118 5122 VOPNAME_CREATE, { .vop_create = zfs_create },
5119 5123 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5120 5124 VOPNAME_LINK, { .vop_link = zfs_link },
5121 5125 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5122 5126 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
5123 5127 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5124 5128 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5125 5129 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
5126 5130 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5127 5131 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5128 5132 VOPNAME_FID, { .vop_fid = zfs_fid },
5129 5133 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5130 5134 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5131 5135 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5132 5136 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5133 5137 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5134 5138 NULL, NULL
5135 5139 };
5136 5140
5137 5141 /*
5138 5142 * Regular file vnode operations template
5139 5143 */
5140 5144 vnodeops_t *zfs_fvnodeops;
5141 5145 const fs_operation_def_t zfs_fvnodeops_template[] = {
5142 5146 VOPNAME_OPEN, { .vop_open = zfs_open },
5143 5147 VOPNAME_CLOSE, { .vop_close = zfs_close },
5144 5148 VOPNAME_READ, { .vop_read = zfs_read },
5145 5149 VOPNAME_WRITE, { .vop_write = zfs_write },
5146 5150 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5147 5151 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5148 5152 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5149 5153 VOPNAME_ACCESS, { .vop_access = zfs_access },
5150 5154 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5151 5155 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5152 5156 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5153 5157 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5154 5158 VOPNAME_FID, { .vop_fid = zfs_fid },
5155 5159 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5156 5160 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
5157 5161 VOPNAME_SPACE, { .vop_space = zfs_space },
5158 5162 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
5159 5163 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
5160 5164 VOPNAME_MAP, { .vop_map = zfs_map },
5161 5165 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
5162 5166 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
5163 5167 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5164 5168 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5165 5169 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5166 5170 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5167 5171 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
5168 5172 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
5169 5173 NULL, NULL
5170 5174 };
5171 5175
5172 5176 /*
5173 5177 * Symbolic link vnode operations template
5174 5178 */
5175 5179 vnodeops_t *zfs_symvnodeops;
5176 5180 const fs_operation_def_t zfs_symvnodeops_template[] = {
5177 5181 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5178 5182 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5179 5183 VOPNAME_ACCESS, { .vop_access = zfs_access },
5180 5184 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5181 5185 VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
5182 5186 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5183 5187 VOPNAME_FID, { .vop_fid = zfs_fid },
5184 5188 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5185 5189 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5186 5190 NULL, NULL
5187 5191 };
5188 5192
5189 5193 /*
5190 5194 	 * Special share hidden files vnode operations template
5191 5195 */
5192 5196 vnodeops_t *zfs_sharevnodeops;
5193 5197 const fs_operation_def_t zfs_sharevnodeops_template[] = {
5194 5198 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5195 5199 VOPNAME_ACCESS, { .vop_access = zfs_access },
5196 5200 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5197 5201 VOPNAME_FID, { .vop_fid = zfs_fid },
5198 5202 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5199 5203 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5200 5204 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5201 5205 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5202 5206 NULL, NULL
5203 5207 };
5204 5208
5205 5209 /*
5206 5210 * Extended attribute directory vnode operations template
5207 5211 * This template is identical to the directory vnodes
5208 5212 * operation template except for restricted operations:
5209 5213 * VOP_MKDIR()
5210 5214 * VOP_SYMLINK()
5211 5215 * Note that there are other restrictions embedded in:
5212 5216 * zfs_create() - restrict type to VREG
5213 5217 * zfs_link() - no links into/out of attribute space
5214 5218 * zfs_rename() - no moves into/out of attribute space
5215 5219 */
5216 5220 vnodeops_t *zfs_xdvnodeops;
5217 5221 const fs_operation_def_t zfs_xdvnodeops_template[] = {
5218 5222 VOPNAME_OPEN, { .vop_open = zfs_open },
5219 5223 VOPNAME_CLOSE, { .vop_close = zfs_close },
5220 5224 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5221 5225 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5222 5226 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5223 5227 VOPNAME_ACCESS, { .vop_access = zfs_access },
5224 5228 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5225 5229 VOPNAME_CREATE, { .vop_create = zfs_create },
5226 5230 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5227 5231 VOPNAME_LINK, { .vop_link = zfs_link },
5228 5232 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5229 5233 VOPNAME_MKDIR, { .error = zfs_inval },
5230 5234 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5231 5235 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5232 5236 VOPNAME_SYMLINK, { .error = zfs_inval },
5233 5237 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5234 5238 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5235 5239 VOPNAME_FID, { .vop_fid = zfs_fid },
5236 5240 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5237 5241 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5238 5242 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5239 5243 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5240 5244 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5241 5245 NULL, NULL
5242 5246 };
5243 5247
5244 5248 /*
5245 5249 * Error vnode operations template
5246 5250 */
5247 5251 vnodeops_t *zfs_evnodeops;
5248 5252 const fs_operation_def_t zfs_evnodeops_template[] = {
5249 5253 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5250 5254 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5251 5255 NULL, NULL
5252 5256 };
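These templates are consumed rather than used directly: registration turns each fs_operation_def_t array into a live vnodeops_t via vn_make_ops() (that step happens outside this hunk, in zfs_create_op_tables()). A hedged sketch of one such registration:

	/* sketch; error handling trimmed */
	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error != 0)
		cmn_err(CE_WARN, "zfs: bad directory vnode ops template");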