Possibility to physically reserve space without writing leaf blocks
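This webrev adds a new _FIO_RESERVE_SPACE ioctl to zfs_vnops.c, handled in zfs_ioctl(): it copies in a uint64_t byte count and calls the new zfs_zero_write(), which allocates zero-filled blocks in ZFS_RESERVE_CHUNK (2 MB) transactions via dmu_write_zero(), so the space is physically allocated rather than left as a sparse hole. Below is a minimal, hypothetical userland sketch of driving the ioctl; it assumes _FIO_RESERVE_SPACE is made visible through <sys/filio.h> by the companion header change (not shown in this file), and the file path and size are illustrative only. The target file must be empty (zfs_zero_write() rejects a non-zero z_size with EFBIG) and the pool must have the SPA_FEATURE_SPACE_RESERVATION feature enabled.

	#include <sys/types.h>
	#include <sys/filio.h>	/* assumed home of _FIO_RESERVE_SPACE */
	#include <fcntl.h>
	#include <stropts.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	int
	main(void)
	{
		uint64_t size = 1ULL << 30;	/* reserve 1 GiB (illustrative) */
		int fd;

		/* Must be a freshly created, zero-length file. */
		fd = open("/tank/fs/reserved.dat", O_RDWR | O_CREAT | O_EXCL, 0644);
		if (fd == -1) {
			perror("open");
			return (1);
		}

		/* The kernel side copies in a uint64_t byte count (see zfs_ioctl()). */
		if (ioctl(fd, _FIO_RESERVE_SPACE, &size) == -1)
			(void) fprintf(stderr, "_FIO_RESERVE_SPACE: %s\n",
			    strerror(errno));

		(void) close(fd);
		return (0);
	}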
--- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Portions Copyright 2007 Jeremy Teo */
28 28 /* Portions Copyright 2010 Robert Milkowski */
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/param.h>
32 32 #include <sys/time.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/sysmacros.h>
35 35 #include <sys/resource.h>
36 36 #include <sys/vfs.h>
37 37 #include <sys/vfs_opreg.h>
38 38 #include <sys/vnode.h>
39 39 #include <sys/file.h>
40 40 #include <sys/stat.h>
41 41 #include <sys/kmem.h>
42 42 #include <sys/taskq.h>
43 43 #include <sys/uio.h>
44 44 #include <sys/vmsystm.h>
45 45 #include <sys/atomic.h>
46 46 #include <sys/vm.h>
47 47 #include <vm/seg_vn.h>
48 48 #include <vm/pvn.h>
49 49 #include <vm/as.h>
50 50 #include <vm/kpm.h>
51 51 #include <vm/seg_kpm.h>
52 52 #include <sys/mman.h>
53 53 #include <sys/pathname.h>
54 54 #include <sys/cmn_err.h>
55 55 #include <sys/errno.h>
56 56 #include <sys/unistd.h>
57 57 #include <sys/zfs_dir.h>
58 58 #include <sys/zfs_acl.h>
59 59 #include <sys/zfs_ioctl.h>
60 60 #include <sys/fs/zfs.h>
61 61 #include <sys/dmu.h>
62 62 #include <sys/dmu_objset.h>
63 63 #include <sys/spa.h>
64 64 #include <sys/txg.h>
65 65 #include <sys/dbuf.h>
66 66 #include <sys/zap.h>
67 67 #include <sys/sa.h>
68 68 #include <sys/dirent.h>
69 69 #include <sys/policy.h>
70 70 #include <sys/sunddi.h>
71 71 #include <sys/filio.h>
72 72 #include <sys/sid.h>
73 73 #include "fs/fs_subr.h"
74 74 #include <sys/zfs_ctldir.h>
75 75 #include <sys/zfs_fuid.h>
76 76 #include <sys/zfs_sa.h>
77 +#include <sys/zfeature.h>
77 78 #include <sys/dnlc.h>
78 79 #include <sys/zfs_rlock.h>
79 80 #include <sys/extdirent.h>
80 81 #include <sys/kidmap.h>
81 82 #include <sys/cred.h>
82 83 #include <sys/attr.h>
83 84
84 85 /*
85 86 * Programming rules.
86 87 *
87 88 * Each vnode op performs some logical unit of work. To do this, the ZPL must
88 89 * properly lock its in-core state, create a DMU transaction, do the work,
89 90 * record this work in the intent log (ZIL), commit the DMU transaction,
90 91 * and wait for the intent log to commit if it is a synchronous operation.
91 92 * Moreover, the vnode ops must work in both normal and log replay context.
92 93 * The ordering of events is important to avoid deadlocks and references
93 94 * to freed memory. The example below illustrates the following Big Rules:
94 95 *
95 96 * (1) A check must be made in each zfs thread for a mounted file system.
96 97 * This is done avoiding races using ZFS_ENTER(zfsvfs).
97 98 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
98 99 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
99 100 * can return EIO from the calling function.
100 101 *
101 102 * (2) VN_RELE() should always be the last thing except for zil_commit()
102 103 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
103 104 * First, if it's the last reference, the vnode/znode
104 105 * can be freed, so the zp may point to freed memory. Second, the last
105 106 * reference will call zfs_zinactive(), which may induce a lot of work --
106 107 * pushing cached pages (which acquires range locks) and syncing out
107 108 * cached atime changes. Third, zfs_zinactive() may require a new tx,
108 109 * which could deadlock the system if you were already holding one.
109 110 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
110 111 *
111 112 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
112 113 * as they can span dmu_tx_assign() calls.
113 114 *
114 115 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
115 116 * dmu_tx_assign(). This is critical because we don't want to block
116 117 * while holding locks.
117 118 *
118 119 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
119 120 * reduces lock contention and CPU usage when we must wait (note that if
120 121 * throughput is constrained by the storage, nearly every transaction
121 122 * must wait).
122 123 *
123 124 * Note, in particular, that if a lock is sometimes acquired before
124 125 * the tx assigns, and sometimes after (e.g. z_lock), then failing
125 126 * to use a non-blocking assign can deadlock the system. The scenario:
126 127 *
127 128 * Thread A has grabbed a lock before calling dmu_tx_assign().
128 129 * Thread B is in an already-assigned tx, and blocks for this lock.
129 130 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
130 131 * forever, because the previous txg can't quiesce until B's tx commits.
131 132 *
132 133 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
133 134 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
134 135 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
135 136 * to indicate that this operation has already called dmu_tx_wait().
136 137 * This will ensure that we don't retry forever, waiting a short bit
137 138 * each time.
138 139 *
139 140 * (5) If the operation succeeded, generate the intent log entry for it
140 141 * before dropping locks. This ensures that the ordering of events
141 142 * in the intent log matches the order in which they actually occurred.
142 143 * During ZIL replay the zfs_log_* functions will update the sequence
143 144 * number to indicate the zil transaction has replayed.
144 145 *
145 146 * (6) At the end of each vnode op, the DMU tx must always commit,
146 147 * regardless of whether there were any errors.
147 148 *
148 149 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
149 150 * to ensure that synchronous semantics are provided when necessary.
150 151 *
151 152 * In general, this is how things should be ordered in each vnode op:
152 153 *
153 154 * ZFS_ENTER(zfsvfs); // exit if unmounted
154 155 * top:
155 156 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
156 157 * rw_enter(...); // grab any other locks you need
157 158 * tx = dmu_tx_create(...); // get DMU tx
158 159 * dmu_tx_hold_*(); // hold each object you might modify
159 160 * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
160 161 * if (error) {
161 162 * rw_exit(...); // drop locks
162 163 * zfs_dirent_unlock(dl); // unlock directory entry
163 164 * VN_RELE(...); // release held vnodes
164 165 * if (error == ERESTART) {
165 166 * waited = B_TRUE;
166 167 * dmu_tx_wait(tx);
167 168 * dmu_tx_abort(tx);
168 169 * goto top;
169 170 * }
170 171 * dmu_tx_abort(tx); // abort DMU tx
171 172 * ZFS_EXIT(zfsvfs); // finished in zfs
172 173 * return (error); // really out of space
173 174 * }
174 175 * error = do_real_work(); // do whatever this VOP does
175 176 * if (error == 0)
176 177 * zfs_log_*(...); // on success, make ZIL entry
177 178 * dmu_tx_commit(tx); // commit DMU tx -- error or not
178 179 * rw_exit(...); // drop locks
179 180 * zfs_dirent_unlock(dl); // unlock directory entry
180 181 * VN_RELE(...); // release held vnodes
181 182 * zil_commit(zilog, foid); // synchronous when necessary
182 183 * ZFS_EXIT(zfsvfs); // finished in zfs
183 184 * return (error); // done, report error
184 185 */
185 186
186 187 /* ARGSUSED */
187 188 static int
188 189 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
189 190 {
190 191 znode_t *zp = VTOZ(*vpp);
191 192 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
192 193
193 194 ZFS_ENTER(zfsvfs);
194 195 ZFS_VERIFY_ZP(zp);
195 196
196 197 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
197 198 ((flag & FAPPEND) == 0)) {
198 199 ZFS_EXIT(zfsvfs);
199 200 return (SET_ERROR(EPERM));
200 201 }
201 202
202 203 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
203 204 ZTOV(zp)->v_type == VREG &&
204 205 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
205 206 if (fs_vscan(*vpp, cr, 0) != 0) {
206 207 ZFS_EXIT(zfsvfs);
207 208 return (SET_ERROR(EACCES));
208 209 }
209 210 }
210 211
211 212 /* Keep a count of the synchronous opens in the znode */
212 213 if (flag & (FSYNC | FDSYNC))
213 214 atomic_inc_32(&zp->z_sync_cnt);
214 215
215 216 ZFS_EXIT(zfsvfs);
216 217 return (0);
217 218 }
218 219
219 220 /* ARGSUSED */
220 221 static int
221 222 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
222 223 caller_context_t *ct)
223 224 {
224 225 znode_t *zp = VTOZ(vp);
225 226 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
226 227
227 228 /*
228 229 * Clean up any locks held by this process on the vp.
229 230 */
230 231 cleanlocks(vp, ddi_get_pid(), 0);
231 232 cleanshares(vp, ddi_get_pid());
232 233
233 234 ZFS_ENTER(zfsvfs);
234 235 ZFS_VERIFY_ZP(zp);
235 236
236 237 /* Decrement the synchronous opens in the znode */
237 238 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
238 239 atomic_dec_32(&zp->z_sync_cnt);
239 240
240 241 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
241 242 ZTOV(zp)->v_type == VREG &&
242 243 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
243 244 VERIFY(fs_vscan(vp, cr, 1) == 0);
244 245
245 246 ZFS_EXIT(zfsvfs);
246 247 return (0);
247 248 }
248 249
249 250 /*
250 251 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
251 252 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
252 253 */
253 254 static int
254 255 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
255 256 {
256 257 znode_t *zp = VTOZ(vp);
257 258 uint64_t noff = (uint64_t)*off; /* new offset */
258 259 uint64_t file_sz;
259 260 int error;
260 261 boolean_t hole;
261 262
262 263 file_sz = zp->z_size;
263 264 if (noff >= file_sz) {
264 265 return (SET_ERROR(ENXIO));
265 266 }
266 267
267 268 if (cmd == _FIO_SEEK_HOLE)
268 269 hole = B_TRUE;
269 270 else
270 271 hole = B_FALSE;
271 272
272 273 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
273 274
274 275 if (error == ESRCH)
275 276 return (SET_ERROR(ENXIO));
276 277
277 278 /*
278 279 * We could find a hole that begins after the logical end-of-file,
279 280 * because dmu_offset_next() only works on whole blocks. If the
280 281 * EOF falls mid-block, then indicate that the "virtual hole"
281 282 * at the end of the file begins at the logical EOF, rather than
282 283 * at the end of the last block.
283 284 */
284 285 if (noff > file_sz) {
285 286 ASSERT(hole);
286 287 noff = file_sz;
287 288 }
288 289
289 290 if (noff < *off)
290 291 return (error);
291 292 *off = noff;
292 293 return (error);
293 294 }
294 295
296 +
297 +static int zfs_zero_write(vnode_t *vp, uint64_t size, cred_t *cr,
298 + caller_context_t *ct);
299 +
295 300 /* ARGSUSED */
296 301 static int
297 302 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
298 303 int *rvalp, caller_context_t *ct)
299 304 {
300 305 offset_t off;
301 306 int error;
302 307 zfsvfs_t *zfsvfs;
303 308 znode_t *zp;
309 + uint64_t size;
304 310
305 311 switch (com) {
306 312 case _FIOFFS:
307 313 return (zfs_sync(vp->v_vfsp, 0, cred));
308 314
309 315 /*
310 316 * The following two ioctls are used by bfu. Faking out,
311 317 * necessary to avoid bfu errors.
312 318 */
313 319 case _FIOGDIO:
314 320 case _FIOSDIO:
315 321 return (0);
316 322
317 323 case _FIO_SEEK_DATA:
318 324 case _FIO_SEEK_HOLE:
319 325 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
320 326 return (SET_ERROR(EFAULT));
321 327
322 328 zp = VTOZ(vp);
323 329 zfsvfs = zp->z_zfsvfs;
324 330 ZFS_ENTER(zfsvfs);
325 331 ZFS_VERIFY_ZP(zp);
326 332
327 333 /* offset parameter is in/out */
328 334 error = zfs_holey(vp, com, &off);
329 335 ZFS_EXIT(zfsvfs);
330 336 if (error)
331 337 return (error);
332 338 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
333 339 return (SET_ERROR(EFAULT));
334 340 return (0);
341 + case _FIO_RESERVE_SPACE:
342 + if (ddi_copyin((void *)data, &size, sizeof (size), flag))
343 + return (SET_ERROR(EFAULT));
344 + error = zfs_zero_write(vp, size, cred, ct);
345 + return (error);
335 346 }
336 347 return (SET_ERROR(ENOTTY));
337 348 }
338 349
339 350 /*
340 351 * Utility functions to map and unmap a single physical page. These
341 352 * are used to manage the mappable copies of ZFS file data, and therefore
342 353 * do not update ref/mod bits.
343 354 */
344 355 caddr_t
345 356 zfs_map_page(page_t *pp, enum seg_rw rw)
346 357 {
347 358 if (kpm_enable)
348 359 return (hat_kpm_mapin(pp, 0));
349 360 ASSERT(rw == S_READ || rw == S_WRITE);
350 361 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
351 362 (caddr_t)-1));
352 363 }
353 364
354 365 void
355 366 zfs_unmap_page(page_t *pp, caddr_t addr)
356 367 {
357 368 if (kpm_enable) {
358 369 hat_kpm_mapout(pp, 0, addr);
359 370 } else {
360 371 ppmapout(addr);
361 372 }
362 373 }
363 374
364 375 /*
365 376 * When a file is memory mapped, we must keep the IO data synchronized
366 377 * between the DMU cache and the memory mapped pages. What this means:
367 378 *
368 379 * On Write: If we find a memory mapped page, we write to *both*
369 380 * the page and the dmu buffer.
370 381 */
371 382 static void
372 383 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
373 384 {
374 385 int64_t off;
375 386
376 387 off = start & PAGEOFFSET;
377 388 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
378 389 page_t *pp;
379 390 uint64_t nbytes = MIN(PAGESIZE - off, len);
380 391
381 392 if (pp = page_lookup(vp, start, SE_SHARED)) {
382 393 caddr_t va;
383 394
384 395 va = zfs_map_page(pp, S_WRITE);
385 396 (void) dmu_read(os, oid, start+off, nbytes, va+off,
386 397 DMU_READ_PREFETCH);
387 398 zfs_unmap_page(pp, va);
388 399 page_unlock(pp);
389 400 }
390 401 len -= nbytes;
391 402 off = 0;
392 403 }
393 404 }
394 405
395 406 /*
396 407 * When a file is memory mapped, we must keep the IO data synchronized
397 408 * between the DMU cache and the memory mapped pages. What this means:
398 409 *
399 410 * On Read: We "read" preferentially from memory mapped pages,
400 411 * else we default from the dmu buffer.
401 412 *
402 413 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
403 414 * the file is memory mapped.
404 415 */
405 416 static int
406 417 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
407 418 {
408 419 znode_t *zp = VTOZ(vp);
409 420 int64_t start, off;
410 421 int len = nbytes;
411 422 int error = 0;
412 423
413 424 start = uio->uio_loffset;
414 425 off = start & PAGEOFFSET;
415 426 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
416 427 page_t *pp;
417 428 uint64_t bytes = MIN(PAGESIZE - off, len);
418 429
419 430 if (pp = page_lookup(vp, start, SE_SHARED)) {
420 431 caddr_t va;
421 432
422 433 va = zfs_map_page(pp, S_READ);
423 434 error = uiomove(va + off, bytes, UIO_READ, uio);
424 435 zfs_unmap_page(pp, va);
425 436 page_unlock(pp);
426 437 } else {
427 438 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
428 439 uio, bytes);
429 440 }
430 441 len -= bytes;
431 442 off = 0;
432 443 if (error)
433 444 break;
434 445 }
435 446 return (error);
436 447 }
437 448
438 449 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
439 450
440 451 /*
441 452 * Read bytes from specified file into supplied buffer.
442 453 *
443 454 * IN: vp - vnode of file to be read from.
444 455 * uio - structure supplying read location, range info,
445 456 * and return buffer.
446 457 * ioflag - SYNC flags; used to provide FRSYNC semantics.
447 458 * cr - credentials of caller.
448 459 * ct - caller context
449 460 *
450 461 * OUT: uio - updated offset and range, buffer filled.
451 462 *
452 463 * RETURN: 0 on success, error code on failure.
453 464 *
454 465 * Side Effects:
455 466 * vp - atime updated if byte count > 0
456 467 */
457 468 /* ARGSUSED */
458 469 static int
459 470 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
460 471 {
461 472 znode_t *zp = VTOZ(vp);
462 473 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
463 474 ssize_t n, nbytes;
464 475 int error = 0;
465 476 rl_t *rl;
466 477 xuio_t *xuio = NULL;
467 478
468 479 ZFS_ENTER(zfsvfs);
469 480 ZFS_VERIFY_ZP(zp);
470 481
471 482 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
472 483 ZFS_EXIT(zfsvfs);
473 484 return (SET_ERROR(EACCES));
474 485 }
475 486
476 487 /*
477 488 * Validate file offset
478 489 */
479 490 if (uio->uio_loffset < (offset_t)0) {
480 491 ZFS_EXIT(zfsvfs);
481 492 return (SET_ERROR(EINVAL));
482 493 }
483 494
484 495 /*
485 496 * Fasttrack empty reads
486 497 */
487 498 if (uio->uio_resid == 0) {
488 499 ZFS_EXIT(zfsvfs);
489 500 return (0);
490 501 }
491 502
492 503 /*
493 504 * Check for mandatory locks
494 505 */
495 506 if (MANDMODE(zp->z_mode)) {
496 507 if (error = chklock(vp, FREAD,
497 508 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
498 509 ZFS_EXIT(zfsvfs);
499 510 return (error);
500 511 }
501 512 }
502 513
503 514 /*
504 515 * If we're in FRSYNC mode, sync out this znode before reading it.
505 516 */
506 517 if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
507 518 zil_commit(zfsvfs->z_log, zp->z_id);
508 519
509 520 /*
510 521 * Lock the range against changes.
511 522 */
512 523 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
513 524
514 525 /*
515 526 * If we are reading past end-of-file we can skip
516 527 * to the end; but we might still need to set atime.
517 528 */
518 529 if (uio->uio_loffset >= zp->z_size) {
519 530 error = 0;
520 531 goto out;
521 532 }
522 533
523 534 ASSERT(uio->uio_loffset < zp->z_size);
524 535 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
525 536
526 537 if ((uio->uio_extflg == UIO_XUIO) &&
527 538 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
528 539 int nblk;
529 540 int blksz = zp->z_blksz;
530 541 uint64_t offset = uio->uio_loffset;
531 542
532 543 xuio = (xuio_t *)uio;
533 544 if ((ISP2(blksz))) {
534 545 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
535 546 blksz)) / blksz;
536 547 } else {
537 548 ASSERT(offset + n <= blksz);
538 549 nblk = 1;
539 550 }
540 551 (void) dmu_xuio_init(xuio, nblk);
541 552
542 553 if (vn_has_cached_data(vp)) {
543 554 /*
544 555 * For simplicity, we always allocate a full buffer
545 556 * even if we only expect to read a portion of a block.
546 557 */
547 558 while (--nblk >= 0) {
548 559 (void) dmu_xuio_add(xuio,
549 560 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
550 561 blksz), 0, blksz);
551 562 }
552 563 }
553 564 }
554 565
555 566 while (n > 0) {
556 567 nbytes = MIN(n, zfs_read_chunk_size -
557 568 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
558 569
559 570 if (vn_has_cached_data(vp)) {
560 571 error = mappedread(vp, nbytes, uio);
561 572 } else {
562 573 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
563 574 uio, nbytes);
564 575 }
565 576 if (error) {
566 577 /* convert checksum errors into IO errors */
567 578 if (error == ECKSUM)
568 579 error = SET_ERROR(EIO);
569 580 break;
570 581 }
571 582
572 583 n -= nbytes;
573 584 }
574 585 out:
575 586 zfs_range_unlock(rl);
576 587
577 588 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
578 589 ZFS_EXIT(zfsvfs);
579 590 return (error);
580 591 }
581 592
582 593 /*
583 594 * Write the bytes to a file.
584 595 *
585 596 * IN: vp - vnode of file to be written to.
586 597 * uio - structure supplying write location, range info,
587 598 * and data buffer.
588 599 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
589 600 * set if in append mode.
590 601 * cr - credentials of caller.
591 602 * ct - caller context (NFS/CIFS fem monitor only)
592 603 *
593 604 * OUT: uio - updated offset and range.
594 605 *
595 606 * RETURN: 0 on success, error code on failure.
596 607 *
597 608 * Timestamps:
598 609 * vp - ctime|mtime updated if byte count > 0
599 610 */
600 611
601 612 /* ARGSUSED */
602 613 static int
603 614 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
604 615 {
605 616 znode_t *zp = VTOZ(vp);
606 617 rlim64_t limit = uio->uio_llimit;
607 618 ssize_t start_resid = uio->uio_resid;
608 619 ssize_t tx_bytes;
609 620 uint64_t end_size;
610 621 dmu_tx_t *tx;
611 622 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
612 623 zilog_t *zilog;
613 624 offset_t woff;
614 625 ssize_t n, nbytes;
615 626 rl_t *rl;
616 627 int max_blksz = zfsvfs->z_max_blksz;
617 628 int error = 0;
618 629 arc_buf_t *abuf;
619 630 iovec_t *aiov = NULL;
620 631 xuio_t *xuio = NULL;
621 632 int i_iov = 0;
622 633 int iovcnt = uio->uio_iovcnt;
623 634 iovec_t *iovp = uio->uio_iov;
624 635 int write_eof;
625 636 int count = 0;
626 637 sa_bulk_attr_t bulk[4];
627 638 uint64_t mtime[2], ctime[2];
628 639
629 640 /*
630 641 * Fasttrack empty write
631 642 */
632 643 n = start_resid;
633 644 if (n == 0)
634 645 return (0);
635 646
636 647 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
637 648 limit = MAXOFFSET_T;
638 649
639 650 ZFS_ENTER(zfsvfs);
640 651 ZFS_VERIFY_ZP(zp);
641 652
642 653 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
643 654 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
644 655 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
645 656 &zp->z_size, 8);
646 657 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
647 658 &zp->z_pflags, 8);
648 659
649 660 /*
650 661 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
651 662 * callers might not be able to detect properly that we are read-only,
652 663 * so check it explicitly here.
653 664 */
654 665 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
655 666 ZFS_EXIT(zfsvfs);
656 667 return (SET_ERROR(EROFS));
657 668 }
658 669
659 670 /*
660 671 * If immutable or not appending then return EPERM
661 672 */
662 673 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
663 674 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
664 675 (uio->uio_loffset < zp->z_size))) {
665 676 ZFS_EXIT(zfsvfs);
666 677 return (SET_ERROR(EPERM));
667 678 }
668 679
669 680 zilog = zfsvfs->z_log;
670 681
671 682 /*
672 683 * Validate file offset
673 684 */
674 685 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
675 686 if (woff < 0) {
676 687 ZFS_EXIT(zfsvfs);
677 688 return (SET_ERROR(EINVAL));
678 689 }
679 690
680 691 /*
681 692 * Check for mandatory locks before calling zfs_range_lock()
682 693 * in order to prevent a deadlock with locks set via fcntl().
683 694 */
684 695 if (MANDMODE((mode_t)zp->z_mode) &&
685 696 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
686 697 ZFS_EXIT(zfsvfs);
687 698 return (error);
688 699 }
689 700
690 701 /*
691 702 * Pre-fault the pages to ensure slow (eg NFS) pages
692 703 * don't hold up txg.
693 704 * Skip this if uio contains loaned arc_buf.
694 705 */
695 706 if ((uio->uio_extflg == UIO_XUIO) &&
696 707 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
697 708 xuio = (xuio_t *)uio;
698 709 else
699 710 uio_prefaultpages(MIN(n, max_blksz), uio);
700 711
701 712 /*
702 713 * If in append mode, set the io offset pointer to eof.
703 714 */
704 715 if (ioflag & FAPPEND) {
705 716 /*
706 717 * Obtain an appending range lock to guarantee file append
707 718 * semantics. We reset the write offset once we have the lock.
708 719 */
709 720 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
710 721 woff = rl->r_off;
711 722 if (rl->r_len == UINT64_MAX) {
712 723 /*
713 724 * We overlocked the file because this write will cause
714 725 * the file block size to increase.
715 726 * Note that zp_size cannot change with this lock held.
716 727 */
717 728 woff = zp->z_size;
718 729 }
719 730 uio->uio_loffset = woff;
720 731 } else {
721 732 /*
722 733 * Note that if the file block size will change as a result of
723 734 * this write, then this range lock will lock the entire file
724 735 * so that we can re-write the block safely.
725 736 */
726 737 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
727 738 }
728 739
729 740 if (woff >= limit) {
730 741 zfs_range_unlock(rl);
731 742 ZFS_EXIT(zfsvfs);
732 743 return (SET_ERROR(EFBIG));
733 744 }
734 745
735 746 if ((woff + n) > limit || woff > (limit - n))
736 747 n = limit - woff;
737 748
738 749 /* Will this write extend the file length? */
739 750 write_eof = (woff + n > zp->z_size);
740 751
741 752 end_size = MAX(zp->z_size, woff + n);
742 753
743 754 /*
744 755 * Write the file in reasonable size chunks. Each chunk is written
745 756 * in a separate transaction; this keeps the intent log records small
746 757 * and allows us to do more fine-grained space accounting.
747 758 */
748 759 while (n > 0) {
749 760 abuf = NULL;
750 761 woff = uio->uio_loffset;
751 762 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
752 763 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
753 764 if (abuf != NULL)
754 765 dmu_return_arcbuf(abuf);
755 766 error = SET_ERROR(EDQUOT);
756 767 break;
757 768 }
758 769
759 770 if (xuio && abuf == NULL) {
760 771 ASSERT(i_iov < iovcnt);
761 772 aiov = &iovp[i_iov];
762 773 abuf = dmu_xuio_arcbuf(xuio, i_iov);
763 774 dmu_xuio_clear(xuio, i_iov);
764 775 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
765 776 iovec_t *, aiov, arc_buf_t *, abuf);
766 777 ASSERT((aiov->iov_base == abuf->b_data) ||
767 778 ((char *)aiov->iov_base - (char *)abuf->b_data +
768 779 aiov->iov_len == arc_buf_size(abuf)));
769 780 i_iov++;
770 781 } else if (abuf == NULL && n >= max_blksz &&
771 782 woff >= zp->z_size &&
772 783 P2PHASE(woff, max_blksz) == 0 &&
773 784 zp->z_blksz == max_blksz) {
774 785 /*
775 786 * This write covers a full block. "Borrow" a buffer
776 787 * from the dmu so that we can fill it before we enter
777 788 * a transaction. This avoids the possibility of
778 789 * holding up the transaction if the data copy hangs
779 790 * up on a pagefault (e.g., from an NFS server mapping).
780 791 */
781 792 size_t cbytes;
782 793
783 794 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
784 795 max_blksz);
785 796 ASSERT(abuf != NULL);
786 797 ASSERT(arc_buf_size(abuf) == max_blksz);
787 798 if (error = uiocopy(abuf->b_data, max_blksz,
788 799 UIO_WRITE, uio, &cbytes)) {
789 800 dmu_return_arcbuf(abuf);
790 801 break;
791 802 }
792 803 ASSERT(cbytes == max_blksz);
793 804 }
794 805
795 806 /*
796 807 * Start a transaction.
797 808 */
798 809 tx = dmu_tx_create(zfsvfs->z_os);
799 810 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
800 811 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
801 812 zfs_sa_upgrade_txholds(tx, zp);
802 813 error = dmu_tx_assign(tx, TXG_WAIT);
803 814 if (error) {
804 815 dmu_tx_abort(tx);
805 816 if (abuf != NULL)
806 817 dmu_return_arcbuf(abuf);
807 818 break;
808 819 }
809 820
810 821 /*
811 822 * If zfs_range_lock() over-locked we grow the blocksize
812 823 * and then reduce the lock range. This will only happen
813 824 * on the first iteration since zfs_range_reduce() will
814 825 * shrink down r_len to the appropriate size.
815 826 */
816 827 if (rl->r_len == UINT64_MAX) {
817 828 uint64_t new_blksz;
818 829
819 830 if (zp->z_blksz > max_blksz) {
820 831 ASSERT(!ISP2(zp->z_blksz));
821 832 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
822 833 } else {
823 834 new_blksz = MIN(end_size, max_blksz);
824 835 }
825 836 zfs_grow_blocksize(zp, new_blksz, tx);
826 837 zfs_range_reduce(rl, woff, n);
827 838 }
828 839
829 840 /*
830 841 * XXX - should we really limit each write to z_max_blksz?
831 842 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
832 843 */
833 844 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
834 845
835 846 if (abuf == NULL) {
836 847 tx_bytes = uio->uio_resid;
837 848 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
838 849 uio, nbytes, tx);
839 850 tx_bytes -= uio->uio_resid;
840 851 } else {
841 852 tx_bytes = nbytes;
842 853 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
843 854 /*
844 855 * If this is not a full block write, but we are
845 856 * extending the file past EOF and this data starts
846 857 * block-aligned, use assign_arcbuf(). Otherwise,
847 858 * write via dmu_write().
848 859 */
849 860 if (tx_bytes < max_blksz && (!write_eof ||
850 861 aiov->iov_base != abuf->b_data)) {
851 862 ASSERT(xuio);
852 863 dmu_write(zfsvfs->z_os, zp->z_id, woff,
853 864 aiov->iov_len, aiov->iov_base, tx);
854 865 dmu_return_arcbuf(abuf);
855 866 xuio_stat_wbuf_copied();
856 867 } else {
857 868 ASSERT(xuio || tx_bytes == max_blksz);
858 869 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
859 870 woff, abuf, tx);
860 871 }
861 872 ASSERT(tx_bytes <= uio->uio_resid);
862 873 uioskip(uio, tx_bytes);
863 874 }
864 875 if (tx_bytes && vn_has_cached_data(vp)) {
865 876 update_pages(vp, woff,
866 877 tx_bytes, zfsvfs->z_os, zp->z_id);
867 878 }
868 879
869 880 /*
870 881 * If we made no progress, we're done. If we made even
871 882 * partial progress, update the znode and ZIL accordingly.
872 883 */
873 884 if (tx_bytes == 0) {
874 885 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
875 886 (void *)&zp->z_size, sizeof (uint64_t), tx);
876 887 dmu_tx_commit(tx);
877 888 ASSERT(error != 0);
878 889 break;
879 890 }
880 891
881 892 /*
882 893 * Clear Set-UID/Set-GID bits on successful write if not
883 894 * privileged and at least one of the execute bits is set.
884 895 *
885 896 * It would be nice to do this after all writes have
886 897 * been done, but that would still expose the ISUID/ISGID
887 898 * to another app after the partial write is committed.
888 899 *
889 900 * Note: we don't call zfs_fuid_map_id() here because
890 901 * user 0 is not an ephemeral uid.
891 902 */
892 903 mutex_enter(&zp->z_acl_lock);
893 904 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
894 905 (S_IXUSR >> 6))) != 0 &&
895 906 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
896 907 secpolicy_vnode_setid_retain(cr,
897 908 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
898 909 uint64_t newmode;
899 910 zp->z_mode &= ~(S_ISUID | S_ISGID);
900 911 newmode = zp->z_mode;
901 912 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
902 913 (void *)&newmode, sizeof (uint64_t), tx);
903 914 }
904 915 mutex_exit(&zp->z_acl_lock);
905 916
906 917 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
907 918 B_TRUE);
908 919
909 920 /*
910 921 * Update the file size (zp_size) if it has changed;
911 922 * account for possible concurrent updates.
912 923 */
913 924 while ((end_size = zp->z_size) < uio->uio_loffset) {
914 925 (void) atomic_cas_64(&zp->z_size, end_size,
915 926 uio->uio_loffset);
916 927 ASSERT(error == 0);
917 928 }
918 929 /*
919 930 * If we are replaying and eof is non zero then force
920 931 * the file size to the specified eof. Note, there's no
921 932 * concurrency during replay.
922 933 */
923 934 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
924 935 zp->z_size = zfsvfs->z_replay_eof;
925 936
926 937 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
927 938
928 939 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
929 940 dmu_tx_commit(tx);
930 941
931 942 if (error != 0)
932 943 break;
933 944 ASSERT(tx_bytes == nbytes);
934 945 n -= nbytes;
935 946
936 947 if (!xuio && n > 0)
937 948 uio_prefaultpages(MIN(n, max_blksz), uio);
938 949 }
939 950
940 951 zfs_range_unlock(rl);
941 952
942 953 /*
943 954 * If we're in replay mode, or we made no progress, return error.
944 955 * Otherwise, it's at least a partial write, so it's successful.
945 956 */
946 957 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
947 958 ZFS_EXIT(zfsvfs);
948 959 return (error);
949 960 }
950 961
951 962 if (ioflag & (FSYNC | FDSYNC) ||
952 963 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
953 964 zil_commit(zilog, zp->z_id);
954 965
955 966 ZFS_EXIT(zfsvfs);
956 967 return (0);
957 968 }
958 969
970 +#define ZFS_RESERVE_CHUNK (2 * 1024 * 1024)
971 +/* ARGSUSED */
972 +static int
973 +zfs_zero_write(vnode_t *vp, uint64_t size, cred_t *cr, caller_context_t *ct)
974 +{
975 + znode_t *zp = VTOZ(vp);
976 + zfsvfs_t *zfsvfs = zp->z_zfsvfs;
977 + int count = 0;
978 + sa_bulk_attr_t bulk[4];
979 + uint64_t mtime[2], ctime[2];
980 + rl_t *rl;
981 + int error = 0;
982 + dmu_tx_t *tx = NULL;
983 + uint64_t end_size;
984 + uint64_t pos = 0;
985 +
986 + if (zp->z_size > 0)
987 + return (SET_ERROR(EFBIG));
988 + if (size == 0)
989 + return (0);
990 +
991 + ZFS_ENTER(zfsvfs);
992 + ZFS_VERIFY_ZP(zp);
993 +
994 + if (!spa_feature_is_enabled(zfsvfs->z_os->os_spa,
995 + SPA_FEATURE_SPACE_RESERVATION))
996 + {
997 + ZFS_EXIT(zfsvfs);
998 + return (SET_ERROR(ENOTSUP));
999 + }
1000 +
1001 + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1002 + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1003 + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1004 + &zp->z_size, 8);
1005 + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1006 + &zp->z_pflags, 8);
1007 +
1008 + /*
1009 + * If the file is immutable or read-only, return EPERM
1010 + */
1011 + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY))) {
1012 + ZFS_EXIT(zfsvfs);
1013 + return (SET_ERROR(EPERM));
1014 + }
1015 +
1016 + rl = zfs_range_lock(zp, 0, size, RL_WRITER);
1017 +
1018 + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1019 + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1020 + error = SET_ERROR(EDQUOT);
1021 + goto out;
1022 + }
1023 +
1024 + while (pos < size) {
1025 + uint64_t length = size - pos;
1026 + length = MIN(length, ZFS_RESERVE_CHUNK);
1027 +again:
1028 + tx = dmu_tx_create(zfsvfs->z_os);
1029 + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1030 + dmu_tx_hold_write(tx, zp->z_id, pos, length);
1031 + zfs_sa_upgrade_txholds(tx, zp);
1032 + error = dmu_tx_assign(tx, TXG_NOWAIT);
1033 + if (error) {
1034 + if (error == ERESTART) {
1035 + dmu_tx_wait(tx);
1036 + dmu_tx_abort(tx);
1037 + goto again;
1038 + }
1039 + dmu_tx_abort(tx);
1040 + goto out;
1041 + }
1042 +
1043 + if (pos == 0)
1044 + zfs_grow_blocksize(zp, MIN(size, zfsvfs->z_max_blksz), tx);
1045 + dmu_write_zero(zfsvfs->z_os, zp->z_id, pos, length, tx);
1046 +
1047 + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
1048 +
1049 + pos += length;
1050 + while ((end_size = zp->z_size) < pos)
1051 + (void) atomic_cas_64(&zp->z_size, end_size, pos);
1052 +
1053 + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1054 +
1055 + dmu_tx_commit(tx);
1056 + if (error)
1057 + goto out;
1058 + }
1059 +out:
1060 + zfs_range_unlock(rl);
1061 + ZFS_EXIT(zfsvfs);
1062 +
1063 + return (error);
1064 +}
1065 +
959 1066 void
960 1067 zfs_get_done(zgd_t *zgd, int error)
961 1068 {
962 1069 znode_t *zp = zgd->zgd_private;
963 1070 objset_t *os = zp->z_zfsvfs->z_os;
964 1071
965 1072 if (zgd->zgd_db)
966 1073 dmu_buf_rele(zgd->zgd_db, zgd);
967 1074
968 1075 zfs_range_unlock(zgd->zgd_rl);
969 1076
970 1077 /*
971 1078 * Release the vnode asynchronously as we currently have the
972 1079 * txg stopped from syncing.
973 1080 */
974 1081 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
975 1082
976 1083 if (error == 0 && zgd->zgd_bp)
977 1084 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
978 1085
979 1086 kmem_free(zgd, sizeof (zgd_t));
980 1087 }
981 1088
982 1089 #ifdef DEBUG
983 1090 static int zil_fault_io = 0;
984 1091 #endif
985 1092
986 1093 /*
987 1094 * Get data to generate a TX_WRITE intent log record.
988 1095 */
989 1096 int
990 1097 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
991 1098 {
992 1099 zfsvfs_t *zfsvfs = arg;
993 1100 objset_t *os = zfsvfs->z_os;
994 1101 znode_t *zp;
995 1102 uint64_t object = lr->lr_foid;
996 1103 uint64_t offset = lr->lr_offset;
997 1104 uint64_t size = lr->lr_length;
998 1105 blkptr_t *bp = &lr->lr_blkptr;
999 1106 dmu_buf_t *db;
1000 1107 zgd_t *zgd;
1001 1108 int error = 0;
1002 1109
1003 1110 ASSERT(zio != NULL);
1004 1111 ASSERT(size != 0);
1005 1112
1006 1113 /*
1007 1114 * Nothing to do if the file has been removed
1008 1115 */
1009 1116 if (zfs_zget(zfsvfs, object, &zp) != 0)
1010 1117 return (SET_ERROR(ENOENT));
1011 1118 if (zp->z_unlinked) {
1012 1119 /*
1013 1120 * Release the vnode asynchronously as we currently have the
1014 1121 * txg stopped from syncing.
1015 1122 */
1016 1123 VN_RELE_ASYNC(ZTOV(zp),
1017 1124 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1018 1125 return (SET_ERROR(ENOENT));
1019 1126 }
1020 1127
1021 1128 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1022 1129 zgd->zgd_zilog = zfsvfs->z_log;
1023 1130 zgd->zgd_private = zp;
1024 1131
1025 1132 /*
1026 1133 * Write records come in two flavors: immediate and indirect.
1027 1134 * For small writes it's cheaper to store the data with the
1028 1135 * log record (immediate); for large writes it's cheaper to
1029 1136 * sync the data and get a pointer to it (indirect) so that
1030 1137 * we don't have to write the data twice.
1031 1138 */
1032 1139 if (buf != NULL) { /* immediate write */
1033 1140 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1034 1141 /* test for truncation needs to be done while range locked */
1035 1142 if (offset >= zp->z_size) {
1036 1143 error = SET_ERROR(ENOENT);
1037 1144 } else {
1038 1145 error = dmu_read(os, object, offset, size, buf,
1039 1146 DMU_READ_NO_PREFETCH);
1040 1147 }
1041 1148 ASSERT(error == 0 || error == ENOENT);
1042 1149 } else { /* indirect write */
1043 1150 /*
1044 1151 * Have to lock the whole block to ensure when it's
1045 1152 * written out and its checksum is being calculated
1046 1153 * that no one can change the data. We need to re-check
1047 1154 * blocksize after we get the lock in case it's changed!
1048 1155 */
1049 1156 for (;;) {
1050 1157 uint64_t blkoff;
1051 1158 size = zp->z_blksz;
1052 1159 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1053 1160 offset -= blkoff;
1054 1161 zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1055 1162 RL_READER);
1056 1163 if (zp->z_blksz == size)
1057 1164 break;
1058 1165 offset += blkoff;
1059 1166 zfs_range_unlock(zgd->zgd_rl);
1060 1167 }
1061 1168 /* test for truncation needs to be done while range locked */
1062 1169 if (lr->lr_offset >= zp->z_size)
1063 1170 error = SET_ERROR(ENOENT);
1064 1171 #ifdef DEBUG
1065 1172 if (zil_fault_io) {
1066 1173 error = SET_ERROR(EIO);
1067 1174 zil_fault_io = 0;
1068 1175 }
1069 1176 #endif
1070 1177 if (error == 0)
1071 1178 error = dmu_buf_hold(os, object, offset, zgd, &db,
1072 1179 DMU_READ_NO_PREFETCH);
1073 1180
1074 1181 if (error == 0) {
1075 1182 blkptr_t *obp = dmu_buf_get_blkptr(db);
1076 1183 if (obp) {
1077 1184 ASSERT(BP_IS_HOLE(bp));
1078 1185 *bp = *obp;
1079 1186 }
1080 1187
1081 1188 zgd->zgd_db = db;
1082 1189 zgd->zgd_bp = bp;
1083 1190
1084 1191 ASSERT(db->db_offset == offset);
1085 1192 ASSERT(db->db_size == size);
1086 1193
1087 1194 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1088 1195 zfs_get_done, zgd);
1089 1196 ASSERT(error || lr->lr_length <= zp->z_blksz);
1090 1197
1091 1198 /*
1092 1199 * On success, we need to wait for the write I/O
1093 1200 * initiated by dmu_sync() to complete before we can
1094 1201 * release this dbuf. We will finish everything up
1095 1202 * in the zfs_get_done() callback.
1096 1203 */
1097 1204 if (error == 0)
1098 1205 return (0);
1099 1206
1100 1207 if (error == EALREADY) {
1101 1208 lr->lr_common.lrc_txtype = TX_WRITE2;
1102 1209 error = 0;
1103 1210 }
1104 1211 }
1105 1212 }
1106 1213
1107 1214 zfs_get_done(zgd, error);
1108 1215
1109 1216 return (error);
1110 1217 }
1111 1218
1112 1219 /*ARGSUSED*/
1113 1220 static int
1114 1221 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1115 1222 caller_context_t *ct)
1116 1223 {
1117 1224 znode_t *zp = VTOZ(vp);
1118 1225 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1119 1226 int error;
1120 1227
1121 1228 ZFS_ENTER(zfsvfs);
1122 1229 ZFS_VERIFY_ZP(zp);
1123 1230
1124 1231 if (flag & V_ACE_MASK)
1125 1232 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1126 1233 else
1127 1234 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1128 1235
1129 1236 ZFS_EXIT(zfsvfs);
1130 1237 return (error);
1131 1238 }
1132 1239
1133 1240 /*
1134 1241 * If vnode is for a device return a specfs vnode instead.
1135 1242 */
1136 1243 static int
1137 1244 specvp_check(vnode_t **vpp, cred_t *cr)
1138 1245 {
1139 1246 int error = 0;
1140 1247
1141 1248 if (IS_DEVVP(*vpp)) {
1142 1249 struct vnode *svp;
1143 1250
1144 1251 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1145 1252 VN_RELE(*vpp);
1146 1253 if (svp == NULL)
1147 1254 error = SET_ERROR(ENOSYS);
1148 1255 *vpp = svp;
1149 1256 }
1150 1257 return (error);
1151 1258 }
1152 1259
1153 1260
1154 1261 /*
1155 1262 * Lookup an entry in a directory, or an extended attribute directory.
1156 1263 * If it exists, return a held vnode reference for it.
1157 1264 *
1158 1265 * IN: dvp - vnode of directory to search.
1159 1266 * nm - name of entry to lookup.
1160 1267 * pnp - full pathname to lookup [UNUSED].
1161 1268 * flags - LOOKUP_XATTR set if looking for an attribute.
1162 1269 * rdir - root directory vnode [UNUSED].
1163 1270 * cr - credentials of caller.
1164 1271 * ct - caller context
1165 1272 * direntflags - directory lookup flags
1166 1273 * realpnp - returned pathname.
1167 1274 *
1168 1275 * OUT: vpp - vnode of located entry, NULL if not found.
1169 1276 *
1170 1277 * RETURN: 0 on success, error code on failure.
1171 1278 *
1172 1279 * Timestamps:
1173 1280 * NA
1174 1281 */
1175 1282 /* ARGSUSED */
1176 1283 static int
1177 1284 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1178 1285 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1179 1286 int *direntflags, pathname_t *realpnp)
1180 1287 {
1181 1288 znode_t *zdp = VTOZ(dvp);
1182 1289 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1183 1290 int error = 0;
1184 1291
1185 1292 /* fast path */
1186 1293 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1187 1294
1188 1295 if (dvp->v_type != VDIR) {
1189 1296 return (SET_ERROR(ENOTDIR));
1190 1297 } else if (zdp->z_sa_hdl == NULL) {
1191 1298 return (SET_ERROR(EIO));
1192 1299 }
1193 1300
1194 1301 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1195 1302 error = zfs_fastaccesschk_execute(zdp, cr);
1196 1303 if (!error) {
1197 1304 *vpp = dvp;
1198 1305 VN_HOLD(*vpp);
1199 1306 return (0);
1200 1307 }
1201 1308 return (error);
1202 1309 } else {
1203 1310 vnode_t *tvp = dnlc_lookup(dvp, nm);
1204 1311
1205 1312 if (tvp) {
1206 1313 error = zfs_fastaccesschk_execute(zdp, cr);
1207 1314 if (error) {
1208 1315 VN_RELE(tvp);
1209 1316 return (error);
1210 1317 }
1211 1318 if (tvp == DNLC_NO_VNODE) {
1212 1319 VN_RELE(tvp);
1213 1320 return (SET_ERROR(ENOENT));
1214 1321 } else {
1215 1322 *vpp = tvp;
1216 1323 return (specvp_check(vpp, cr));
1217 1324 }
1218 1325 }
1219 1326 }
1220 1327 }
1221 1328
1222 1329 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1223 1330
1224 1331 ZFS_ENTER(zfsvfs);
1225 1332 ZFS_VERIFY_ZP(zdp);
1226 1333
1227 1334 *vpp = NULL;
1228 1335
1229 1336 if (flags & LOOKUP_XATTR) {
1230 1337 /*
1231 1338 * If the xattr property is off, refuse the lookup request.
1232 1339 */
1233 1340 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1234 1341 ZFS_EXIT(zfsvfs);
1235 1342 return (SET_ERROR(EINVAL));
1236 1343 }
1237 1344
1238 1345 /*
1239 1346 * We don't allow recursive attributes..
1240 1347 * Maybe someday we will.
1241 1348 */
1242 1349 if (zdp->z_pflags & ZFS_XATTR) {
1243 1350 ZFS_EXIT(zfsvfs);
1244 1351 return (SET_ERROR(EINVAL));
1245 1352 }
1246 1353
1247 1354 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1248 1355 ZFS_EXIT(zfsvfs);
1249 1356 return (error);
1250 1357 }
1251 1358
1252 1359 /*
1253 1360 * Do we have permission to get into attribute directory?
1254 1361 */
1255 1362
1256 1363 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1257 1364 B_FALSE, cr)) {
1258 1365 VN_RELE(*vpp);
1259 1366 *vpp = NULL;
1260 1367 }
1261 1368
1262 1369 ZFS_EXIT(zfsvfs);
1263 1370 return (error);
1264 1371 }
1265 1372
1266 1373 if (dvp->v_type != VDIR) {
1267 1374 ZFS_EXIT(zfsvfs);
1268 1375 return (SET_ERROR(ENOTDIR));
1269 1376 }
1270 1377
1271 1378 /*
1272 1379 * Check accessibility of directory.
1273 1380 */
1274 1381
1275 1382 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1276 1383 ZFS_EXIT(zfsvfs);
1277 1384 return (error);
1278 1385 }
1279 1386
1280 1387 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1281 1388 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1282 1389 ZFS_EXIT(zfsvfs);
1283 1390 return (SET_ERROR(EILSEQ));
1284 1391 }
1285 1392
1286 1393 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1287 1394 if (error == 0)
1288 1395 error = specvp_check(vpp, cr);
1289 1396
1290 1397 ZFS_EXIT(zfsvfs);
1291 1398 return (error);
1292 1399 }
1293 1400
1294 1401 /*
1295 1402 * Attempt to create a new entry in a directory. If the entry
1296 1403 * already exists, truncate the file if permissible, else return
1297 1404 * an error. Return the vp of the created or trunc'd file.
1298 1405 *
1299 1406 * IN: dvp - vnode of directory to put new file entry in.
1300 1407 * name - name of new file entry.
1301 1408 * vap - attributes of new file.
1302 1409 * excl - flag indicating exclusive or non-exclusive mode.
1303 1410 * mode - mode to open file with.
1304 1411 * cr - credentials of caller.
1305 1412 * flag - large file flag [UNUSED].
1306 1413 * ct - caller context
1307 1414 * vsecp - ACL to be set
1308 1415 *
1309 1416 * OUT: vpp - vnode of created or trunc'd entry.
1310 1417 *
1311 1418 * RETURN: 0 on success, error code on failure.
1312 1419 *
1313 1420 * Timestamps:
1314 1421 * dvp - ctime|mtime updated if new entry created
1315 1422 * vp - ctime|mtime always, atime if new
1316 1423 */
1317 1424
1318 1425 /* ARGSUSED */
1319 1426 static int
1320 1427 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1321 1428 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1322 1429 vsecattr_t *vsecp)
1323 1430 {
1324 1431 znode_t *zp, *dzp = VTOZ(dvp);
1325 1432 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1326 1433 zilog_t *zilog;
1327 1434 objset_t *os;
1328 1435 zfs_dirlock_t *dl;
1329 1436 dmu_tx_t *tx;
1330 1437 int error;
1331 1438 ksid_t *ksid;
1332 1439 uid_t uid;
1333 1440 gid_t gid = crgetgid(cr);
1334 1441 zfs_acl_ids_t acl_ids;
1335 1442 boolean_t fuid_dirtied;
1336 1443 boolean_t have_acl = B_FALSE;
1337 1444 boolean_t waited = B_FALSE;
1338 1445
1339 1446 /*
1340 1447 * If we have an ephemeral id, ACL, or XVATTR then
1341 1448 * make sure file system is at proper version
1342 1449 */
1343 1450
1344 1451 ksid = crgetsid(cr, KSID_OWNER);
1345 1452 if (ksid)
1346 1453 uid = ksid_getid(ksid);
1347 1454 else
1348 1455 uid = crgetuid(cr);
1349 1456
1350 1457 if (zfsvfs->z_use_fuids == B_FALSE &&
1351 1458 (vsecp || (vap->va_mask & AT_XVATTR) ||
1352 1459 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1353 1460 return (SET_ERROR(EINVAL));
1354 1461
1355 1462 ZFS_ENTER(zfsvfs);
1356 1463 ZFS_VERIFY_ZP(dzp);
1357 1464 os = zfsvfs->z_os;
1358 1465 zilog = zfsvfs->z_log;
1359 1466
1360 1467 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1361 1468 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1362 1469 ZFS_EXIT(zfsvfs);
1363 1470 return (SET_ERROR(EILSEQ));
1364 1471 }
1365 1472
1366 1473 if (vap->va_mask & AT_XVATTR) {
1367 1474 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1368 1475 crgetuid(cr), cr, vap->va_type)) != 0) {
1369 1476 ZFS_EXIT(zfsvfs);
1370 1477 return (error);
1371 1478 }
1372 1479 }
1373 1480 top:
1374 1481 *vpp = NULL;
1375 1482
1376 1483 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1377 1484 vap->va_mode &= ~VSVTX;
1378 1485
1379 1486 if (*name == '\0') {
1380 1487 /*
1381 1488 * Null component name refers to the directory itself.
1382 1489 */
1383 1490 VN_HOLD(dvp);
1384 1491 zp = dzp;
1385 1492 dl = NULL;
1386 1493 error = 0;
1387 1494 } else {
1388 1495 /* possible VN_HOLD(zp) */
1389 1496 int zflg = 0;
1390 1497
1391 1498 if (flag & FIGNORECASE)
1392 1499 zflg |= ZCILOOK;
1393 1500
1394 1501 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1395 1502 NULL, NULL);
1396 1503 if (error) {
1397 1504 if (have_acl)
1398 1505 zfs_acl_ids_free(&acl_ids);
1399 1506 if (strcmp(name, "..") == 0)
1400 1507 error = SET_ERROR(EISDIR);
1401 1508 ZFS_EXIT(zfsvfs);
1402 1509 return (error);
1403 1510 }
1404 1511 }
1405 1512
1406 1513 if (zp == NULL) {
1407 1514 uint64_t txtype;
1408 1515
1409 1516 /*
1410 1517 * Create a new file object and update the directory
1411 1518 * to reference it.
1412 1519 */
1413 1520 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1414 1521 if (have_acl)
1415 1522 zfs_acl_ids_free(&acl_ids);
1416 1523 goto out;
1417 1524 }
1418 1525
1419 1526 /*
1420 1527 * We only support the creation of regular files in
1421 1528 * extended attribute directories.
1422 1529 */
1423 1530
1424 1531 if ((dzp->z_pflags & ZFS_XATTR) &&
1425 1532 (vap->va_type != VREG)) {
1426 1533 if (have_acl)
1427 1534 zfs_acl_ids_free(&acl_ids);
1428 1535 error = SET_ERROR(EINVAL);
1429 1536 goto out;
1430 1537 }
1431 1538
1432 1539 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1433 1540 cr, vsecp, &acl_ids)) != 0)
1434 1541 goto out;
1435 1542 have_acl = B_TRUE;
1436 1543
1437 1544 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1438 1545 zfs_acl_ids_free(&acl_ids);
1439 1546 error = SET_ERROR(EDQUOT);
1440 1547 goto out;
1441 1548 }
1442 1549
1443 1550 tx = dmu_tx_create(os);
1444 1551
1445 1552 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1446 1553 ZFS_SA_BASE_ATTR_SIZE);
1447 1554
1448 1555 fuid_dirtied = zfsvfs->z_fuid_dirty;
1449 1556 if (fuid_dirtied)
1450 1557 zfs_fuid_txhold(zfsvfs, tx);
1451 1558 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1452 1559 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1453 1560 if (!zfsvfs->z_use_sa &&
1454 1561 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1455 1562 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1456 1563 0, acl_ids.z_aclp->z_acl_bytes);
1457 1564 }
1458 1565 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1459 1566 if (error) {
1460 1567 zfs_dirent_unlock(dl);
1461 1568 if (error == ERESTART) {
1462 1569 waited = B_TRUE;
1463 1570 dmu_tx_wait(tx);
1464 1571 dmu_tx_abort(tx);
1465 1572 goto top;
1466 1573 }
1467 1574 zfs_acl_ids_free(&acl_ids);
1468 1575 dmu_tx_abort(tx);
1469 1576 ZFS_EXIT(zfsvfs);
1470 1577 return (error);
1471 1578 }
1472 1579 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1473 1580
1474 1581 if (fuid_dirtied)
1475 1582 zfs_fuid_sync(zfsvfs, tx);
1476 1583
1477 1584 (void) zfs_link_create(dl, zp, tx, ZNEW);
1478 1585 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1479 1586 if (flag & FIGNORECASE)
1480 1587 txtype |= TX_CI;
1481 1588 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1482 1589 vsecp, acl_ids.z_fuidp, vap);
1483 1590 zfs_acl_ids_free(&acl_ids);
1484 1591 dmu_tx_commit(tx);
1485 1592 } else {
1486 1593 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1487 1594
1488 1595 if (have_acl)
1489 1596 zfs_acl_ids_free(&acl_ids);
1490 1597 have_acl = B_FALSE;
1491 1598
1492 1599 /*
1493 1600 * A directory entry already exists for this name.
1494 1601 */
1495 1602 /*
1496 1603 * Can't truncate an existing file if in exclusive mode.
1497 1604 */
1498 1605 if (excl == EXCL) {
1499 1606 error = SET_ERROR(EEXIST);
1500 1607 goto out;
1501 1608 }
1502 1609 /*
1503 1610 * Can't open a directory for writing.
1504 1611 */
1505 1612 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1506 1613 error = SET_ERROR(EISDIR);
1507 1614 goto out;
1508 1615 }
1509 1616 /*
1510 1617 * Verify requested access to file.
1511 1618 */
1512 1619 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1513 1620 goto out;
1514 1621 }
1515 1622
1516 1623 mutex_enter(&dzp->z_lock);
1517 1624 dzp->z_seq++;
1518 1625 mutex_exit(&dzp->z_lock);
1519 1626
1520 1627 /*
1521 1628 * Truncate regular files if requested.
1522 1629 */
1523 1630 if ((ZTOV(zp)->v_type == VREG) &&
1524 1631 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1525 1632 /* we can't hold any locks when calling zfs_freesp() */
1526 1633 zfs_dirent_unlock(dl);
1527 1634 dl = NULL;
1528 1635 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1529 1636 if (error == 0) {
1530 1637 vnevent_create(ZTOV(zp), ct);
1531 1638 }
1532 1639 }
1533 1640 }
1534 1641 out:
1535 1642
1536 1643 if (dl)
1537 1644 zfs_dirent_unlock(dl);
1538 1645
1539 1646 if (error) {
1540 1647 if (zp)
1541 1648 VN_RELE(ZTOV(zp));
1542 1649 } else {
1543 1650 *vpp = ZTOV(zp);
1544 1651 error = specvp_check(vpp, cr);
1545 1652 }
1546 1653
1547 1654 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1548 1655 zil_commit(zilog, 0);
1549 1656
1550 1657 ZFS_EXIT(zfsvfs);
1551 1658 return (error);
1552 1659 }
1553 1660
1554 1661 /*
1555 1662 * Remove an entry from a directory.
1556 1663 *
1557 1664 * IN: dvp - vnode of directory to remove entry from.
1558 1665 * name - name of entry to remove.
1559 1666 * cr - credentials of caller.
1560 1667 * ct - caller context
1561 1668 * flags - case flags
1562 1669 *
1563 1670 * RETURN: 0 on success, error code on failure.
1564 1671 *
1565 1672 * Timestamps:
1566 1673 * dvp - ctime|mtime
1567 1674 * vp - ctime (if nlink > 0)
1568 1675 */
1569 1676
1570 1677 uint64_t null_xattr = 0;
1571 1678
1572 1679 /*ARGSUSED*/
1573 1680 static int
1574 1681 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1575 1682 int flags)
1576 1683 {
1577 1684 znode_t *zp, *dzp = VTOZ(dvp);
1578 1685 znode_t *xzp;
1579 1686 vnode_t *vp;
1580 1687 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1581 1688 zilog_t *zilog;
1582 1689 uint64_t acl_obj, xattr_obj;
1583 1690 uint64_t xattr_obj_unlinked = 0;
1584 1691 uint64_t obj = 0;
1585 1692 zfs_dirlock_t *dl;
1586 1693 dmu_tx_t *tx;
1587 1694 boolean_t may_delete_now, delete_now = FALSE;
1588 1695 boolean_t unlinked, toobig = FALSE;
1589 1696 uint64_t txtype;
1590 1697 pathname_t *realnmp = NULL;
1591 1698 pathname_t realnm;
1592 1699 int error;
1593 1700 int zflg = ZEXISTS;
1594 1701 boolean_t waited = B_FALSE;
1595 1702
1596 1703 ZFS_ENTER(zfsvfs);
1597 1704 ZFS_VERIFY_ZP(dzp);
1598 1705 zilog = zfsvfs->z_log;
1599 1706
1600 1707 if (flags & FIGNORECASE) {
1601 1708 zflg |= ZCILOOK;
1602 1709 pn_alloc(&realnm);
1603 1710 realnmp = &realnm;
1604 1711 }
1605 1712
1606 1713 top:
1607 1714 xattr_obj = 0;
1608 1715 xzp = NULL;
1609 1716 /*
1610 1717 * Attempt to lock directory; fail if entry doesn't exist.
1611 1718 */
1612 1719 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1613 1720 NULL, realnmp)) {
1614 1721 if (realnmp)
1615 1722 pn_free(realnmp);
1616 1723 ZFS_EXIT(zfsvfs);
1617 1724 return (error);
1618 1725 }
1619 1726
1620 1727 vp = ZTOV(zp);
1621 1728
1622 1729 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1623 1730 goto out;
1624 1731 }
1625 1732
1626 1733 /*
1627 1734 * Need to use rmdir for removing directories.
1628 1735 */
1629 1736 if (vp->v_type == VDIR) {
1630 1737 error = SET_ERROR(EPERM);
1631 1738 goto out;
1632 1739 }
1633 1740
1634 1741 vnevent_remove(vp, dvp, name, ct);
1635 1742
1636 1743 if (realnmp)
1637 1744 dnlc_remove(dvp, realnmp->pn_buf);
1638 1745 else
1639 1746 dnlc_remove(dvp, name);
1640 1747
1641 1748 mutex_enter(&vp->v_lock);
1642 1749 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1643 1750 mutex_exit(&vp->v_lock);
1644 1751
1645 1752 /*
1646 1753 * We may delete the znode now, or we may put it in the unlinked set;
1647 1754 * it depends on whether we're the last link, and on whether there are
1648 1755 * other holds on the vnode. So we dmu_tx_hold() the right things to
1649 1756 * allow for either case.
1650 1757 */
1651 1758 obj = zp->z_id;
1652 1759 tx = dmu_tx_create(zfsvfs->z_os);
1653 1760 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1654 1761 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1655 1762 zfs_sa_upgrade_txholds(tx, zp);
1656 1763 zfs_sa_upgrade_txholds(tx, dzp);
1657 1764 if (may_delete_now) {
1658 1765 toobig =
1659 1766 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1660 1767 /* if the file is too big, only hold_free a token amount */
1661 1768 dmu_tx_hold_free(tx, zp->z_id, 0,
1662 1769 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1663 1770 }
1664 1771
1665 1772 /* are there any extended attributes? */
1666 1773 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1667 1774 &xattr_obj, sizeof (xattr_obj));
1668 1775 if (error == 0 && xattr_obj) {
1669 1776 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1670 1777 ASSERT0(error);
1671 1778 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1672 1779 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1673 1780 }
1674 1781
1675 1782 mutex_enter(&zp->z_lock);
1676 1783 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1677 1784 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1678 1785 mutex_exit(&zp->z_lock);
1679 1786
1680 1787 /* charge as an update -- would be nice not to charge at all */
1681 1788 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1682 1789
1683 1790 /*
1684 1791 * Mark this transaction as typically resulting in a net free of
1685 1792 * space, unless object removal will be delayed indefinitely
1686 1793 * (due to active holds on the vnode due to the file being open).
1687 1794 */
1688 1795 if (may_delete_now)
1689 1796 dmu_tx_mark_netfree(tx);
1690 1797
1691 1798 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1692 1799 if (error) {
1693 1800 zfs_dirent_unlock(dl);
1694 1801 VN_RELE(vp);
1695 1802 if (xzp)
1696 1803 VN_RELE(ZTOV(xzp));
1697 1804 if (error == ERESTART) {
1698 1805 waited = B_TRUE;
1699 1806 dmu_tx_wait(tx);
1700 1807 dmu_tx_abort(tx);
1701 1808 goto top;
1702 1809 }
1703 1810 if (realnmp)
1704 1811 pn_free(realnmp);
1705 1812 dmu_tx_abort(tx);
1706 1813 ZFS_EXIT(zfsvfs);
1707 1814 return (error);
1708 1815 }
1709 1816
1710 1817 /*
1711 1818 * Remove the directory entry.
1712 1819 */
1713 1820 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1714 1821
1715 1822 if (error) {
1716 1823 dmu_tx_commit(tx);
1717 1824 goto out;
1718 1825 }
1719 1826
1720 1827 if (unlinked) {
1721 1828 /*
1722 1829 * Hold z_lock so that we can make sure that the ACL obj
1723 1830 * hasn't changed. Could have been deleted due to
1724 1831 * zfs_sa_upgrade().
1725 1832 */
1726 1833 mutex_enter(&zp->z_lock);
1727 1834 mutex_enter(&vp->v_lock);
1728 1835 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1729 1836 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1730 1837 delete_now = may_delete_now && !toobig &&
1731 1838 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1732 1839 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1733 1840 acl_obj;
1734 1841 mutex_exit(&vp->v_lock);
1735 1842 }
1736 1843
1737 1844 if (delete_now) {
1738 1845 if (xattr_obj_unlinked) {
1739 1846 ASSERT3U(xzp->z_links, ==, 2);
1740 1847 mutex_enter(&xzp->z_lock);
1741 1848 xzp->z_unlinked = 1;
1742 1849 xzp->z_links = 0;
1743 1850 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1744 1851 &xzp->z_links, sizeof (xzp->z_links), tx);
1745 1852 ASSERT3U(error, ==, 0);
1746 1853 mutex_exit(&xzp->z_lock);
1747 1854 zfs_unlinked_add(xzp, tx);
1748 1855
1749 1856 if (zp->z_is_sa)
1750 1857 error = sa_remove(zp->z_sa_hdl,
1751 1858 SA_ZPL_XATTR(zfsvfs), tx);
1752 1859 else
1753 1860 error = sa_update(zp->z_sa_hdl,
1754 1861 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1755 1862 sizeof (uint64_t), tx);
1756 1863 ASSERT0(error);
1757 1864 }
1758 1865 mutex_enter(&vp->v_lock);
1759 1866 vp->v_count--;
1760 1867 ASSERT0(vp->v_count);
1761 1868 mutex_exit(&vp->v_lock);
1762 1869 mutex_exit(&zp->z_lock);
1763 1870 zfs_znode_delete(zp, tx);
1764 1871 } else if (unlinked) {
1765 1872 mutex_exit(&zp->z_lock);
1766 1873 zfs_unlinked_add(zp, tx);
1767 1874 }
1768 1875
1769 1876 txtype = TX_REMOVE;
1770 1877 if (flags & FIGNORECASE)
1771 1878 txtype |= TX_CI;
1772 1879 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1773 1880
1774 1881 dmu_tx_commit(tx);
1775 1882 out:
1776 1883 if (realnmp)
1777 1884 pn_free(realnmp);
1778 1885
1779 1886 zfs_dirent_unlock(dl);
1780 1887
1781 1888 if (!delete_now)
1782 1889 VN_RELE(vp);
1783 1890 if (xzp)
1784 1891 VN_RELE(ZTOV(xzp));
1785 1892
1786 1893 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1787 1894 zil_commit(zilog, 0);
1788 1895
1789 1896 ZFS_EXIT(zfsvfs);
1790 1897 return (error);
1791 1898 }
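
The tail of zfs_remove() above shows the transaction-retry shape that recurs throughout this file (zfs_mkdir(), zfs_rmdir(), zfs_setattr()): assign the tx without waiting, and on ERESTART drop the locks, wait for the next txg, abort the tx, and restart from the top label. Below is a minimal user-space sketch of that control flow only; try_assign() and wait_for_txg() are hypothetical stand-ins for dmu_tx_assign()/dmu_tx_wait(), not the real DMU API.

	/*
	 * Illustrative sketch only: ERESTART_SIM, try_assign() and
	 * wait_for_txg() are invented stand-ins for the kernel calls.
	 */
	#include <stdio.h>
	#include <stdbool.h>

	#define	ERESTART_SIM	1	/* stand-in for ERESTART */

	static int attempts;

	/* Pretend the first attempt hits a full transaction group. */
	static int
	try_assign(bool waited)
	{
		return ((attempts++ == 0 && !waited) ? ERESTART_SIM : 0);
	}

	static void
	wait_for_txg(void)
	{
		/* In the kernel this is dmu_tx_wait(tx). */
	}

	int
	main(void)
	{
		bool waited = false;
		int error;

	top:
		/* ... take locks, build up the tx holds ... */
		error = try_assign(waited);
		if (error != 0) {
			/* drop locks before waiting, as zfs_remove() does */
			if (error == ERESTART_SIM) {
				waited = true;
				wait_for_txg();
				goto top;
			}
			return (error);
		}
		/* ... do the work, commit ... */
		printf("assigned after %d attempt(s)\n", attempts);
		return (0);
	}
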
1792 1899
1793 1900 /*
1794 1901 * Create a new directory and insert it into dvp using the name
1795 1902 * provided. Return a pointer to the inserted directory.
1796 1903 *
1797 1904 * IN: dvp - vnode of directory to add subdir to.
1798 1905 * dirname - name of new directory.
1799 1906 * vap - attributes of new directory.
1800 1907 * cr - credentials of caller.
1801 1908 * ct - caller context
1802 1909 * flags - case flags
1803 1910 * vsecp - ACL to be set
1804 1911 *
1805 1912 * OUT: vpp - vnode of created directory.
1806 1913 *
1807 1914 * RETURN: 0 on success, error code on failure.
1808 1915 *
1809 1916 * Timestamps:
1810 1917 * dvp - ctime|mtime updated
1811 1918 * vp - ctime|mtime|atime updated
1812 1919 */
1813 1920 /*ARGSUSED*/
1814 1921 static int
1815 1922 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1816 1923 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1817 1924 {
1818 1925 znode_t *zp, *dzp = VTOZ(dvp);
1819 1926 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1820 1927 zilog_t *zilog;
1821 1928 zfs_dirlock_t *dl;
1822 1929 uint64_t txtype;
1823 1930 dmu_tx_t *tx;
1824 1931 int error;
1825 1932 int zf = ZNEW;
1826 1933 ksid_t *ksid;
1827 1934 uid_t uid;
1828 1935 gid_t gid = crgetgid(cr);
1829 1936 zfs_acl_ids_t acl_ids;
1830 1937 boolean_t fuid_dirtied;
1831 1938 boolean_t waited = B_FALSE;
1832 1939
1833 1940 ASSERT(vap->va_type == VDIR);
1834 1941
1835 1942 /*
1836 1943 * If we have an ephemeral id, ACL, or XVATTR then
1837 1944 * make sure file system is at proper version
1838 1945 */
1839 1946
1840 1947 ksid = crgetsid(cr, KSID_OWNER);
1841 1948 if (ksid)
1842 1949 uid = ksid_getid(ksid);
1843 1950 else
1844 1951 uid = crgetuid(cr);
1845 1952 if (zfsvfs->z_use_fuids == B_FALSE &&
1846 1953 (vsecp || (vap->va_mask & AT_XVATTR) ||
1847 1954 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1848 1955 return (SET_ERROR(EINVAL));
1849 1956
1850 1957 ZFS_ENTER(zfsvfs);
1851 1958 ZFS_VERIFY_ZP(dzp);
1852 1959 zilog = zfsvfs->z_log;
1853 1960
1854 1961 if (dzp->z_pflags & ZFS_XATTR) {
1855 1962 ZFS_EXIT(zfsvfs);
1856 1963 return (SET_ERROR(EINVAL));
1857 1964 }
1858 1965
1859 1966 if (zfsvfs->z_utf8 && u8_validate(dirname,
1860 1967 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1861 1968 ZFS_EXIT(zfsvfs);
1862 1969 return (SET_ERROR(EILSEQ));
1863 1970 }
1864 1971 if (flags & FIGNORECASE)
1865 1972 zf |= ZCILOOK;
1866 1973
1867 1974 if (vap->va_mask & AT_XVATTR) {
1868 1975 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1869 1976 crgetuid(cr), cr, vap->va_type)) != 0) {
1870 1977 ZFS_EXIT(zfsvfs);
1871 1978 return (error);
1872 1979 }
1873 1980 }
1874 1981
1875 1982 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1876 1983 vsecp, &acl_ids)) != 0) {
1877 1984 ZFS_EXIT(zfsvfs);
1878 1985 return (error);
1879 1986 }
1880 1987 /*
1881 1988 * First make sure the new directory doesn't exist.
1882 1989 *
1883 1990 * Existence is checked first to make sure we don't return
1884 1991 * EACCES instead of EEXIST which can cause some applications
1885 1992 * to fail.
1886 1993 */
1887 1994 top:
1888 1995 *vpp = NULL;
1889 1996
1890 1997 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1891 1998 NULL, NULL)) {
1892 1999 zfs_acl_ids_free(&acl_ids);
1893 2000 ZFS_EXIT(zfsvfs);
1894 2001 return (error);
1895 2002 }
1896 2003
1897 2004 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1898 2005 zfs_acl_ids_free(&acl_ids);
1899 2006 zfs_dirent_unlock(dl);
1900 2007 ZFS_EXIT(zfsvfs);
1901 2008 return (error);
1902 2009 }
1903 2010
1904 2011 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1905 2012 zfs_acl_ids_free(&acl_ids);
1906 2013 zfs_dirent_unlock(dl);
1907 2014 ZFS_EXIT(zfsvfs);
1908 2015 return (SET_ERROR(EDQUOT));
1909 2016 }
1910 2017
1911 2018 /*
1912 2019 * Add a new entry to the directory.
1913 2020 */
1914 2021 tx = dmu_tx_create(zfsvfs->z_os);
1915 2022 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1916 2023 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1917 2024 fuid_dirtied = zfsvfs->z_fuid_dirty;
1918 2025 if (fuid_dirtied)
1919 2026 zfs_fuid_txhold(zfsvfs, tx);
1920 2027 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1921 2028 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1922 2029 acl_ids.z_aclp->z_acl_bytes);
1923 2030 }
1924 2031
1925 2032 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1926 2033 ZFS_SA_BASE_ATTR_SIZE);
1927 2034
1928 2035 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1929 2036 if (error) {
1930 2037 zfs_dirent_unlock(dl);
1931 2038 if (error == ERESTART) {
1932 2039 waited = B_TRUE;
1933 2040 dmu_tx_wait(tx);
1934 2041 dmu_tx_abort(tx);
1935 2042 goto top;
1936 2043 }
1937 2044 zfs_acl_ids_free(&acl_ids);
1938 2045 dmu_tx_abort(tx);
1939 2046 ZFS_EXIT(zfsvfs);
1940 2047 return (error);
1941 2048 }
1942 2049
1943 2050 /*
1944 2051 * Create new node.
1945 2052 */
1946 2053 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1947 2054
1948 2055 if (fuid_dirtied)
1949 2056 zfs_fuid_sync(zfsvfs, tx);
1950 2057
1951 2058 /*
1952 2059 * Now put new name in parent dir.
1953 2060 */
1954 2061 (void) zfs_link_create(dl, zp, tx, ZNEW);
1955 2062
1956 2063 *vpp = ZTOV(zp);
1957 2064
1958 2065 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1959 2066 if (flags & FIGNORECASE)
1960 2067 txtype |= TX_CI;
1961 2068 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1962 2069 acl_ids.z_fuidp, vap);
1963 2070
1964 2071 zfs_acl_ids_free(&acl_ids);
1965 2072
1966 2073 dmu_tx_commit(tx);
1967 2074
1968 2075 zfs_dirent_unlock(dl);
1969 2076
1970 2077 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1971 2078 zil_commit(zilog, 0);
1972 2079
1973 2080 ZFS_EXIT(zfsvfs);
1974 2081 return (0);
1975 2082 }
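
The comment near the top of zfs_mkdir() explains why existence is checked before access: callers should see EEXIST rather than EACCES when the entry is already there. A small, hedged user-space demonstration of that error precedence (the /tmp paths are made up for illustration, and when run as root the access check would not trigger anyway):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>

	int
	main(void)
	{
		/* Hypothetical scratch paths; create, then drop write access. */
		(void) mkdir("/tmp/demo.d", 0755);
		(void) mkdir("/tmp/demo.d/sub", 0755);
		(void) chmod("/tmp/demo.d", 0555);	/* parent now read-only */

		/* Recreating an existing entry should report EEXIST, not EACCES. */
		if (mkdir("/tmp/demo.d/sub", 0755) == -1)
			printf("mkdir: %s\n", strerror(errno));

		(void) chmod("/tmp/demo.d", 0755);	/* restore for cleanup */
		return (0);
	}
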
1976 2083
1977 2084 /*
1978 2085 * Remove a directory subdir entry. If the current working
1979 2086 * directory is the same as the subdir to be removed, the
1980 2087 * remove will fail.
1981 2088 *
1982 2089 * IN: dvp - vnode of directory to remove from.
1983 2090 * name - name of directory to be removed.
1984 2091 * cwd - vnode of current working directory.
1985 2092 * cr - credentials of caller.
1986 2093 * ct - caller context
1987 2094 * flags - case flags
1988 2095 *
1989 2096 * RETURN: 0 on success, error code on failure.
1990 2097 *
1991 2098 * Timestamps:
1992 2099 * dvp - ctime|mtime updated
1993 2100 */
1994 2101 /*ARGSUSED*/
1995 2102 static int
1996 2103 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1997 2104 caller_context_t *ct, int flags)
1998 2105 {
1999 2106 znode_t *dzp = VTOZ(dvp);
2000 2107 znode_t *zp;
2001 2108 vnode_t *vp;
2002 2109 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2003 2110 zilog_t *zilog;
2004 2111 zfs_dirlock_t *dl;
2005 2112 dmu_tx_t *tx;
2006 2113 int error;
2007 2114 int zflg = ZEXISTS;
2008 2115 boolean_t waited = B_FALSE;
2009 2116
2010 2117 ZFS_ENTER(zfsvfs);
2011 2118 ZFS_VERIFY_ZP(dzp);
2012 2119 zilog = zfsvfs->z_log;
2013 2120
2014 2121 if (flags & FIGNORECASE)
2015 2122 zflg |= ZCILOOK;
2016 2123 top:
2017 2124 zp = NULL;
2018 2125
2019 2126 /*
2020 2127 * Attempt to lock directory; fail if entry doesn't exist.
2021 2128 */
2022 2129 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2023 2130 NULL, NULL)) {
2024 2131 ZFS_EXIT(zfsvfs);
2025 2132 return (error);
2026 2133 }
2027 2134
2028 2135 vp = ZTOV(zp);
2029 2136
2030 2137 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2031 2138 goto out;
2032 2139 }
2033 2140
2034 2141 if (vp->v_type != VDIR) {
2035 2142 error = SET_ERROR(ENOTDIR);
2036 2143 goto out;
2037 2144 }
2038 2145
2039 2146 if (vp == cwd) {
2040 2147 error = SET_ERROR(EINVAL);
2041 2148 goto out;
2042 2149 }
2043 2150
2044 2151 vnevent_rmdir(vp, dvp, name, ct);
2045 2152
2046 2153 /*
2047 2154 	 * Grab a lock on the directory to make sure that no one is
2048 2155 * trying to add (or lookup) entries while we are removing it.
2049 2156 */
2050 2157 rw_enter(&zp->z_name_lock, RW_WRITER);
2051 2158
2052 2159 /*
2053 2160 * Grab a lock on the parent pointer to make sure we play well
2054 2161 * with the treewalk and directory rename code.
2055 2162 */
2056 2163 rw_enter(&zp->z_parent_lock, RW_WRITER);
2057 2164
2058 2165 tx = dmu_tx_create(zfsvfs->z_os);
2059 2166 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2060 2167 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2061 2168 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2062 2169 zfs_sa_upgrade_txholds(tx, zp);
2063 2170 zfs_sa_upgrade_txholds(tx, dzp);
2064 2171 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2065 2172 if (error) {
2066 2173 rw_exit(&zp->z_parent_lock);
2067 2174 rw_exit(&zp->z_name_lock);
2068 2175 zfs_dirent_unlock(dl);
2069 2176 VN_RELE(vp);
2070 2177 if (error == ERESTART) {
2071 2178 waited = B_TRUE;
2072 2179 dmu_tx_wait(tx);
2073 2180 dmu_tx_abort(tx);
2074 2181 goto top;
2075 2182 }
2076 2183 dmu_tx_abort(tx);
2077 2184 ZFS_EXIT(zfsvfs);
2078 2185 return (error);
2079 2186 }
2080 2187
2081 2188 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2082 2189
2083 2190 if (error == 0) {
2084 2191 uint64_t txtype = TX_RMDIR;
2085 2192 if (flags & FIGNORECASE)
2086 2193 txtype |= TX_CI;
2087 2194 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2088 2195 }
2089 2196
2090 2197 dmu_tx_commit(tx);
2091 2198
2092 2199 rw_exit(&zp->z_parent_lock);
2093 2200 rw_exit(&zp->z_name_lock);
2094 2201 out:
2095 2202 zfs_dirent_unlock(dl);
2096 2203
2097 2204 VN_RELE(vp);
2098 2205
2099 2206 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2100 2207 zil_commit(zilog, 0);
2101 2208
2102 2209 ZFS_EXIT(zfsvfs);
2103 2210 return (error);
2104 2211 }
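
As the zfs_rmdir() comment states, removing the current working directory fails; the vp == cwd check above returns EINVAL. A minimal sketch from user space, assuming a scratch path under /tmp; other kernels and filesystems may report a different errno or even allow the removal:

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int
	main(void)
	{
		/* Hypothetical scratch path. */
		if (mkdir("/tmp/rmdir_cwd_demo", 0755) == -1 && errno != EEXIST) {
			perror("mkdir");
			return (1);
		}
		if (chdir("/tmp/rmdir_cwd_demo") == -1) {
			perror("chdir");
			return (1);
		}
		/* zfs_rmdir() compares the target vnode against cwd and fails. */
		if (rmdir("/tmp/rmdir_cwd_demo") == -1)
			printf("rmdir of cwd: %s\n", strerror(errno));
		else
			printf("rmdir of cwd succeeded (not ZFS/illumos behavior)\n");
		return (0);
	}
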
2105 2212
2106 2213 /*
2107 2214 * Read as many directory entries as will fit into the provided
2108 2215 * buffer from the given directory cursor position (specified in
2109 2216 * the uio structure).
2110 2217 *
2111 2218 * IN: vp - vnode of directory to read.
2112 2219 * uio - structure supplying read location, range info,
2113 2220 * and return buffer.
2114 2221 * cr - credentials of caller.
2115 2222 * ct - caller context
2116 2223 * flags - case flags
2117 2224 *
2118 2225 * OUT: uio - updated offset and range, buffer filled.
2119 2226 * eofp - set to true if end-of-file detected.
2120 2227 *
2121 2228 * RETURN: 0 on success, error code on failure.
2122 2229 *
2123 2230 * Timestamps:
2124 2231 * vp - atime updated
2125 2232 *
2126 2233 	 * Note that the low 4 bits of the cookie returned by zap are always zero.
2127 2234 * This allows us to use the low range for "special" directory entries:
2128 2235 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2129 2236 * we use the offset 2 for the '.zfs' directory.
2130 2237 */
2131 2238 /* ARGSUSED */
2132 2239 static int
2133 2240 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2134 2241 caller_context_t *ct, int flags)
2135 2242 {
2136 2243 znode_t *zp = VTOZ(vp);
2137 2244 iovec_t *iovp;
2138 2245 edirent_t *eodp;
2139 2246 dirent64_t *odp;
2140 2247 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2141 2248 objset_t *os;
2142 2249 caddr_t outbuf;
2143 2250 size_t bufsize;
2144 2251 zap_cursor_t zc;
2145 2252 zap_attribute_t zap;
2146 2253 uint_t bytes_wanted;
2147 2254 uint64_t offset; /* must be unsigned; checks for < 1 */
2148 2255 uint64_t parent;
2149 2256 int local_eof;
2150 2257 int outcount;
2151 2258 int error;
2152 2259 uint8_t prefetch;
2153 2260 boolean_t check_sysattrs;
2154 2261
2155 2262 ZFS_ENTER(zfsvfs);
2156 2263 ZFS_VERIFY_ZP(zp);
2157 2264
2158 2265 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2159 2266 &parent, sizeof (parent))) != 0) {
2160 2267 ZFS_EXIT(zfsvfs);
2161 2268 return (error);
2162 2269 }
2163 2270
2164 2271 /*
2165 2272 * If we are not given an eof variable,
2166 2273 * use a local one.
2167 2274 */
2168 2275 if (eofp == NULL)
2169 2276 eofp = &local_eof;
2170 2277
2171 2278 /*
2172 2279 * Check for valid iov_len.
2173 2280 */
2174 2281 if (uio->uio_iov->iov_len <= 0) {
2175 2282 ZFS_EXIT(zfsvfs);
2176 2283 return (SET_ERROR(EINVAL));
2177 2284 }
2178 2285
2179 2286 /*
2180 2287 * Quit if directory has been removed (posix)
2181 2288 */
2182 2289 if ((*eofp = zp->z_unlinked) != 0) {
2183 2290 ZFS_EXIT(zfsvfs);
2184 2291 return (0);
2185 2292 }
2186 2293
2187 2294 error = 0;
2188 2295 os = zfsvfs->z_os;
2189 2296 offset = uio->uio_loffset;
2190 2297 prefetch = zp->z_zn_prefetch;
2191 2298
2192 2299 /*
2193 2300 * Initialize the iterator cursor.
2194 2301 */
2195 2302 if (offset <= 3) {
2196 2303 /*
2197 2304 * Start iteration from the beginning of the directory.
2198 2305 */
2199 2306 zap_cursor_init(&zc, os, zp->z_id);
2200 2307 } else {
2201 2308 /*
2202 2309 * The offset is a serialized cursor.
2203 2310 */
2204 2311 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2205 2312 }
2206 2313
2207 2314 /*
2208 2315 * Get space to change directory entries into fs independent format.
2209 2316 */
2210 2317 iovp = uio->uio_iov;
2211 2318 bytes_wanted = iovp->iov_len;
2212 2319 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2213 2320 bufsize = bytes_wanted;
2214 2321 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2215 2322 odp = (struct dirent64 *)outbuf;
2216 2323 } else {
2217 2324 bufsize = bytes_wanted;
2218 2325 outbuf = NULL;
2219 2326 odp = (struct dirent64 *)iovp->iov_base;
2220 2327 }
2221 2328 eodp = (struct edirent *)odp;
2222 2329
2223 2330 /*
2224 2331 * If this VFS supports the system attribute view interface; and
2225 2332 * we're looking at an extended attribute directory; and we care
2226 2333 * about normalization conflicts on this vfs; then we must check
2227 2334 * for normalization conflicts with the sysattr name space.
2228 2335 */
2229 2336 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2230 2337 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2231 2338 (flags & V_RDDIR_ENTFLAGS);
2232 2339
2233 2340 /*
2234 2341 * Transform to file-system independent format
2235 2342 */
2236 2343 outcount = 0;
2237 2344 while (outcount < bytes_wanted) {
2238 2345 ino64_t objnum;
2239 2346 ushort_t reclen;
2240 2347 off64_t *next = NULL;
2241 2348
2242 2349 /*
2243 2350 * Special case `.', `..', and `.zfs'.
2244 2351 */
2245 2352 if (offset == 0) {
2246 2353 (void) strcpy(zap.za_name, ".");
2247 2354 zap.za_normalization_conflict = 0;
2248 2355 objnum = zp->z_id;
2249 2356 } else if (offset == 1) {
2250 2357 (void) strcpy(zap.za_name, "..");
2251 2358 zap.za_normalization_conflict = 0;
2252 2359 objnum = parent;
2253 2360 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2254 2361 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2255 2362 zap.za_normalization_conflict = 0;
2256 2363 objnum = ZFSCTL_INO_ROOT;
2257 2364 } else {
2258 2365 /*
2259 2366 * Grab next entry.
2260 2367 */
2261 2368 if (error = zap_cursor_retrieve(&zc, &zap)) {
2262 2369 if ((*eofp = (error == ENOENT)) != 0)
2263 2370 break;
2264 2371 else
2265 2372 goto update;
2266 2373 }
2267 2374
2268 2375 if (zap.za_integer_length != 8 ||
2269 2376 zap.za_num_integers != 1) {
2270 2377 cmn_err(CE_WARN, "zap_readdir: bad directory "
2271 2378 "entry, obj = %lld, offset = %lld\n",
2272 2379 (u_longlong_t)zp->z_id,
2273 2380 (u_longlong_t)offset);
2274 2381 error = SET_ERROR(ENXIO);
2275 2382 goto update;
2276 2383 }
2277 2384
2278 2385 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2279 2386 /*
2280 2387 * MacOS X can extract the object type here such as:
2281 2388 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2282 2389 */
2283 2390
2284 2391 if (check_sysattrs && !zap.za_normalization_conflict) {
2285 2392 zap.za_normalization_conflict =
2286 2393 xattr_sysattr_casechk(zap.za_name);
2287 2394 }
2288 2395 }
2289 2396
2290 2397 if (flags & V_RDDIR_ACCFILTER) {
2291 2398 /*
2292 2399 * If we have no access at all, don't include
2293 2400 * this entry in the returned information
2294 2401 */
2295 2402 znode_t *ezp;
2296 2403 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2297 2404 goto skip_entry;
2298 2405 if (!zfs_has_access(ezp, cr)) {
2299 2406 VN_RELE(ZTOV(ezp));
2300 2407 goto skip_entry;
2301 2408 }
2302 2409 VN_RELE(ZTOV(ezp));
2303 2410 }
2304 2411
2305 2412 if (flags & V_RDDIR_ENTFLAGS)
2306 2413 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2307 2414 else
2308 2415 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2309 2416
2310 2417 /*
2311 2418 * Will this entry fit in the buffer?
2312 2419 */
2313 2420 if (outcount + reclen > bufsize) {
2314 2421 /*
2315 2422 * Did we manage to fit anything in the buffer?
2316 2423 */
2317 2424 if (!outcount) {
2318 2425 error = SET_ERROR(EINVAL);
2319 2426 goto update;
2320 2427 }
2321 2428 break;
2322 2429 }
2323 2430 if (flags & V_RDDIR_ENTFLAGS) {
2324 2431 /*
2325 2432 * Add extended flag entry:
2326 2433 */
2327 2434 eodp->ed_ino = objnum;
2328 2435 eodp->ed_reclen = reclen;
2329 2436 /* NOTE: ed_off is the offset for the *next* entry */
2330 2437 next = &(eodp->ed_off);
2331 2438 eodp->ed_eflags = zap.za_normalization_conflict ?
2332 2439 ED_CASE_CONFLICT : 0;
2333 2440 (void) strncpy(eodp->ed_name, zap.za_name,
2334 2441 EDIRENT_NAMELEN(reclen));
2335 2442 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2336 2443 } else {
2337 2444 /*
2338 2445 * Add normal entry:
2339 2446 */
2340 2447 odp->d_ino = objnum;
2341 2448 odp->d_reclen = reclen;
2342 2449 /* NOTE: d_off is the offset for the *next* entry */
2343 2450 next = &(odp->d_off);
2344 2451 (void) strncpy(odp->d_name, zap.za_name,
2345 2452 DIRENT64_NAMELEN(reclen));
2346 2453 odp = (dirent64_t *)((intptr_t)odp + reclen);
2347 2454 }
2348 2455 outcount += reclen;
2349 2456
2350 2457 ASSERT(outcount <= bufsize);
2351 2458
2352 2459 /* Prefetch znode */
2353 2460 if (prefetch)
2354 2461 dmu_prefetch(os, objnum, 0, 0);
2355 2462
2356 2463 skip_entry:
2357 2464 /*
2358 2465 * Move to the next entry, fill in the previous offset.
2359 2466 */
2360 2467 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2361 2468 zap_cursor_advance(&zc);
2362 2469 offset = zap_cursor_serialize(&zc);
2363 2470 } else {
2364 2471 offset += 1;
2365 2472 }
2366 2473 if (next)
2367 2474 *next = offset;
2368 2475 }
2369 2476 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2370 2477
2371 2478 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2372 2479 iovp->iov_base += outcount;
2373 2480 iovp->iov_len -= outcount;
2374 2481 uio->uio_resid -= outcount;
2375 2482 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2376 2483 /*
2377 2484 * Reset the pointer.
2378 2485 */
2379 2486 offset = uio->uio_loffset;
2380 2487 }
2381 2488
2382 2489 update:
2383 2490 zap_cursor_fini(&zc);
2384 2491 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2385 2492 kmem_free(outbuf, bufsize);
2386 2493
2387 2494 if (error == ENOENT)
2388 2495 error = 0;
2389 2496
2390 2497 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2391 2498
2392 2499 uio->uio_loffset = offset;
2393 2500 ZFS_EXIT(zfsvfs);
2394 2501 return (error);
2395 2502 }
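
The block comment above zfs_readdir() describes the cursor scheme: offsets 0, 1, and 2 are reserved for '.', '..', and '.zfs', and larger offsets are serialized ZAP cursors stored into each entry's d_off/ed_off as the offset of the *next* entry. Those cursors are visible through the portable directory API; the sketch below simply prints them, and the exact values are filesystem-specific:

	#include <dirent.h>
	#include <stdio.h>

	int
	main(int argc, char **argv)
	{
		const char *path = (argc > 1) ? argv[1] : ".";
		DIR *dp;
		struct dirent *de;

		if ((dp = opendir(path)) == NULL) {
			perror("opendir");
			return (1);
		}
		/*
		 * telldir() after each entry reports the cursor for the *next*
		 * entry, mirroring how zfs_readdir() fills in d_off/ed_off above.
		 */
		while ((de = readdir(dp)) != NULL)
			printf("%-24s next cursor = %ld\n", de->d_name, telldir(dp));
		(void) closedir(dp);
		return (0);
	}
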
2396 2503
2397 2504 ulong_t zfs_fsync_sync_cnt = 4;
2398 2505
2399 2506 static int
2400 2507 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2401 2508 {
2402 2509 znode_t *zp = VTOZ(vp);
2403 2510 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2404 2511
2405 2512 /*
2406 2513 * Regardless of whether this is required for standards conformance,
2407 2514 * this is the logical behavior when fsync() is called on a file with
2408 2515 * dirty pages. We use B_ASYNC since the ZIL transactions are already
2409 2516 * going to be pushed out as part of the zil_commit().
2410 2517 */
2411 2518 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2412 2519 (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2413 2520 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2414 2521
2415 2522 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2416 2523
2417 2524 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2418 2525 ZFS_ENTER(zfsvfs);
2419 2526 ZFS_VERIFY_ZP(zp);
2420 2527 zil_commit(zfsvfs->z_log, zp->z_id);
2421 2528 ZFS_EXIT(zfsvfs);
2422 2529 }
2423 2530 return (0);
2424 2531 }
2425 2532
2426 2533
2427 2534 /*
2428 2535 * Get the requested file attributes and place them in the provided
2429 2536 * vattr structure.
2430 2537 *
2431 2538 * IN: vp - vnode of file.
2432 2539 * vap - va_mask identifies requested attributes.
2433 2540 * If AT_XVATTR set, then optional attrs are requested
2434 2541 * flags - ATTR_NOACLCHECK (CIFS server context)
2435 2542 * cr - credentials of caller.
2436 2543 * ct - caller context
2437 2544 *
2438 2545 * OUT: vap - attribute values.
2439 2546 *
2440 2547 * RETURN: 0 (always succeeds).
2441 2548 */
2442 2549 /* ARGSUSED */
2443 2550 static int
2444 2551 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2445 2552 caller_context_t *ct)
2446 2553 {
2447 2554 znode_t *zp = VTOZ(vp);
2448 2555 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2449 2556 int error = 0;
2450 2557 uint64_t links;
2451 2558 uint64_t mtime[2], ctime[2];
2452 2559 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2453 2560 xoptattr_t *xoap = NULL;
2454 2561 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2455 2562 sa_bulk_attr_t bulk[2];
2456 2563 int count = 0;
2457 2564
2458 2565 ZFS_ENTER(zfsvfs);
2459 2566 ZFS_VERIFY_ZP(zp);
2460 2567
2461 2568 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2462 2569
2463 2570 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2464 2571 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2465 2572
2466 2573 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2467 2574 ZFS_EXIT(zfsvfs);
2468 2575 return (error);
2469 2576 }
2470 2577
2471 2578 /*
2472 2579 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2473 2580 * Also, if we are the owner don't bother, since owner should
2474 2581 * always be allowed to read basic attributes of file.
2475 2582 */
2476 2583 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2477 2584 (vap->va_uid != crgetuid(cr))) {
2478 2585 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2479 2586 skipaclchk, cr)) {
2480 2587 ZFS_EXIT(zfsvfs);
2481 2588 return (error);
2482 2589 }
2483 2590 }
2484 2591
2485 2592 /*
2486 2593 * Return all attributes. It's cheaper to provide the answer
2487 2594 * than to determine whether we were asked the question.
2488 2595 */
2489 2596
2490 2597 mutex_enter(&zp->z_lock);
2491 2598 vap->va_type = vp->v_type;
2492 2599 vap->va_mode = zp->z_mode & MODEMASK;
2493 2600 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2494 2601 vap->va_nodeid = zp->z_id;
2495 2602 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2496 2603 links = zp->z_links + 1;
2497 2604 else
2498 2605 links = zp->z_links;
2499 2606 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2500 2607 vap->va_size = zp->z_size;
2501 2608 vap->va_rdev = vp->v_rdev;
2502 2609 vap->va_seq = zp->z_seq;
2503 2610
2504 2611 /*
2505 2612 * Add in any requested optional attributes and the create time.
2506 2613 * Also set the corresponding bits in the returned attribute bitmap.
2507 2614 */
2508 2615 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2509 2616 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2510 2617 xoap->xoa_archive =
2511 2618 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2512 2619 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2513 2620 }
2514 2621
2515 2622 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2516 2623 xoap->xoa_readonly =
2517 2624 ((zp->z_pflags & ZFS_READONLY) != 0);
2518 2625 XVA_SET_RTN(xvap, XAT_READONLY);
2519 2626 }
2520 2627
2521 2628 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2522 2629 xoap->xoa_system =
2523 2630 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2524 2631 XVA_SET_RTN(xvap, XAT_SYSTEM);
2525 2632 }
2526 2633
2527 2634 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2528 2635 xoap->xoa_hidden =
2529 2636 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2530 2637 XVA_SET_RTN(xvap, XAT_HIDDEN);
2531 2638 }
2532 2639
2533 2640 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2534 2641 xoap->xoa_nounlink =
2535 2642 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2536 2643 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2537 2644 }
2538 2645
2539 2646 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2540 2647 xoap->xoa_immutable =
2541 2648 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2542 2649 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2543 2650 }
2544 2651
2545 2652 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2546 2653 xoap->xoa_appendonly =
2547 2654 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2548 2655 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2549 2656 }
2550 2657
2551 2658 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2552 2659 xoap->xoa_nodump =
2553 2660 ((zp->z_pflags & ZFS_NODUMP) != 0);
2554 2661 XVA_SET_RTN(xvap, XAT_NODUMP);
2555 2662 }
2556 2663
2557 2664 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2558 2665 xoap->xoa_opaque =
2559 2666 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2560 2667 XVA_SET_RTN(xvap, XAT_OPAQUE);
2561 2668 }
2562 2669
2563 2670 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2564 2671 xoap->xoa_av_quarantined =
2565 2672 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2566 2673 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2567 2674 }
2568 2675
2569 2676 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2570 2677 xoap->xoa_av_modified =
2571 2678 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2572 2679 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2573 2680 }
2574 2681
2575 2682 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2576 2683 vp->v_type == VREG) {
2577 2684 zfs_sa_get_scanstamp(zp, xvap);
2578 2685 }
2579 2686
2580 2687 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2581 2688 uint64_t times[2];
2582 2689
2583 2690 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2584 2691 times, sizeof (times));
2585 2692 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2586 2693 XVA_SET_RTN(xvap, XAT_CREATETIME);
2587 2694 }
2588 2695
2589 2696 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2590 2697 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2591 2698 XVA_SET_RTN(xvap, XAT_REPARSE);
2592 2699 }
2593 2700 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2594 2701 xoap->xoa_generation = zp->z_gen;
2595 2702 XVA_SET_RTN(xvap, XAT_GEN);
2596 2703 }
2597 2704
2598 2705 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2599 2706 xoap->xoa_offline =
2600 2707 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2601 2708 XVA_SET_RTN(xvap, XAT_OFFLINE);
2602 2709 }
2603 2710
2604 2711 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2605 2712 xoap->xoa_sparse =
2606 2713 ((zp->z_pflags & ZFS_SPARSE) != 0);
2607 2714 XVA_SET_RTN(xvap, XAT_SPARSE);
2608 2715 }
2609 2716 }
2610 2717
2611 2718 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2612 2719 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2613 2720 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2614 2721
2615 2722 mutex_exit(&zp->z_lock);
2616 2723
2617 2724 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2618 2725
2619 2726 if (zp->z_blksz == 0) {
2620 2727 /*
2621 2728 * Block size hasn't been set; suggest maximal I/O transfers.
2622 2729 */
2623 2730 vap->va_blksize = zfsvfs->z_max_blksz;
2624 2731 }
2625 2732
2626 2733 ZFS_EXIT(zfsvfs);
2627 2734 return (0);
2628 2735 }
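
One detail of zfs_getattr() worth calling out is the block-size fallback just above: a file with no blocks yet reports the dataset's maximum block size so applications are steered toward large I/O. A hedged user-space sketch (the /tmp path is hypothetical; on ZFS the empty file is expected to report the large suggested size and the written file a small actual one, while other filesystems will differ):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	static void
	show(const char *path, const char *when)
	{
		struct stat st;

		if (stat(path, &st) == 0)
			printf("%s: st_blksize=%ld st_blocks=%lld\n",
			    when, (long)st.st_blksize, (long long)st.st_blocks);
	}

	int
	main(void)
	{
		const char *path = "/tmp/blksize_demo";	/* hypothetical path */
		int fd;

		if ((fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0644)) == -1) {
			perror("open");
			return (1);
		}
		show(path, "empty file ");	/* no blocks yet: suggested size */
		(void) write(fd, "x", 1);
		(void) fsync(fd);
		show(path, "after write");	/* actual block size */
		(void) close(fd);
		(void) unlink(path);
		return (0);
	}
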
2629 2736
2630 2737 /*
2631 2738 * Set the file attributes to the values contained in the
2632 2739 * vattr structure.
2633 2740 *
2634 2741 * IN: vp - vnode of file to be modified.
2635 2742 * vap - new attribute values.
2636 2743 * If AT_XVATTR set, then optional attrs are being set
2637 2744 * flags - ATTR_UTIME set if non-default time values provided.
2638 2745 * - ATTR_NOACLCHECK (CIFS context only).
2639 2746 * cr - credentials of caller.
2640 2747 * ct - caller context
2641 2748 *
2642 2749 * RETURN: 0 on success, error code on failure.
2643 2750 *
2644 2751 * Timestamps:
2645 2752 * vp - ctime updated, mtime updated if size changed.
2646 2753 */
2647 2754 /* ARGSUSED */
2648 2755 static int
2649 2756 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2650 2757 caller_context_t *ct)
2651 2758 {
2652 2759 znode_t *zp = VTOZ(vp);
2653 2760 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2654 2761 zilog_t *zilog;
2655 2762 dmu_tx_t *tx;
2656 2763 vattr_t oldva;
2657 2764 xvattr_t tmpxvattr;
2658 2765 uint_t mask = vap->va_mask;
2659 2766 uint_t saved_mask = 0;
2660 2767 int trim_mask = 0;
2661 2768 uint64_t new_mode;
2662 2769 uint64_t new_uid, new_gid;
2663 2770 uint64_t xattr_obj;
2664 2771 uint64_t mtime[2], ctime[2];
2665 2772 znode_t *attrzp;
2666 2773 int need_policy = FALSE;
2667 2774 int err, err2;
2668 2775 zfs_fuid_info_t *fuidp = NULL;
2669 2776 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2670 2777 xoptattr_t *xoap;
2671 2778 zfs_acl_t *aclp;
2672 2779 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2673 2780 boolean_t fuid_dirtied = B_FALSE;
2674 2781 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2675 2782 int count = 0, xattr_count = 0;
2676 2783
2677 2784 if (mask == 0)
2678 2785 return (0);
2679 2786
2680 2787 if (mask & AT_NOSET)
2681 2788 return (SET_ERROR(EINVAL));
2682 2789
2683 2790 ZFS_ENTER(zfsvfs);
2684 2791 ZFS_VERIFY_ZP(zp);
2685 2792
2686 2793 zilog = zfsvfs->z_log;
2687 2794
2688 2795 /*
2689 2796 * Make sure that if we have ephemeral uid/gid or xvattr specified
2690 2797 * that file system is at proper version level
2691 2798 */
2692 2799
2693 2800 if (zfsvfs->z_use_fuids == B_FALSE &&
2694 2801 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2695 2802 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2696 2803 (mask & AT_XVATTR))) {
2697 2804 ZFS_EXIT(zfsvfs);
2698 2805 return (SET_ERROR(EINVAL));
2699 2806 }
2700 2807
2701 2808 if (mask & AT_SIZE && vp->v_type == VDIR) {
2702 2809 ZFS_EXIT(zfsvfs);
2703 2810 return (SET_ERROR(EISDIR));
2704 2811 }
2705 2812
2706 2813 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2707 2814 ZFS_EXIT(zfsvfs);
2708 2815 return (SET_ERROR(EINVAL));
2709 2816 }
2710 2817
2711 2818 /*
2712 2819 * If this is an xvattr_t, then get a pointer to the structure of
2713 2820 * optional attributes. If this is NULL, then we have a vattr_t.
2714 2821 */
2715 2822 xoap = xva_getxoptattr(xvap);
2716 2823
2717 2824 xva_init(&tmpxvattr);
2718 2825
2719 2826 /*
2720 2827 * Immutable files can only alter immutable bit and atime
2721 2828 */
2722 2829 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2723 2830 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2724 2831 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2725 2832 ZFS_EXIT(zfsvfs);
2726 2833 return (SET_ERROR(EPERM));
2727 2834 }
2728 2835
2729 2836 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2730 2837 ZFS_EXIT(zfsvfs);
2731 2838 return (SET_ERROR(EPERM));
2732 2839 }
2733 2840
2734 2841 /*
2735 2842 	 * Verify that the timestamps don't overflow 32 bits.
2736 2843 	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2737 2844 * handle times greater than 2039. This check should be removed
2738 2845 * once large timestamps are fully supported.
2739 2846 */
2740 2847 if (mask & (AT_ATIME | AT_MTIME)) {
2741 2848 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2742 2849 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2743 2850 ZFS_EXIT(zfsvfs);
2744 2851 return (SET_ERROR(EOVERFLOW));
2745 2852 }
2746 2853 }
2747 2854
2748 2855 top:
2749 2856 attrzp = NULL;
2750 2857 aclp = NULL;
2751 2858
2752 2859 /* Can this be moved to before the top label? */
2753 2860 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2754 2861 ZFS_EXIT(zfsvfs);
2755 2862 return (SET_ERROR(EROFS));
2756 2863 }
2757 2864
2758 2865 /*
2759 2866 * First validate permissions
2760 2867 */
2761 2868
2762 2869 if (mask & AT_SIZE) {
2763 2870 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2764 2871 if (err) {
2765 2872 ZFS_EXIT(zfsvfs);
2766 2873 return (err);
2767 2874 }
2768 2875 /*
2769 2876 * XXX - Note, we are not providing any open
2770 2877 * mode flags here (like FNDELAY), so we may
2771 2878 * block if there are locks present... this
2772 2879 * should be addressed in openat().
2773 2880 */
2774 2881 /* XXX - would it be OK to generate a log record here? */
2775 2882 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2776 2883 if (err) {
2777 2884 ZFS_EXIT(zfsvfs);
2778 2885 return (err);
2779 2886 }
2780 2887
2781 2888 if (vap->va_size == 0)
2782 2889 vnevent_truncate(ZTOV(zp), ct);
2783 2890 }
2784 2891
2785 2892 if (mask & (AT_ATIME|AT_MTIME) ||
2786 2893 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2787 2894 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2788 2895 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2789 2896 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2790 2897 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2791 2898 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2792 2899 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2793 2900 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2794 2901 skipaclchk, cr);
2795 2902 }
2796 2903
2797 2904 if (mask & (AT_UID|AT_GID)) {
2798 2905 int idmask = (mask & (AT_UID|AT_GID));
2799 2906 int take_owner;
2800 2907 int take_group;
2801 2908
2802 2909 /*
2803 2910 * NOTE: even if a new mode is being set,
2804 2911 * we may clear S_ISUID/S_ISGID bits.
2805 2912 */
2806 2913
2807 2914 if (!(mask & AT_MODE))
2808 2915 vap->va_mode = zp->z_mode;
2809 2916
2810 2917 /*
2811 2918 * Take ownership or chgrp to group we are a member of
2812 2919 */
2813 2920
2814 2921 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2815 2922 take_group = (mask & AT_GID) &&
2816 2923 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2817 2924
2818 2925 /*
2819 2926 * If both AT_UID and AT_GID are set then take_owner and
2820 2927 * take_group must both be set in order to allow taking
2821 2928 * ownership.
2822 2929 *
2823 2930 * Otherwise, send the check through secpolicy_vnode_setattr()
2824 2931 *
2825 2932 */
2826 2933
2827 2934 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2828 2935 ((idmask == AT_UID) && take_owner) ||
2829 2936 ((idmask == AT_GID) && take_group)) {
2830 2937 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2831 2938 skipaclchk, cr) == 0) {
2832 2939 /*
2833 2940 * Remove setuid/setgid for non-privileged users
2834 2941 */
2835 2942 secpolicy_setid_clear(vap, cr);
2836 2943 trim_mask = (mask & (AT_UID|AT_GID));
2837 2944 } else {
2838 2945 need_policy = TRUE;
2839 2946 }
2840 2947 } else {
2841 2948 need_policy = TRUE;
2842 2949 }
2843 2950 }
2844 2951
2845 2952 mutex_enter(&zp->z_lock);
2846 2953 oldva.va_mode = zp->z_mode;
2847 2954 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2848 2955 if (mask & AT_XVATTR) {
2849 2956 /*
2850 2957 * Update xvattr mask to include only those attributes
2851 2958 * that are actually changing.
2852 2959 *
2853 2960 * the bits will be restored prior to actually setting
2854 2961 * the attributes so the caller thinks they were set.
2855 2962 */
2856 2963 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2857 2964 if (xoap->xoa_appendonly !=
2858 2965 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2859 2966 need_policy = TRUE;
2860 2967 } else {
2861 2968 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2862 2969 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2863 2970 }
2864 2971 }
2865 2972
2866 2973 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2867 2974 if (xoap->xoa_nounlink !=
2868 2975 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2869 2976 need_policy = TRUE;
2870 2977 } else {
2871 2978 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2872 2979 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2873 2980 }
2874 2981 }
2875 2982
2876 2983 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2877 2984 if (xoap->xoa_immutable !=
2878 2985 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2879 2986 need_policy = TRUE;
2880 2987 } else {
2881 2988 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2882 2989 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2883 2990 }
2884 2991 }
2885 2992
2886 2993 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2887 2994 if (xoap->xoa_nodump !=
2888 2995 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2889 2996 need_policy = TRUE;
2890 2997 } else {
2891 2998 XVA_CLR_REQ(xvap, XAT_NODUMP);
2892 2999 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2893 3000 }
2894 3001 }
2895 3002
2896 3003 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2897 3004 if (xoap->xoa_av_modified !=
2898 3005 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2899 3006 need_policy = TRUE;
2900 3007 } else {
2901 3008 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2902 3009 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2903 3010 }
2904 3011 }
2905 3012
2906 3013 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2907 3014 if ((vp->v_type != VREG &&
2908 3015 xoap->xoa_av_quarantined) ||
2909 3016 xoap->xoa_av_quarantined !=
2910 3017 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2911 3018 need_policy = TRUE;
2912 3019 } else {
2913 3020 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2914 3021 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2915 3022 }
2916 3023 }
2917 3024
2918 3025 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2919 3026 mutex_exit(&zp->z_lock);
2920 3027 ZFS_EXIT(zfsvfs);
2921 3028 return (SET_ERROR(EPERM));
2922 3029 }
2923 3030
2924 3031 if (need_policy == FALSE &&
2925 3032 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2926 3033 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2927 3034 need_policy = TRUE;
2928 3035 }
2929 3036 }
2930 3037
2931 3038 mutex_exit(&zp->z_lock);
2932 3039
2933 3040 if (mask & AT_MODE) {
2934 3041 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2935 3042 err = secpolicy_setid_setsticky_clear(vp, vap,
2936 3043 &oldva, cr);
2937 3044 if (err) {
2938 3045 ZFS_EXIT(zfsvfs);
2939 3046 return (err);
2940 3047 }
2941 3048 trim_mask |= AT_MODE;
2942 3049 } else {
2943 3050 need_policy = TRUE;
2944 3051 }
2945 3052 }
2946 3053
2947 3054 if (need_policy) {
2948 3055 /*
2949 3056 * If trim_mask is set then take ownership
2950 3057 * has been granted or write_acl is present and user
2951 3058 * has the ability to modify mode. In that case remove
2952 3059 	 * UID|GID and/or MODE from mask so that
2953 3060 * secpolicy_vnode_setattr() doesn't revoke it.
2954 3061 */
2955 3062
2956 3063 if (trim_mask) {
2957 3064 saved_mask = vap->va_mask;
2958 3065 vap->va_mask &= ~trim_mask;
2959 3066 }
2960 3067 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2961 3068 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2962 3069 if (err) {
2963 3070 ZFS_EXIT(zfsvfs);
2964 3071 return (err);
2965 3072 }
2966 3073
2967 3074 if (trim_mask)
2968 3075 vap->va_mask |= saved_mask;
2969 3076 }
2970 3077
2971 3078 /*
2972 3079 * secpolicy_vnode_setattr, or take ownership may have
2973 3080 * changed va_mask
2974 3081 */
2975 3082 mask = vap->va_mask;
2976 3083
2977 3084 if ((mask & (AT_UID | AT_GID))) {
2978 3085 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2979 3086 &xattr_obj, sizeof (xattr_obj));
2980 3087
2981 3088 if (err == 0 && xattr_obj) {
2982 3089 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2983 3090 if (err)
2984 3091 goto out2;
2985 3092 }
2986 3093 if (mask & AT_UID) {
2987 3094 new_uid = zfs_fuid_create(zfsvfs,
2988 3095 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2989 3096 if (new_uid != zp->z_uid &&
2990 3097 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
2991 3098 if (attrzp)
2992 3099 VN_RELE(ZTOV(attrzp));
2993 3100 err = SET_ERROR(EDQUOT);
2994 3101 goto out2;
2995 3102 }
2996 3103 }
2997 3104
2998 3105 if (mask & AT_GID) {
2999 3106 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3000 3107 cr, ZFS_GROUP, &fuidp);
3001 3108 if (new_gid != zp->z_gid &&
3002 3109 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3003 3110 if (attrzp)
3004 3111 VN_RELE(ZTOV(attrzp));
3005 3112 err = SET_ERROR(EDQUOT);
3006 3113 goto out2;
3007 3114 }
3008 3115 }
3009 3116 }
3010 3117 tx = dmu_tx_create(zfsvfs->z_os);
3011 3118
3012 3119 if (mask & AT_MODE) {
3013 3120 uint64_t pmode = zp->z_mode;
3014 3121 uint64_t acl_obj;
3015 3122 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3016 3123
3017 3124 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3018 3125 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3019 3126 err = SET_ERROR(EPERM);
3020 3127 goto out;
3021 3128 }
3022 3129
3023 3130 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3024 3131 goto out;
3025 3132
3026 3133 mutex_enter(&zp->z_lock);
3027 3134 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3028 3135 /*
3029 3136 * Are we upgrading ACL from old V0 format
3030 3137 * to V1 format?
3031 3138 */
3032 3139 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3033 3140 zfs_znode_acl_version(zp) ==
3034 3141 ZFS_ACL_VERSION_INITIAL) {
3035 3142 dmu_tx_hold_free(tx, acl_obj, 0,
3036 3143 DMU_OBJECT_END);
3037 3144 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3038 3145 0, aclp->z_acl_bytes);
3039 3146 } else {
3040 3147 dmu_tx_hold_write(tx, acl_obj, 0,
3041 3148 aclp->z_acl_bytes);
3042 3149 }
3043 3150 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3044 3151 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3045 3152 0, aclp->z_acl_bytes);
3046 3153 }
3047 3154 mutex_exit(&zp->z_lock);
3048 3155 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3049 3156 } else {
3050 3157 if ((mask & AT_XVATTR) &&
3051 3158 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3052 3159 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3053 3160 else
3054 3161 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3055 3162 }
3056 3163
3057 3164 if (attrzp) {
3058 3165 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3059 3166 }
3060 3167
3061 3168 fuid_dirtied = zfsvfs->z_fuid_dirty;
3062 3169 if (fuid_dirtied)
3063 3170 zfs_fuid_txhold(zfsvfs, tx);
3064 3171
3065 3172 zfs_sa_upgrade_txholds(tx, zp);
3066 3173
3067 3174 err = dmu_tx_assign(tx, TXG_WAIT);
3068 3175 if (err)
3069 3176 goto out;
3070 3177
3071 3178 count = 0;
3072 3179 /*
3073 3180 * Set each attribute requested.
3074 3181 * We group settings according to the locks they need to acquire.
3075 3182 *
3076 3183 * Note: you cannot set ctime directly, although it will be
3077 3184 * updated as a side-effect of calling this function.
3078 3185 */
3079 3186
3080 3187
3081 3188 if (mask & (AT_UID|AT_GID|AT_MODE))
3082 3189 mutex_enter(&zp->z_acl_lock);
3083 3190 mutex_enter(&zp->z_lock);
3084 3191
3085 3192 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3086 3193 &zp->z_pflags, sizeof (zp->z_pflags));
3087 3194
3088 3195 if (attrzp) {
3089 3196 if (mask & (AT_UID|AT_GID|AT_MODE))
3090 3197 mutex_enter(&attrzp->z_acl_lock);
3091 3198 mutex_enter(&attrzp->z_lock);
3092 3199 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3093 3200 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3094 3201 sizeof (attrzp->z_pflags));
3095 3202 }
3096 3203
3097 3204 if (mask & (AT_UID|AT_GID)) {
3098 3205
3099 3206 if (mask & AT_UID) {
3100 3207 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3101 3208 &new_uid, sizeof (new_uid));
3102 3209 zp->z_uid = new_uid;
3103 3210 if (attrzp) {
3104 3211 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3105 3212 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3106 3213 sizeof (new_uid));
3107 3214 attrzp->z_uid = new_uid;
3108 3215 }
3109 3216 }
3110 3217
3111 3218 if (mask & AT_GID) {
3112 3219 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3113 3220 NULL, &new_gid, sizeof (new_gid));
3114 3221 zp->z_gid = new_gid;
3115 3222 if (attrzp) {
3116 3223 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3117 3224 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3118 3225 sizeof (new_gid));
3119 3226 attrzp->z_gid = new_gid;
3120 3227 }
3121 3228 }
3122 3229 if (!(mask & AT_MODE)) {
3123 3230 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3124 3231 NULL, &new_mode, sizeof (new_mode));
3125 3232 new_mode = zp->z_mode;
3126 3233 }
3127 3234 err = zfs_acl_chown_setattr(zp);
3128 3235 ASSERT(err == 0);
3129 3236 if (attrzp) {
3130 3237 err = zfs_acl_chown_setattr(attrzp);
3131 3238 ASSERT(err == 0);
3132 3239 }
3133 3240 }
3134 3241
3135 3242 if (mask & AT_MODE) {
3136 3243 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3137 3244 &new_mode, sizeof (new_mode));
3138 3245 zp->z_mode = new_mode;
3139 3246 ASSERT3U((uintptr_t)aclp, !=, NULL);
3140 3247 err = zfs_aclset_common(zp, aclp, cr, tx);
3141 3248 ASSERT0(err);
3142 3249 if (zp->z_acl_cached)
3143 3250 zfs_acl_free(zp->z_acl_cached);
3144 3251 zp->z_acl_cached = aclp;
3145 3252 aclp = NULL;
3146 3253 }
3147 3254
3148 3255
3149 3256 if (mask & AT_ATIME) {
3150 3257 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3151 3258 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3152 3259 &zp->z_atime, sizeof (zp->z_atime));
3153 3260 }
3154 3261
3155 3262 if (mask & AT_MTIME) {
3156 3263 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3157 3264 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3158 3265 mtime, sizeof (mtime));
3159 3266 }
3160 3267
3161 3268 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3162 3269 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3163 3270 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3164 3271 NULL, mtime, sizeof (mtime));
3165 3272 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3166 3273 &ctime, sizeof (ctime));
3167 3274 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3168 3275 B_TRUE);
3169 3276 } else if (mask != 0) {
3170 3277 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3171 3278 &ctime, sizeof (ctime));
3172 3279 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3173 3280 B_TRUE);
3174 3281 if (attrzp) {
3175 3282 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3176 3283 SA_ZPL_CTIME(zfsvfs), NULL,
3177 3284 &ctime, sizeof (ctime));
3178 3285 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3179 3286 mtime, ctime, B_TRUE);
3180 3287 }
3181 3288 }
3182 3289 /*
3183 3290 * Do this after setting timestamps to prevent timestamp
3184 3291 * update from toggling bit
3185 3292 */
3186 3293
3187 3294 if (xoap && (mask & AT_XVATTR)) {
3188 3295
3189 3296 /*
3190 3297 * restore trimmed off masks
3191 3298 * so that return masks can be set for caller.
3192 3299 */
3193 3300
3194 3301 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3195 3302 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3196 3303 }
3197 3304 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3198 3305 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3199 3306 }
3200 3307 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3201 3308 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3202 3309 }
3203 3310 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3204 3311 XVA_SET_REQ(xvap, XAT_NODUMP);
3205 3312 }
3206 3313 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3207 3314 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3208 3315 }
3209 3316 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3210 3317 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3211 3318 }
3212 3319
3213 3320 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3214 3321 ASSERT(vp->v_type == VREG);
3215 3322
3216 3323 zfs_xvattr_set(zp, xvap, tx);
3217 3324 }
3218 3325
3219 3326 if (fuid_dirtied)
3220 3327 zfs_fuid_sync(zfsvfs, tx);
3221 3328
3222 3329 if (mask != 0)
3223 3330 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3224 3331
3225 3332 mutex_exit(&zp->z_lock);
3226 3333 if (mask & (AT_UID|AT_GID|AT_MODE))
3227 3334 mutex_exit(&zp->z_acl_lock);
3228 3335
3229 3336 if (attrzp) {
3230 3337 if (mask & (AT_UID|AT_GID|AT_MODE))
3231 3338 mutex_exit(&attrzp->z_acl_lock);
3232 3339 mutex_exit(&attrzp->z_lock);
3233 3340 }
3234 3341 out:
3235 3342 if (err == 0 && attrzp) {
3236 3343 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3237 3344 xattr_count, tx);
3238 3345 ASSERT(err2 == 0);
3239 3346 }
3240 3347
3241 3348 if (attrzp)
3242 3349 VN_RELE(ZTOV(attrzp));
3243 3350
3244 3351 if (aclp)
3245 3352 zfs_acl_free(aclp);
3246 3353
3247 3354 if (fuidp) {
3248 3355 zfs_fuid_info_free(fuidp);
3249 3356 fuidp = NULL;
3250 3357 }
3251 3358
3252 3359 if (err) {
3253 3360 dmu_tx_abort(tx);
3254 3361 if (err == ERESTART)
3255 3362 goto top;
3256 3363 } else {
3257 3364 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3258 3365 dmu_tx_commit(tx);
3259 3366 }
3260 3367
3261 3368 out2:
3262 3369 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3263 3370 zil_commit(zilog, 0);
3264 3371
3265 3372 ZFS_EXIT(zfsvfs);
3266 3373 return (err);
3267 3374 }
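
Most of zfs_setattr() is reached through ordinary chmod/chown/utimes-family calls. As one concrete path, setting explicit timestamps from user space arrives here with AT_ATIME|AT_MTIME in the mask (and, on illumos, the ATTR_UTIME flag); the sketch below is only an illustration of driving that branch, with a made-up path:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <time.h>
	#include <unistd.h>

	int
	main(void)
	{
		const char *path = "/tmp/setattr_demo";	/* hypothetical path */
		struct timespec times[2];
		struct stat st;
		int fd;

		if ((fd = open(path, O_CREAT | O_WRONLY, 0644)) == -1) {
			perror("open");
			return (1);
		}
		(void) close(fd);

		times[0].tv_sec = 1000000000;	/* atime: 2001-09-09 UTC */
		times[0].tv_nsec = 0;
		times[1].tv_sec = 1000000000;	/* mtime: same instant */
		times[1].tv_nsec = 0;

		/* Reaches VOP_SETATTR with AT_ATIME|AT_MTIME set in va_mask. */
		if (utimensat(AT_FDCWD, path, times, 0) == -1)
			perror("utimensat");

		if (stat(path, &st) == 0)
			printf("mtime now %ld\n", (long)st.st_mtime);
		(void) unlink(path);
		return (0);
	}
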
3268 3375
3269 3376 typedef struct zfs_zlock {
3270 3377 krwlock_t *zl_rwlock; /* lock we acquired */
3271 3378 znode_t *zl_znode; /* znode we held */
3272 3379 struct zfs_zlock *zl_next; /* next in list */
3273 3380 } zfs_zlock_t;
3274 3381
3275 3382 /*
3276 3383 * Drop locks and release vnodes that were held by zfs_rename_lock().
3277 3384 */
3278 3385 static void
3279 3386 zfs_rename_unlock(zfs_zlock_t **zlpp)
3280 3387 {
3281 3388 zfs_zlock_t *zl;
3282 3389
3283 3390 while ((zl = *zlpp) != NULL) {
3284 3391 if (zl->zl_znode != NULL)
3285 3392 VN_RELE(ZTOV(zl->zl_znode));
3286 3393 rw_exit(zl->zl_rwlock);
3287 3394 *zlpp = zl->zl_next;
3288 3395 kmem_free(zl, sizeof (*zl));
3289 3396 }
3290 3397 }
3291 3398
3292 3399 /*
3293 3400 * Search back through the directory tree, using the ".." entries.
3294 3401 * Lock each directory in the chain to prevent concurrent renames.
3295 3402 * Fail any attempt to move a directory into one of its own descendants.
3296 3403 * XXX - z_parent_lock can overlap with map or grow locks
3297 3404 */
3298 3405 static int
3299 3406 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3300 3407 {
3301 3408 zfs_zlock_t *zl;
3302 3409 znode_t *zp = tdzp;
3303 3410 uint64_t rootid = zp->z_zfsvfs->z_root;
3304 3411 uint64_t oidp = zp->z_id;
3305 3412 krwlock_t *rwlp = &szp->z_parent_lock;
3306 3413 krw_t rw = RW_WRITER;
3307 3414
3308 3415 /*
3309 3416 * First pass write-locks szp and compares to zp->z_id.
3310 3417 * Later passes read-lock zp and compare to zp->z_parent.
3311 3418 */
3312 3419 do {
3313 3420 if (!rw_tryenter(rwlp, rw)) {
3314 3421 /*
3315 3422 * Another thread is renaming in this path.
3316 3423 * Note that if we are a WRITER, we don't have any
3317 3424 * parent_locks held yet.
3318 3425 */
3319 3426 if (rw == RW_READER && zp->z_id > szp->z_id) {
3320 3427 /*
3321 3428 * Drop our locks and restart
3322 3429 */
3323 3430 zfs_rename_unlock(&zl);
3324 3431 *zlpp = NULL;
3325 3432 zp = tdzp;
3326 3433 oidp = zp->z_id;
3327 3434 rwlp = &szp->z_parent_lock;
3328 3435 rw = RW_WRITER;
3329 3436 continue;
3330 3437 } else {
3331 3438 /*
3332 3439 * Wait for other thread to drop its locks
3333 3440 */
3334 3441 rw_enter(rwlp, rw);
3335 3442 }
3336 3443 }
3337 3444
3338 3445 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3339 3446 zl->zl_rwlock = rwlp;
3340 3447 zl->zl_znode = NULL;
3341 3448 zl->zl_next = *zlpp;
3342 3449 *zlpp = zl;
3343 3450
3344 3451 if (oidp == szp->z_id) /* We're a descendant of szp */
3345 3452 return (SET_ERROR(EINVAL));
3346 3453
3347 3454 if (oidp == rootid) /* We've hit the top */
3348 3455 return (0);
3349 3456
3350 3457 if (rw == RW_READER) { /* i.e. not the first pass */
3351 3458 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3352 3459 if (error)
3353 3460 return (error);
3354 3461 zl->zl_znode = zp;
3355 3462 }
3356 3463 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3357 3464 &oidp, sizeof (oidp));
3358 3465 rwlp = &zp->z_parent_lock;
3359 3466 rw = RW_READER;
3360 3467
3361 3468 } while (zp->z_id != sdzp->z_id);
3362 3469
3363 3470 return (0);
3364 3471 }
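
zfs_rename_lock() above guards against moving a directory into one of its own descendants; the rename code that follows also has to avoid deadlock between the two directory locks, which it does by always locking the directory with the smaller object id first (lexically first name on a tie, and a single lock when source and target are the same directory). The sketch below restates that ordering rule with pthread mutexes; the dir_t type and ids are invented purely for illustration:

	#include <pthread.h>
	#include <stdio.h>
	#include <string.h>

	/* Hypothetical stand-in for a directory znode: an id, a name, a lock. */
	typedef struct dir {
		unsigned long	d_id;
		const char	*d_name;
		pthread_mutex_t	d_lock;
	} dir_t;

	/*
	 * Lock two directories in a canonical order (smaller id first, name
	 * as the tie-breaker), so two concurrent renames can never deadlock
	 * on each other -- the same idea zfs_rename() applies to sdzp/tdzp.
	 */
	static void
	lock_pair(dir_t *a, dir_t *b)
	{
		int cmp;

		if (a->d_id != b->d_id)
			cmp = (a->d_id < b->d_id) ? -1 : 1;
		else
			cmp = strcmp(a->d_name, b->d_name);

		if (cmp <= 0) {
			pthread_mutex_lock(&a->d_lock);
			if (cmp != 0)
				pthread_mutex_lock(&b->d_lock);
		} else {
			pthread_mutex_lock(&b->d_lock);
			pthread_mutex_lock(&a->d_lock);
		}
	}

	int
	main(void)
	{
		dir_t src = { 7, "src", PTHREAD_MUTEX_INITIALIZER };
		dir_t dst = { 3, "dst", PTHREAD_MUTEX_INITIALIZER };

		lock_pair(&src, &dst);	/* always takes dst (id 3) first */
		printf("both directories locked in canonical order\n");
		pthread_mutex_unlock(&src.d_lock);
		pthread_mutex_unlock(&dst.d_lock);
		return (0);
	}
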
3365 3472
3366 3473 /*
3367 3474 * Move an entry from the provided source directory to the target
3368 3475 * directory. Change the entry name as indicated.
3369 3476 *
3370 3477 * IN: sdvp - Source directory containing the "old entry".
3371 3478 * snm - Old entry name.
3372 3479 * tdvp - Target directory to contain the "new entry".
3373 3480 * tnm - New entry name.
3374 3481 * cr - credentials of caller.
3375 3482 * ct - caller context
3376 3483 * flags - case flags
3377 3484 *
3378 3485 * RETURN: 0 on success, error code on failure.
3379 3486 *
3380 3487 * Timestamps:
3381 3488 * sdvp,tdvp - ctime|mtime updated
3382 3489 */
3383 3490 /*ARGSUSED*/
3384 3491 static int
3385 3492 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3386 3493 caller_context_t *ct, int flags)
3387 3494 {
3388 3495 znode_t *tdzp, *szp, *tzp;
3389 3496 znode_t *sdzp = VTOZ(sdvp);
3390 3497 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3391 3498 zilog_t *zilog;
3392 3499 vnode_t *realvp;
3393 3500 zfs_dirlock_t *sdl, *tdl;
3394 3501 dmu_tx_t *tx;
3395 3502 zfs_zlock_t *zl;
3396 3503 int cmp, serr, terr;
3397 3504 int error = 0;
3398 3505 int zflg = 0;
3399 3506 boolean_t waited = B_FALSE;
3400 3507
3401 3508 ZFS_ENTER(zfsvfs);
3402 3509 ZFS_VERIFY_ZP(sdzp);
3403 3510 zilog = zfsvfs->z_log;
3404 3511
3405 3512 /*
3406 3513 * Make sure we have the real vp for the target directory.
3407 3514 */
3408 3515 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3409 3516 tdvp = realvp;
3410 3517
3411 3518 tdzp = VTOZ(tdvp);
3412 3519 ZFS_VERIFY_ZP(tdzp);
3413 3520
3414 3521 /*
3415 3522 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3416 3523 * ctldir appear to have the same v_vfsp.
3417 3524 */
3418 3525 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3419 3526 ZFS_EXIT(zfsvfs);
3420 3527 return (SET_ERROR(EXDEV));
3421 3528 }
3422 3529
3423 3530 if (zfsvfs->z_utf8 && u8_validate(tnm,
3424 3531 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3425 3532 ZFS_EXIT(zfsvfs);
3426 3533 return (SET_ERROR(EILSEQ));
3427 3534 }
3428 3535
3429 3536 if (flags & FIGNORECASE)
3430 3537 zflg |= ZCILOOK;
3431 3538
3432 3539 top:
3433 3540 szp = NULL;
3434 3541 tzp = NULL;
3435 3542 zl = NULL;
3436 3543
3437 3544 /*
3438 3545 * This is to prevent the creation of links into attribute space
3439 3546 	 * by renaming a linked file into/out of an attribute directory.
3440 3547 * See the comment in zfs_link() for why this is considered bad.
3441 3548 */
3442 3549 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3443 3550 ZFS_EXIT(zfsvfs);
3444 3551 return (SET_ERROR(EINVAL));
3445 3552 }
3446 3553
3447 3554 /*
3448 3555 * Lock source and target directory entries. To prevent deadlock,
3449 3556 * a lock ordering must be defined. We lock the directory with
3450 3557 * the smallest object id first, or if it's a tie, the one with
3451 3558 * the lexically first name.
3452 3559 */
3453 3560 if (sdzp->z_id < tdzp->z_id) {
3454 3561 cmp = -1;
3455 3562 } else if (sdzp->z_id > tdzp->z_id) {
3456 3563 cmp = 1;
3457 3564 } else {
3458 3565 /*
3459 3566 * First compare the two name arguments without
3460 3567 * considering any case folding.
3461 3568 */
3462 3569 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3463 3570
3464 3571 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3465 3572 ASSERT(error == 0 || !zfsvfs->z_utf8);
3466 3573 if (cmp == 0) {
3467 3574 /*
3468 3575 * POSIX: "If the old argument and the new argument
3469 3576 * both refer to links to the same existing file,
3470 3577 * the rename() function shall return successfully
3471 3578 * and perform no other action."
3472 3579 */
3473 3580 ZFS_EXIT(zfsvfs);
3474 3581 return (0);
3475 3582 }
3476 3583 /*
3477 3584 * If the file system is case-folding, then we may
3478 3585 * have some more checking to do. A case-folding file
3479 3586 * system is either supporting mixed case sensitivity
3480 3587 		 * system either supports mixed case sensitivity
3481 3588 * that the file system is always case preserving.
3482 3589 *
3483 3590 * In mixed sensitivity mode case sensitive behavior
3484 3591 * is the default. FIGNORECASE must be used to
3485 3592 * explicitly request case insensitive behavior.
3486 3593 *
3487 3594 * If the source and target names provided differ only
3488 3595 * by case (e.g., a request to rename 'tim' to 'Tim'),
3489 3596 * we will treat this as a special case in the
3490 3597 * case-insensitive mode: as long as the source name
3491 3598 * is an exact match, we will allow this to proceed as
3492 3599 * a name-change request.
3493 3600 */
3494 3601 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3495 3602 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3496 3603 flags & FIGNORECASE)) &&
3497 3604 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3498 3605 &error) == 0) {
3499 3606 /*
3500 3607 * case preserving rename request, require exact
3501 3608 * name matches
3502 3609 */
3503 3610 zflg |= ZCIEXACT;
3504 3611 zflg &= ~ZCILOOK;
3505 3612 }
3506 3613 }
3507 3614
3508 3615 /*
3509 3616 * If the source and destination directories are the same, we should
3510 3617 * grab the z_name_lock of that directory only once.
3511 3618 */
3512 3619 if (sdzp == tdzp) {
3513 3620 zflg |= ZHAVELOCK;
3514 3621 rw_enter(&sdzp->z_name_lock, RW_READER);
3515 3622 }
3516 3623
3517 3624 if (cmp < 0) {
3518 3625 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3519 3626 ZEXISTS | zflg, NULL, NULL);
3520 3627 terr = zfs_dirent_lock(&tdl,
3521 3628 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3522 3629 } else {
3523 3630 terr = zfs_dirent_lock(&tdl,
3524 3631 tdzp, tnm, &tzp, zflg, NULL, NULL);
3525 3632 serr = zfs_dirent_lock(&sdl,
3526 3633 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3527 3634 NULL, NULL);
3528 3635 }
3529 3636
3530 3637 if (serr) {
3531 3638 /*
3532 3639 * Source entry invalid or not there.
3533 3640 */
3534 3641 if (!terr) {
3535 3642 zfs_dirent_unlock(tdl);
3536 3643 if (tzp)
3537 3644 VN_RELE(ZTOV(tzp));
3538 3645 }
3539 3646
3540 3647 if (sdzp == tdzp)
3541 3648 rw_exit(&sdzp->z_name_lock);
3542 3649
3543 3650 if (strcmp(snm, "..") == 0)
3544 3651 serr = SET_ERROR(EINVAL);
3545 3652 ZFS_EXIT(zfsvfs);
3546 3653 return (serr);
3547 3654 }
3548 3655 if (terr) {
3549 3656 zfs_dirent_unlock(sdl);
3550 3657 VN_RELE(ZTOV(szp));
3551 3658
3552 3659 if (sdzp == tdzp)
3553 3660 rw_exit(&sdzp->z_name_lock);
3554 3661
3555 3662 if (strcmp(tnm, "..") == 0)
3556 3663 terr = SET_ERROR(EINVAL);
3557 3664 ZFS_EXIT(zfsvfs);
3558 3665 return (terr);
3559 3666 }
3560 3667
3561 3668 /*
3562 3669 * Must have write access at the source to remove the old entry
3563 3670 * and write access at the target to create the new entry.
3564 3671 * Note that if target and source are the same, this can be
3565 3672 * done in a single check.
3566 3673 */
3567 3674
3568 3675 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3569 3676 goto out;
3570 3677
3571 3678 if (ZTOV(szp)->v_type == VDIR) {
3572 3679 /*
3573 3680 * Check to make sure rename is valid.
3574 3681 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3575 3682 */
3576 3683 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3577 3684 goto out;
3578 3685 }
3579 3686
3580 3687 /*
3581 3688 * Does target exist?
3582 3689 */
3583 3690 if (tzp) {
3584 3691 /*
3585 3692 * Source and target must be the same type.
3586 3693 */
3587 3694 if (ZTOV(szp)->v_type == VDIR) {
3588 3695 if (ZTOV(tzp)->v_type != VDIR) {
3589 3696 error = SET_ERROR(ENOTDIR);
3590 3697 goto out;
3591 3698 }
3592 3699 } else {
3593 3700 if (ZTOV(tzp)->v_type == VDIR) {
3594 3701 error = SET_ERROR(EISDIR);
3595 3702 goto out;
3596 3703 }
3597 3704 }
3598 3705 /*
3599 3706 * POSIX dictates that when the source and target
3600 3707 * entries refer to the same file object, rename
3601 3708 * must do nothing and exit without error.
3602 3709 */
3603 3710 if (szp->z_id == tzp->z_id) {
3604 3711 error = 0;
3605 3712 goto out;
3606 3713 }
3607 3714 }
3608 3715
3609 3716 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3610 3717 if (tzp)
3611 3718 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3612 3719
3613 3720 /*
3614 3721 	 * Notify the target directory if it is not the same
3615 3722 	 * as the source directory.
3616 3723 */
3617 3724 if (tdvp != sdvp) {
3618 3725 vnevent_rename_dest_dir(tdvp, ct);
3619 3726 }
3620 3727
3621 3728 tx = dmu_tx_create(zfsvfs->z_os);
3622 3729 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3623 3730 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3624 3731 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3625 3732 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3626 3733 if (sdzp != tdzp) {
3627 3734 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3628 3735 zfs_sa_upgrade_txholds(tx, tdzp);
3629 3736 }
3630 3737 if (tzp) {
3631 3738 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3632 3739 zfs_sa_upgrade_txholds(tx, tzp);
3633 3740 }
3634 3741
3635 3742 zfs_sa_upgrade_txholds(tx, szp);
3636 3743 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3637 3744 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3638 3745 if (error) {
3639 3746 if (zl != NULL)
3640 3747 zfs_rename_unlock(&zl);
3641 3748 zfs_dirent_unlock(sdl);
3642 3749 zfs_dirent_unlock(tdl);
3643 3750
3644 3751 if (sdzp == tdzp)
3645 3752 rw_exit(&sdzp->z_name_lock);
3646 3753
3647 3754 VN_RELE(ZTOV(szp));
3648 3755 if (tzp)
3649 3756 VN_RELE(ZTOV(tzp));
3650 3757 if (error == ERESTART) {
3651 3758 waited = B_TRUE;
3652 3759 dmu_tx_wait(tx);
3653 3760 dmu_tx_abort(tx);
3654 3761 goto top;
3655 3762 }
3656 3763 dmu_tx_abort(tx);
3657 3764 ZFS_EXIT(zfsvfs);
3658 3765 return (error);
3659 3766 }
3660 3767
3661 3768 if (tzp) /* Attempt to remove the existing target */
3662 3769 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3663 3770
3664 3771 if (error == 0) {
3665 3772 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3666 3773 if (error == 0) {
3667 3774 szp->z_pflags |= ZFS_AV_MODIFIED;
3668 3775
3669 3776 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3670 3777 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3671 3778 ASSERT0(error);
3672 3779
3673 3780 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3674 3781 if (error == 0) {
3675 3782 zfs_log_rename(zilog, tx, TX_RENAME |
3676 3783 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3677 3784 sdl->dl_name, tdzp, tdl->dl_name, szp);
3678 3785
3679 3786 /*
3680 3787 * Update path information for the target vnode
3681 3788 */
3682 3789 vn_renamepath(tdvp, ZTOV(szp), tnm,
3683 3790 strlen(tnm));
3684 3791 } else {
3685 3792 /*
3686 3793 * At this point, we have successfully created
3687 3794 * the target name, but have failed to remove
3688 3795 * the source name. Since the create was done
3689 3796 * with the ZRENAMING flag, there are
3690 3797 * complications; for one, the link count is
3691 3798 * wrong. The easiest way to deal with this
3692 3799 * is to remove the newly created target, and
3693 3800 * return the original error. This must
3694 3801 * succeed; fortunately, it is very unlikely to
3695 3802 * fail, since we just created it.
3696 3803 */
3697 3804 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3698 3805 ZRENAMING, NULL), ==, 0);
3699 3806 }
3700 3807 }
3701 3808 }
3702 3809
3703 3810 dmu_tx_commit(tx);
3704 3811 out:
3705 3812 if (zl != NULL)
3706 3813 zfs_rename_unlock(&zl);
3707 3814
3708 3815 zfs_dirent_unlock(sdl);
3709 3816 zfs_dirent_unlock(tdl);
3710 3817
3711 3818 if (sdzp == tdzp)
3712 3819 rw_exit(&sdzp->z_name_lock);
3713 3820
3714 3821
3715 3822 VN_RELE(ZTOV(szp));
3716 3823 if (tzp)
3717 3824 VN_RELE(ZTOV(tzp));
3718 3825
3719 3826 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3720 3827 zil_commit(zilog, 0);
3721 3828
3722 3829 ZFS_EXIT(zfsvfs);
3723 3830 return (error);
3724 3831 }
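/*
 * Illustrative userland sketch (not part of this change): the POSIX
 * rename semantics enforced above, seen from user space.  "src" and
 * "/other-dataset/dst" are hypothetical paths.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>

int
main(void)
{
	/*
	 * Renaming a name to another link to the same file succeeds
	 * and does nothing, per the POSIX rule quoted in zfs_rename().
	 */
	if (rename("src", "src") != 0)
		perror("rename to self");

	/*
	 * Renaming across datasets (or into .zfs) fails with EXDEV,
	 * matching the z_zfsvfs/zfsctl_is_node() check above; mv(1)
	 * then falls back to copy-and-unlink.
	 */
	if (rename("src", "/other-dataset/dst") != 0 && errno == EXDEV)
		(void) fprintf(stderr, "cross-dataset rename: %s\n",
		    strerror(EXDEV));
	return (0);
}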
3725 3832
3726 3833 /*
3727 3834 * Insert the indicated symbolic reference entry into the directory.
3728 3835 *
3729 3836 * IN: dvp - Directory to contain new symbolic link.
3730 3837 * link - Name for new symlink entry.
3731 3838 * vap - Attributes of new entry.
3732 3839 * cr - credentials of caller.
3733 3840 * ct - caller context
3734 3841 * flags - case flags
3735 3842 *
3736 3843 * RETURN: 0 on success, error code on failure.
3737 3844 *
3738 3845 * Timestamps:
3739 3846 * dvp - ctime|mtime updated
3740 3847 */
3741 3848 /*ARGSUSED*/
3742 3849 static int
3743 3850 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3744 3851 caller_context_t *ct, int flags)
3745 3852 {
3746 3853 znode_t *zp, *dzp = VTOZ(dvp);
3747 3854 zfs_dirlock_t *dl;
3748 3855 dmu_tx_t *tx;
3749 3856 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3750 3857 zilog_t *zilog;
3751 3858 uint64_t len = strlen(link);
3752 3859 int error;
3753 3860 int zflg = ZNEW;
3754 3861 zfs_acl_ids_t acl_ids;
3755 3862 boolean_t fuid_dirtied;
3756 3863 uint64_t txtype = TX_SYMLINK;
3757 3864 boolean_t waited = B_FALSE;
3758 3865
3759 3866 ASSERT(vap->va_type == VLNK);
3760 3867
3761 3868 ZFS_ENTER(zfsvfs);
3762 3869 ZFS_VERIFY_ZP(dzp);
3763 3870 zilog = zfsvfs->z_log;
3764 3871
3765 3872 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3766 3873 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3767 3874 ZFS_EXIT(zfsvfs);
3768 3875 return (SET_ERROR(EILSEQ));
3769 3876 }
3770 3877 if (flags & FIGNORECASE)
3771 3878 zflg |= ZCILOOK;
3772 3879
3773 3880 if (len > MAXPATHLEN) {
3774 3881 ZFS_EXIT(zfsvfs);
3775 3882 return (SET_ERROR(ENAMETOOLONG));
3776 3883 }
3777 3884
3778 3885 if ((error = zfs_acl_ids_create(dzp, 0,
3779 3886 vap, cr, NULL, &acl_ids)) != 0) {
3780 3887 ZFS_EXIT(zfsvfs);
3781 3888 return (error);
3782 3889 }
3783 3890 top:
3784 3891 /*
3785 3892 * Attempt to lock directory; fail if entry already exists.
3786 3893 */
3787 3894 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3788 3895 if (error) {
3789 3896 zfs_acl_ids_free(&acl_ids);
3790 3897 ZFS_EXIT(zfsvfs);
3791 3898 return (error);
3792 3899 }
3793 3900
3794 3901 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3795 3902 zfs_acl_ids_free(&acl_ids);
3796 3903 zfs_dirent_unlock(dl);
3797 3904 ZFS_EXIT(zfsvfs);
3798 3905 return (error);
3799 3906 }
3800 3907
3801 3908 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3802 3909 zfs_acl_ids_free(&acl_ids);
3803 3910 zfs_dirent_unlock(dl);
3804 3911 ZFS_EXIT(zfsvfs);
3805 3912 return (SET_ERROR(EDQUOT));
3806 3913 }
3807 3914 tx = dmu_tx_create(zfsvfs->z_os);
3808 3915 fuid_dirtied = zfsvfs->z_fuid_dirty;
3809 3916 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3810 3917 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3811 3918 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3812 3919 ZFS_SA_BASE_ATTR_SIZE + len);
3813 3920 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3814 3921 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3815 3922 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3816 3923 acl_ids.z_aclp->z_acl_bytes);
3817 3924 }
3818 3925 if (fuid_dirtied)
3819 3926 zfs_fuid_txhold(zfsvfs, tx);
3820 3927 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3821 3928 if (error) {
3822 3929 zfs_dirent_unlock(dl);
3823 3930 if (error == ERESTART) {
3824 3931 waited = B_TRUE;
3825 3932 dmu_tx_wait(tx);
3826 3933 dmu_tx_abort(tx);
3827 3934 goto top;
3828 3935 }
3829 3936 zfs_acl_ids_free(&acl_ids);
3830 3937 dmu_tx_abort(tx);
3831 3938 ZFS_EXIT(zfsvfs);
3832 3939 return (error);
3833 3940 }
3834 3941
3835 3942 /*
3836 3943 * Create a new object for the symlink.
3837 3944 	 * For version 4 ZPL datasets, the symlink will be an SA attribute.
3838 3945 */
3839 3946 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3840 3947
3841 3948 if (fuid_dirtied)
3842 3949 zfs_fuid_sync(zfsvfs, tx);
3843 3950
3844 3951 mutex_enter(&zp->z_lock);
3845 3952 if (zp->z_is_sa)
3846 3953 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3847 3954 link, len, tx);
3848 3955 else
3849 3956 zfs_sa_symlink(zp, link, len, tx);
3850 3957 mutex_exit(&zp->z_lock);
3851 3958
3852 3959 zp->z_size = len;
3853 3960 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3854 3961 &zp->z_size, sizeof (zp->z_size), tx);
3855 3962 /*
3856 3963 * Insert the new object into the directory.
3857 3964 */
3858 3965 (void) zfs_link_create(dl, zp, tx, ZNEW);
3859 3966
3860 3967 if (flags & FIGNORECASE)
3861 3968 txtype |= TX_CI;
3862 3969 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3863 3970
3864 3971 zfs_acl_ids_free(&acl_ids);
3865 3972
3866 3973 dmu_tx_commit(tx);
3867 3974
3868 3975 zfs_dirent_unlock(dl);
3869 3976
3870 3977 VN_RELE(ZTOV(zp));
3871 3978
3872 3979 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3873 3980 zil_commit(zilog, 0);
3874 3981
3875 3982 ZFS_EXIT(zfsvfs);
3876 3983 return (error);
3877 3984 }
3878 3985
3879 3986 /*
3880 3987 * Return, in the buffer contained in the provided uio structure,
3881 3988 * the symbolic path referred to by vp.
3882 3989 *
3883 3990 * IN: vp - vnode of symbolic link.
3884 3991 * uio - structure to contain the link path.
3885 3992 * cr - credentials of caller.
3886 3993 * ct - caller context
3887 3994 *
3888 3995 * OUT: uio - structure containing the link path.
3889 3996 *
3890 3997 * RETURN: 0 on success, error code on failure.
3891 3998 *
3892 3999 * Timestamps:
3893 4000 * vp - atime updated
3894 4001 */
3895 4002 /* ARGSUSED */
3896 4003 static int
3897 4004 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3898 4005 {
3899 4006 znode_t *zp = VTOZ(vp);
3900 4007 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3901 4008 int error;
3902 4009
3903 4010 ZFS_ENTER(zfsvfs);
3904 4011 ZFS_VERIFY_ZP(zp);
3905 4012
3906 4013 mutex_enter(&zp->z_lock);
3907 4014 if (zp->z_is_sa)
3908 4015 error = sa_lookup_uio(zp->z_sa_hdl,
3909 4016 SA_ZPL_SYMLINK(zfsvfs), uio);
3910 4017 else
3911 4018 error = zfs_sa_readlink(zp, uio);
3912 4019 mutex_exit(&zp->z_lock);
3913 4020
3914 4021 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3915 4022
3916 4023 ZFS_EXIT(zfsvfs);
3917 4024 return (error);
3918 4025 }
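/*
 * Illustrative userland sketch (not part of this change): creating a
 * symlink and reading it back, which exercises zfs_symlink() and
 * zfs_readlink() above.  "target" and "linkname" are hypothetical
 * paths; readlink(2) does not NUL-terminate, so the caller does it.
 */
#include <unistd.h>
#include <limits.h>
#include <stdio.h>

int
main(void)
{
	char buf[PATH_MAX];
	ssize_t n;

	if (symlink("target", "linkname") != 0)
		perror("symlink");

	if ((n = readlink("linkname", buf, sizeof (buf) - 1)) < 0) {
		perror("readlink");
		return (1);
	}
	buf[n] = '\0';
	(void) printf("linkname -> %s\n", buf);
	return (0);
}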
3919 4026
3920 4027 /*
3921 4028 * Insert a new entry into directory tdvp referencing svp.
3922 4029 *
3923 4030 * IN: tdvp - Directory to contain new entry.
3924 4031 * svp - vnode of new entry.
3925 4032 * name - name of new entry.
3926 4033 * cr - credentials of caller.
3927 4034 * ct - caller context
3928 4035 *
3929 4036 * RETURN: 0 on success, error code on failure.
3930 4037 *
3931 4038 * Timestamps:
3932 4039 * tdvp - ctime|mtime updated
3933 4040 * svp - ctime updated
3934 4041 */
3935 4042 /* ARGSUSED */
3936 4043 static int
3937 4044 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3938 4045 caller_context_t *ct, int flags)
3939 4046 {
3940 4047 znode_t *dzp = VTOZ(tdvp);
3941 4048 znode_t *tzp, *szp;
3942 4049 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3943 4050 zilog_t *zilog;
3944 4051 zfs_dirlock_t *dl;
3945 4052 dmu_tx_t *tx;
3946 4053 vnode_t *realvp;
3947 4054 int error;
3948 4055 int zf = ZNEW;
3949 4056 uint64_t parent;
3950 4057 uid_t owner;
3951 4058 boolean_t waited = B_FALSE;
3952 4059
3953 4060 ASSERT(tdvp->v_type == VDIR);
3954 4061
3955 4062 ZFS_ENTER(zfsvfs);
3956 4063 ZFS_VERIFY_ZP(dzp);
3957 4064 zilog = zfsvfs->z_log;
3958 4065
3959 4066 if (VOP_REALVP(svp, &realvp, ct) == 0)
3960 4067 svp = realvp;
3961 4068
3962 4069 /*
3963 4070 * POSIX dictates that we return EPERM here.
3964 4071 * Better choices include ENOTSUP or EISDIR.
3965 4072 */
3966 4073 if (svp->v_type == VDIR) {
3967 4074 ZFS_EXIT(zfsvfs);
3968 4075 return (SET_ERROR(EPERM));
3969 4076 }
3970 4077
3971 4078 szp = VTOZ(svp);
3972 4079 ZFS_VERIFY_ZP(szp);
3973 4080
3974 4081 /*
3975 4082 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3976 4083 * ctldir appear to have the same v_vfsp.
3977 4084 */
3978 4085 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
3979 4086 ZFS_EXIT(zfsvfs);
3980 4087 return (SET_ERROR(EXDEV));
3981 4088 }
3982 4089
3983 4090 /* Prevent links to .zfs/shares files */
3984 4091
3985 4092 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3986 4093 &parent, sizeof (uint64_t))) != 0) {
3987 4094 ZFS_EXIT(zfsvfs);
3988 4095 return (error);
3989 4096 }
3990 4097 if (parent == zfsvfs->z_shares_dir) {
3991 4098 ZFS_EXIT(zfsvfs);
3992 4099 return (SET_ERROR(EPERM));
3993 4100 }
3994 4101
3995 4102 if (zfsvfs->z_utf8 && u8_validate(name,
3996 4103 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3997 4104 ZFS_EXIT(zfsvfs);
3998 4105 return (SET_ERROR(EILSEQ));
3999 4106 }
4000 4107 if (flags & FIGNORECASE)
4001 4108 zf |= ZCILOOK;
4002 4109
4003 4110 /*
4004 4111 * We do not support links between attributes and non-attributes
4005 4112 * because of the potential security risk of creating links
4006 4113 * into "normal" file space in order to circumvent restrictions
4007 4114 * imposed in attribute space.
4008 4115 */
4009 4116 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4010 4117 ZFS_EXIT(zfsvfs);
4011 4118 return (SET_ERROR(EINVAL));
4012 4119 }
4013 4120
4014 4121
4015 4122 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4016 4123 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4017 4124 ZFS_EXIT(zfsvfs);
4018 4125 return (SET_ERROR(EPERM));
4019 4126 }
4020 4127
4021 4128 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4022 4129 ZFS_EXIT(zfsvfs);
4023 4130 return (error);
4024 4131 }
4025 4132
4026 4133 top:
4027 4134 /*
4028 4135 * Attempt to lock directory; fail if entry already exists.
4029 4136 */
4030 4137 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4031 4138 if (error) {
4032 4139 ZFS_EXIT(zfsvfs);
4033 4140 return (error);
4034 4141 }
4035 4142
4036 4143 tx = dmu_tx_create(zfsvfs->z_os);
4037 4144 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4038 4145 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4039 4146 zfs_sa_upgrade_txholds(tx, szp);
4040 4147 zfs_sa_upgrade_txholds(tx, dzp);
4041 4148 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4042 4149 if (error) {
4043 4150 zfs_dirent_unlock(dl);
4044 4151 if (error == ERESTART) {
4045 4152 waited = B_TRUE;
4046 4153 dmu_tx_wait(tx);
4047 4154 dmu_tx_abort(tx);
4048 4155 goto top;
4049 4156 }
4050 4157 dmu_tx_abort(tx);
4051 4158 ZFS_EXIT(zfsvfs);
4052 4159 return (error);
4053 4160 }
4054 4161
4055 4162 error = zfs_link_create(dl, szp, tx, 0);
4056 4163
4057 4164 if (error == 0) {
4058 4165 uint64_t txtype = TX_LINK;
4059 4166 if (flags & FIGNORECASE)
4060 4167 txtype |= TX_CI;
4061 4168 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4062 4169 }
4063 4170
4064 4171 dmu_tx_commit(tx);
4065 4172
4066 4173 zfs_dirent_unlock(dl);
4067 4174
4068 4175 if (error == 0) {
4069 4176 vnevent_link(svp, ct);
4070 4177 }
4071 4178
4072 4179 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4073 4180 zil_commit(zilog, 0);
4074 4181
4075 4182 ZFS_EXIT(zfsvfs);
4076 4183 return (error);
4077 4184 }
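/*
 * Illustrative userland sketch (not part of this change): the hard
 * link restrictions enforced by zfs_link() above.  "file", "dir" and
 * "/other-dataset/x" are hypothetical paths.
 */
#include <unistd.h>
#include <stdio.h>
#include <errno.h>

int
main(void)
{
	if (link("file", "file.hard") != 0)
		perror("link");

	/* Directories cannot be hard-linked; zfs_link() returns EPERM. */
	if (link("dir", "dir.hard") != 0 && errno == EPERM)
		(void) printf("no hard links to directories\n");

	/* Links cannot cross datasets; zfs_link() returns EXDEV. */
	if (link("file", "/other-dataset/x") != 0 && errno == EXDEV)
		(void) printf("no hard links across datasets\n");
	return (0);
}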
4078 4185
4079 4186 /*
4080 4187 * zfs_null_putapage() is used when the file system has been force
4081 4188 * unmounted. It just drops the pages.
4082 4189 */
4083 4190 /* ARGSUSED */
4084 4191 static int
4085 4192 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4086 4193 size_t *lenp, int flags, cred_t *cr)
4087 4194 {
4088 4195 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4089 4196 return (0);
4090 4197 }
4091 4198
4092 4199 /*
4093 4200 * Push a page out to disk, klustering if possible.
4094 4201 *
4095 4202 * IN: vp - file to push page to.
4096 4203 * pp - page to push.
4097 4204 * flags - additional flags.
4098 4205 * cr - credentials of caller.
4099 4206 *
4100 4207 * OUT: offp - start of range pushed.
4101 4208 * lenp - len of range pushed.
4102 4209 *
4103 4210 * RETURN: 0 on success, error code on failure.
4104 4211 *
4105 4212 * NOTE: callers must have locked the page to be pushed. On
4106 4213 * exit, the page (and all other pages in the kluster) must be
4107 4214 * unlocked.
4108 4215 */
4109 4216 /* ARGSUSED */
4110 4217 static int
4111 4218 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4112 4219 size_t *lenp, int flags, cred_t *cr)
4113 4220 {
4114 4221 znode_t *zp = VTOZ(vp);
4115 4222 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4116 4223 dmu_tx_t *tx;
4117 4224 u_offset_t off, koff;
4118 4225 size_t len, klen;
4119 4226 int err;
4120 4227
4121 4228 off = pp->p_offset;
4122 4229 len = PAGESIZE;
4123 4230 /*
4124 4231 * If our blocksize is bigger than the page size, try to kluster
4125 4232 * multiple pages so that we write a full block (thus avoiding
4126 4233 * a read-modify-write).
4127 4234 */
4128 4235 if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4129 4236 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4130 4237 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4131 4238 ASSERT(koff <= zp->z_size);
4132 4239 if (koff + klen > zp->z_size)
4133 4240 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4134 4241 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4135 4242 }
4136 4243 ASSERT3U(btop(len), ==, btopr(len));
4137 4244
4138 4245 /*
4139 4246 * Can't push pages past end-of-file.
4140 4247 */
4141 4248 if (off >= zp->z_size) {
4142 4249 /* ignore all pages */
4143 4250 err = 0;
4144 4251 goto out;
4145 4252 } else if (off + len > zp->z_size) {
4146 4253 int npages = btopr(zp->z_size - off);
4147 4254 page_t *trunc;
4148 4255
4149 4256 page_list_break(&pp, &trunc, npages);
4150 4257 /* ignore pages past end of file */
4151 4258 if (trunc)
4152 4259 pvn_write_done(trunc, flags);
4153 4260 len = zp->z_size - off;
4154 4261 }
4155 4262
4156 4263 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4157 4264 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4158 4265 err = SET_ERROR(EDQUOT);
4159 4266 goto out;
4160 4267 }
4161 4268 tx = dmu_tx_create(zfsvfs->z_os);
4162 4269 dmu_tx_hold_write(tx, zp->z_id, off, len);
4163 4270
4164 4271 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4165 4272 zfs_sa_upgrade_txholds(tx, zp);
4166 4273 err = dmu_tx_assign(tx, TXG_WAIT);
4167 4274 if (err != 0) {
4168 4275 dmu_tx_abort(tx);
4169 4276 goto out;
4170 4277 }
4171 4278
4172 4279 if (zp->z_blksz <= PAGESIZE) {
4173 4280 caddr_t va = zfs_map_page(pp, S_READ);
4174 4281 ASSERT3U(len, <=, PAGESIZE);
4175 4282 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4176 4283 zfs_unmap_page(pp, va);
4177 4284 } else {
4178 4285 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4179 4286 }
4180 4287
4181 4288 if (err == 0) {
4182 4289 uint64_t mtime[2], ctime[2];
4183 4290 sa_bulk_attr_t bulk[3];
4184 4291 int count = 0;
4185 4292
4186 4293 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4187 4294 &mtime, 16);
4188 4295 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4189 4296 &ctime, 16);
4190 4297 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4191 4298 &zp->z_pflags, 8);
4192 4299 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4193 4300 B_TRUE);
4194 4301 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4195 4302 }
4196 4303 dmu_tx_commit(tx);
4197 4304
4198 4305 out:
4199 4306 pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4200 4307 if (offp)
4201 4308 *offp = off;
4202 4309 if (lenp)
4203 4310 *lenp = len;
4204 4311
4205 4312 return (err);
4206 4313 }
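/*
 * Worked example (not part of this change) of the kluster arithmetic
 * in zfs_putapage() above, with hypothetical values: z_blksz = 128K,
 * PAGESIZE = 4K, a dirty page at off = 20K, and z_size = 100K.
 *
 *	klen = P2ROUNDUP(z_blksz, PAGESIZE)		= 128K
 *	koff = P2ALIGN(off, klen)			= 0
 *	koff + klen (128K) > z_size (100K), so
 *	klen = P2ROUNDUP(z_size - koff, PAGESIZE)	= 100K
 *
 * pvn_write_kluster() then gathers the dirty pages in [0, 100K), so a
 * whole (tail-trimmed) block is written at once rather than a single
 * 4K page, avoiding a read-modify-write of the 128K block.
 */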
4207 4314
4208 4315 /*
4209 4316 * Copy the portion of the file indicated from pages into the file.
4210 4317 	 * The pages are stored in a page list attached to the file's vnode.
4211 4318 *
4212 4319 * IN: vp - vnode of file to push page data to.
4213 4320 * off - position in file to put data.
4214 4321 * len - amount of data to write.
4215 4322 * flags - flags to control the operation.
4216 4323 * cr - credentials of caller.
4217 4324 * ct - caller context.
4218 4325 *
4219 4326 * RETURN: 0 on success, error code on failure.
4220 4327 *
4221 4328 * Timestamps:
4222 4329 * vp - ctime|mtime updated
4223 4330 */
4224 4331 /*ARGSUSED*/
4225 4332 static int
4226 4333 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4227 4334 caller_context_t *ct)
4228 4335 {
4229 4336 znode_t *zp = VTOZ(vp);
4230 4337 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4231 4338 page_t *pp;
4232 4339 size_t io_len;
4233 4340 u_offset_t io_off;
4234 4341 uint_t blksz;
4235 4342 rl_t *rl;
4236 4343 int error = 0;
4237 4344
4238 4345 ZFS_ENTER(zfsvfs);
4239 4346 ZFS_VERIFY_ZP(zp);
4240 4347
4241 4348 /*
4242 4349 * There's nothing to do if no data is cached.
4243 4350 */
4244 4351 if (!vn_has_cached_data(vp)) {
4245 4352 ZFS_EXIT(zfsvfs);
4246 4353 return (0);
4247 4354 }
4248 4355
4249 4356 /*
4250 4357 * Align this request to the file block size in case we kluster.
4251 4358 	 * XXX - this can result in pretty aggressive locking, which can
4252 4359 	 * impact simultaneous read/write access.  One option might be
4253 4360 * to break up long requests (len == 0) into block-by-block
4254 4361 * operations to get narrower locking.
4255 4362 */
4256 4363 blksz = zp->z_blksz;
4257 4364 if (ISP2(blksz))
4258 4365 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4259 4366 else
4260 4367 io_off = 0;
4261 4368 if (len > 0 && ISP2(blksz))
4262 4369 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4263 4370 else
4264 4371 io_len = 0;
4265 4372
4266 4373 if (io_len == 0) {
4267 4374 /*
4268 4375 * Search the entire vp list for pages >= io_off.
4269 4376 */
4270 4377 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4271 4378 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4272 4379 goto out;
4273 4380 }
4274 4381 rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4275 4382
4276 4383 if (off > zp->z_size) {
4277 4384 /* past end of file */
4278 4385 zfs_range_unlock(rl);
4279 4386 ZFS_EXIT(zfsvfs);
4280 4387 return (0);
4281 4388 }
4282 4389
4283 4390 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4284 4391
4285 4392 for (off = io_off; io_off < off + len; io_off += io_len) {
4286 4393 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4287 4394 pp = page_lookup(vp, io_off,
4288 4395 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4289 4396 } else {
4290 4397 pp = page_lookup_nowait(vp, io_off,
4291 4398 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4292 4399 }
4293 4400
4294 4401 if (pp != NULL && pvn_getdirty(pp, flags)) {
4295 4402 int err;
4296 4403
4297 4404 /*
4298 4405 * Found a dirty page to push
4299 4406 */
4300 4407 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4301 4408 if (err)
4302 4409 error = err;
4303 4410 } else {
4304 4411 io_len = PAGESIZE;
4305 4412 }
4306 4413 }
4307 4414 out:
4308 4415 zfs_range_unlock(rl);
4309 4416 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4310 4417 zil_commit(zfsvfs->z_log, zp->z_id);
4311 4418 ZFS_EXIT(zfsvfs);
4312 4419 return (error);
4313 4420 }
4314 4421
4315 4422 /*ARGSUSED*/
4316 4423 void
4317 4424 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4318 4425 {
4319 4426 znode_t *zp = VTOZ(vp);
4320 4427 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4321 4428 int error;
4322 4429
4323 4430 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4324 4431 if (zp->z_sa_hdl == NULL) {
4325 4432 /*
4326 4433 * The fs has been unmounted, or we did a
4327 4434 * suspend/resume and this file no longer exists.
4328 4435 */
4329 4436 if (vn_has_cached_data(vp)) {
4330 4437 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4331 4438 B_INVAL, cr);
4332 4439 }
4333 4440
4334 4441 mutex_enter(&zp->z_lock);
4335 4442 mutex_enter(&vp->v_lock);
4336 4443 ASSERT(vp->v_count == 1);
4337 4444 vp->v_count = 0;
4338 4445 mutex_exit(&vp->v_lock);
4339 4446 mutex_exit(&zp->z_lock);
4340 4447 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4341 4448 zfs_znode_free(zp);
4342 4449 return;
4343 4450 }
4344 4451
4345 4452 /*
4346 4453 * Attempt to push any data in the page cache. If this fails
4347 4454 * we will get kicked out later in zfs_zinactive().
4348 4455 */
4349 4456 if (vn_has_cached_data(vp)) {
4350 4457 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4351 4458 cr);
4352 4459 }
4353 4460
4354 4461 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4355 4462 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4356 4463
4357 4464 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4358 4465 zfs_sa_upgrade_txholds(tx, zp);
4359 4466 error = dmu_tx_assign(tx, TXG_WAIT);
4360 4467 if (error) {
4361 4468 dmu_tx_abort(tx);
4362 4469 } else {
4363 4470 mutex_enter(&zp->z_lock);
4364 4471 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4365 4472 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4366 4473 zp->z_atime_dirty = 0;
4367 4474 mutex_exit(&zp->z_lock);
4368 4475 dmu_tx_commit(tx);
4369 4476 }
4370 4477 }
4371 4478
4372 4479 zfs_zinactive(zp);
4373 4480 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4374 4481 }
4375 4482
4376 4483 /*
4377 4484 * Bounds-check the seek operation.
4378 4485 *
4379 4486 * IN: vp - vnode seeking within
4380 4487 * ooff - old file offset
4381 4488 * noffp - pointer to new file offset
4382 4489 * ct - caller context
4383 4490 *
4384 4491 * RETURN: 0 on success, EINVAL if new offset invalid.
4385 4492 */
4386 4493 /* ARGSUSED */
4387 4494 static int
4388 4495 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4389 4496 caller_context_t *ct)
4390 4497 {
4391 4498 if (vp->v_type == VDIR)
4392 4499 return (0);
4393 4500 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4394 4501 }
4395 4502
4396 4503 /*
4397 4504 * Pre-filter the generic locking function to trap attempts to place
4398 4505 * a mandatory lock on a memory mapped file.
4399 4506 */
4400 4507 static int
4401 4508 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4402 4509 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4403 4510 {
4404 4511 znode_t *zp = VTOZ(vp);
4405 4512 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4406 4513
4407 4514 ZFS_ENTER(zfsvfs);
4408 4515 ZFS_VERIFY_ZP(zp);
4409 4516
4410 4517 /*
4411 4518 * We are following the UFS semantics with respect to mapcnt
4412 4519 * here: If we see that the file is mapped already, then we will
4413 4520 * return an error, but we don't worry about races between this
4414 4521 * function and zfs_map().
4415 4522 */
4416 4523 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4417 4524 ZFS_EXIT(zfsvfs);
4418 4525 return (SET_ERROR(EAGAIN));
4419 4526 }
4420 4527 ZFS_EXIT(zfsvfs);
4421 4528 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4422 4529 }
4423 4530
4424 4531 /*
4425 4532 * If we can't find a page in the cache, we will create a new page
4426 4533 * and fill it with file data. For efficiency, we may try to fill
4427 4534 * multiple pages at once (klustering) to fill up the supplied page
4428 4535 * list. Note that the pages to be filled are held with an exclusive
4429 4536 * lock to prevent access by other threads while they are being filled.
4430 4537 */
4431 4538 static int
4432 4539 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4433 4540 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4434 4541 {
4435 4542 znode_t *zp = VTOZ(vp);
4436 4543 page_t *pp, *cur_pp;
4437 4544 objset_t *os = zp->z_zfsvfs->z_os;
4438 4545 u_offset_t io_off, total;
4439 4546 size_t io_len;
4440 4547 int err;
4441 4548
4442 4549 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4443 4550 /*
4444 4551 * We only have a single page, don't bother klustering
4445 4552 */
4446 4553 io_off = off;
4447 4554 io_len = PAGESIZE;
4448 4555 pp = page_create_va(vp, io_off, io_len,
4449 4556 PG_EXCL | PG_WAIT, seg, addr);
4450 4557 } else {
4451 4558 /*
4452 4559 * Try to find enough pages to fill the page list
4453 4560 */
4454 4561 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4455 4562 &io_len, off, plsz, 0);
4456 4563 }
4457 4564 if (pp == NULL) {
4458 4565 /*
4459 4566 * The page already exists, nothing to do here.
4460 4567 */
4461 4568 *pl = NULL;
4462 4569 return (0);
4463 4570 }
4464 4571
4465 4572 /*
4466 4573 * Fill the pages in the kluster.
4467 4574 */
4468 4575 cur_pp = pp;
4469 4576 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4470 4577 caddr_t va;
4471 4578
4472 4579 ASSERT3U(io_off, ==, cur_pp->p_offset);
4473 4580 va = zfs_map_page(cur_pp, S_WRITE);
4474 4581 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4475 4582 DMU_READ_PREFETCH);
4476 4583 zfs_unmap_page(cur_pp, va);
4477 4584 if (err) {
4478 4585 /* On error, toss the entire kluster */
4479 4586 pvn_read_done(pp, B_ERROR);
4480 4587 /* convert checksum errors into IO errors */
4481 4588 if (err == ECKSUM)
4482 4589 err = SET_ERROR(EIO);
4483 4590 return (err);
4484 4591 }
4485 4592 cur_pp = cur_pp->p_next;
4486 4593 }
4487 4594
4488 4595 /*
4489 4596 * Fill in the page list array from the kluster starting
4490 4597 * from the desired offset `off'.
4491 4598 * NOTE: the page list will always be null terminated.
4492 4599 */
4493 4600 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4494 4601 ASSERT(pl == NULL || (*pl)->p_offset == off);
4495 4602
4496 4603 return (0);
4497 4604 }
4498 4605
4499 4606 /*
4500 4607 * Return pointers to the pages for the file region [off, off + len]
4501 4608 * in the pl array. If plsz is greater than len, this function may
4502 4609 * also return page pointers from after the specified region
4503 4610 * (i.e. the region [off, off + plsz]). These additional pages are
4504 4611 * only returned if they are already in the cache, or were created as
4505 4612 * part of a klustered read.
4506 4613 *
4507 4614 * IN: vp - vnode of file to get data from.
4508 4615 * off - position in file to get data from.
4509 4616 * len - amount of data to retrieve.
4510 4617 * plsz - length of provided page list.
4511 4618 * seg - segment to obtain pages for.
4512 4619 * addr - virtual address of fault.
4513 4620 * rw - mode of created pages.
4514 4621 * cr - credentials of caller.
4515 4622 * ct - caller context.
4516 4623 *
4517 4624 * OUT: protp - protection mode of created pages.
4518 4625 * pl - list of pages created.
4519 4626 *
4520 4627 * RETURN: 0 on success, error code on failure.
4521 4628 *
4522 4629 * Timestamps:
4523 4630 * vp - atime updated
4524 4631 */
4525 4632 /* ARGSUSED */
4526 4633 static int
4527 4634 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4528 4635 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4529 4636 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4530 4637 {
4531 4638 znode_t *zp = VTOZ(vp);
4532 4639 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4533 4640 page_t **pl0 = pl;
4534 4641 int err = 0;
4535 4642
4536 4643 /* we do our own caching, faultahead is unnecessary */
4537 4644 if (pl == NULL)
4538 4645 return (0);
4539 4646 else if (len > plsz)
4540 4647 len = plsz;
4541 4648 else
4542 4649 len = P2ROUNDUP(len, PAGESIZE);
4543 4650 ASSERT(plsz >= len);
4544 4651
4545 4652 ZFS_ENTER(zfsvfs);
4546 4653 ZFS_VERIFY_ZP(zp);
4547 4654
4548 4655 if (protp)
4549 4656 *protp = PROT_ALL;
4550 4657
4551 4658 /*
4552 4659 * Loop through the requested range [off, off + len) looking
4553 4660 * for pages. If we don't find a page, we will need to create
4554 4661 * a new page and fill it with data from the file.
4555 4662 */
4556 4663 while (len > 0) {
4557 4664 if (*pl = page_lookup(vp, off, SE_SHARED))
4558 4665 *(pl+1) = NULL;
4559 4666 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4560 4667 goto out;
4561 4668 while (*pl) {
4562 4669 ASSERT3U((*pl)->p_offset, ==, off);
4563 4670 off += PAGESIZE;
4564 4671 addr += PAGESIZE;
4565 4672 if (len > 0) {
4566 4673 ASSERT3U(len, >=, PAGESIZE);
4567 4674 len -= PAGESIZE;
4568 4675 }
4569 4676 ASSERT3U(plsz, >=, PAGESIZE);
4570 4677 plsz -= PAGESIZE;
4571 4678 pl++;
4572 4679 }
4573 4680 }
4574 4681
4575 4682 /*
4576 4683 * Fill out the page array with any pages already in the cache.
4577 4684 */
4578 4685 while (plsz > 0 &&
4579 4686 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4580 4687 off += PAGESIZE;
4581 4688 plsz -= PAGESIZE;
4582 4689 }
4583 4690 out:
4584 4691 if (err) {
4585 4692 /*
4586 4693 * Release any pages we have previously locked.
4587 4694 */
4588 4695 while (pl > pl0)
4589 4696 page_unlock(*--pl);
4590 4697 } else {
4591 4698 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4592 4699 }
4593 4700
4594 4701 *pl = NULL;
4595 4702
4596 4703 ZFS_EXIT(zfsvfs);
4597 4704 return (err);
4598 4705 }
4599 4706
4600 4707 /*
4601 4708 * Request a memory map for a section of a file. This code interacts
4602 4709 * with common code and the VM system as follows:
4603 4710 *
4604 4711 * - common code calls mmap(), which ends up in smmap_common()
4605 4712 * - this calls VOP_MAP(), which takes you into (say) zfs
4606 4713 * - zfs_map() calls as_map(), passing segvn_create() as the callback
4607 4714 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4608 4715 * - zfs_addmap() updates z_mapcnt
4609 4716 */
4610 4717 /*ARGSUSED*/
4611 4718 static int
4612 4719 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4613 4720 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4614 4721 caller_context_t *ct)
4615 4722 {
4616 4723 znode_t *zp = VTOZ(vp);
4617 4724 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4618 4725 segvn_crargs_t vn_a;
4619 4726 int error;
4620 4727
4621 4728 ZFS_ENTER(zfsvfs);
4622 4729 ZFS_VERIFY_ZP(zp);
4623 4730
4624 4731 if ((prot & PROT_WRITE) && (zp->z_pflags &
4625 4732 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4626 4733 ZFS_EXIT(zfsvfs);
4627 4734 return (SET_ERROR(EPERM));
4628 4735 }
4629 4736
4630 4737 if ((prot & (PROT_READ | PROT_EXEC)) &&
4631 4738 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4632 4739 ZFS_EXIT(zfsvfs);
4633 4740 return (SET_ERROR(EACCES));
4634 4741 }
4635 4742
4636 4743 if (vp->v_flag & VNOMAP) {
4637 4744 ZFS_EXIT(zfsvfs);
4638 4745 return (SET_ERROR(ENOSYS));
4639 4746 }
4640 4747
4641 4748 if (off < 0 || len > MAXOFFSET_T - off) {
4642 4749 ZFS_EXIT(zfsvfs);
4643 4750 return (SET_ERROR(ENXIO));
4644 4751 }
4645 4752
4646 4753 if (vp->v_type != VREG) {
4647 4754 ZFS_EXIT(zfsvfs);
4648 4755 return (SET_ERROR(ENODEV));
4649 4756 }
4650 4757
4651 4758 /*
4652 4759 * If file is locked, disallow mapping.
4653 4760 */
4654 4761 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4655 4762 ZFS_EXIT(zfsvfs);
4656 4763 return (SET_ERROR(EAGAIN));
4657 4764 }
4658 4765
4659 4766 as_rangelock(as);
4660 4767 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4661 4768 if (error != 0) {
4662 4769 as_rangeunlock(as);
4663 4770 ZFS_EXIT(zfsvfs);
4664 4771 return (error);
4665 4772 }
4666 4773
4667 4774 vn_a.vp = vp;
4668 4775 vn_a.offset = (u_offset_t)off;
4669 4776 vn_a.type = flags & MAP_TYPE;
4670 4777 vn_a.prot = prot;
4671 4778 vn_a.maxprot = maxprot;
4672 4779 vn_a.cred = cr;
4673 4780 vn_a.amp = NULL;
4674 4781 vn_a.flags = flags & ~MAP_TYPE;
4675 4782 vn_a.szc = 0;
4676 4783 vn_a.lgrp_mem_policy_flags = 0;
4677 4784
4678 4785 error = as_map(as, *addrp, len, segvn_create, &vn_a);
4679 4786
4680 4787 as_rangeunlock(as);
4681 4788 ZFS_EXIT(zfsvfs);
4682 4789 return (error);
4683 4790 }
4684 4791
4685 4792 /* ARGSUSED */
4686 4793 static int
4687 4794 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4688 4795 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4689 4796 caller_context_t *ct)
4690 4797 {
4691 4798 uint64_t pages = btopr(len);
4692 4799
4693 4800 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4694 4801 return (0);
4695 4802 }
4696 4803
4697 4804 /*
4698 4805 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4699 4806 * more accurate mtime for the associated file. Since we don't have a way of
4700 4807 * detecting when the data was actually modified, we have to resort to
4701 4808 * heuristics. If an explicit msync() is done, then we mark the mtime when the
4702 4809 * last page is pushed. The problem occurs when the msync() call is omitted,
4703 4810  * which is by far the most common case:
4704 4811 *
4705 4812 * open()
4706 4813 * mmap()
4707 4814 * <modify memory>
4708 4815 * munmap()
4709 4816 * close()
4710 4817 * <time lapse>
4711 4818 * putpage() via fsflush
4712 4819 *
4713 4820 * If we wait until fsflush to come along, we can have a modification time that
4714 4821 * is some arbitrary point in the future. In order to prevent this in the
4715 4822 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4716 4823 * torn down.
4717 4824 */
4718 4825 /* ARGSUSED */
4719 4826 static int
4720 4827 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4721 4828 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4722 4829 caller_context_t *ct)
4723 4830 {
4724 4831 uint64_t pages = btopr(len);
4725 4832
4726 4833 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4727 4834 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4728 4835
4729 4836 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4730 4837 vn_has_cached_data(vp))
4731 4838 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4732 4839
4733 4840 return (0);
4734 4841 }
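/*
 * Illustrative userland sketch (not part of this change) of the
 * sequence described above: a shared writable mapping modified and
 * torn down without msync(3C).  The flush issued from zfs_delmap()
 * is what keeps the file's mtime near the munmap() instead of some
 * later fsflush pass.  "datafile" is a hypothetical existing file.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("datafile", O_RDWR);
	char *p;

	if (fd < 0)
		return (1);
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return (1);

	p[0] = 'x';			/* modify memory; no msync() */

	(void) munmap(p, 4096);		/* VOP_DELMAP pushes dirty pages */
	(void) close(fd);
	return (0);
}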
4735 4842
4736 4843 /*
4737 4844 * Free or allocate space in a file. Currently, this function only
4738 4845 * supports the `F_FREESP' command. However, this command is somewhat
4739 4846 * misnamed, as its functionality includes the ability to allocate as
4740 4847 * well as free space.
4741 4848 *
4742 4849 * IN: vp - vnode of file to free data in.
4743 4850 * cmd - action to take (only F_FREESP supported).
4744 4851 * bfp - section of file to free/alloc.
4745 4852 * flag - current file open mode flags.
4746 4853 * offset - current file offset.
4747 4854 * cr - credentials of caller [UNUSED].
4748 4855 * ct - caller context.
4749 4856 *
4750 4857 * RETURN: 0 on success, error code on failure.
4751 4858 *
4752 4859 * Timestamps:
4753 4860 * vp - ctime|mtime updated
4754 4861 */
4755 4862 /* ARGSUSED */
4756 4863 static int
4757 4864 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4758 4865 offset_t offset, cred_t *cr, caller_context_t *ct)
4759 4866 {
4760 4867 znode_t *zp = VTOZ(vp);
4761 4868 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4762 4869 uint64_t off, len;
4763 4870 int error;
4764 4871
4765 4872 ZFS_ENTER(zfsvfs);
4766 4873 ZFS_VERIFY_ZP(zp);
4767 4874
4768 4875 if (cmd != F_FREESP) {
4769 4876 ZFS_EXIT(zfsvfs);
4770 4877 return (SET_ERROR(EINVAL));
4771 4878 }
4772 4879
4773 4880 /*
4774 4881 	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
4775 4882 	 * our callers might not be able to properly detect that we are read-only,
4776 4883 * so check it explicitly here.
4777 4884 */
4778 4885 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
4779 4886 ZFS_EXIT(zfsvfs);
4780 4887 return (SET_ERROR(EROFS));
4781 4888 }
4782 4889
4783 4890 if (error = convoff(vp, bfp, 0, offset)) {
4784 4891 ZFS_EXIT(zfsvfs);
4785 4892 return (error);
4786 4893 }
4787 4894
4788 4895 if (bfp->l_len < 0) {
4789 4896 ZFS_EXIT(zfsvfs);
4790 4897 return (SET_ERROR(EINVAL));
4791 4898 }
4792 4899
4793 4900 off = bfp->l_start;
4794 4901 len = bfp->l_len; /* 0 means from off to end of file */
4795 4902
4796 4903 error = zfs_freesp(zp, off, len, flag, TRUE);
4797 4904
4798 4905 if (error == 0 && off == 0 && len == 0)
4799 4906 vnevent_truncate(ZTOV(zp), ct);
4800 4907
4801 4908 ZFS_EXIT(zfsvfs);
4802 4909 return (error);
4803 4910 }
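/*
 * Illustrative userland sketch (not part of this change): freeing
 * file space with fcntl(F_FREESP), which reaches zfs_space() above
 * through VOP_SPACE.  l_len == 0 means "from l_start to end of file",
 * i.e. truncate at l_start.  "datafile" is a hypothetical path.
 */
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd = open("datafile", O_RDWR);

	if (fd < 0)
		return (1);

	fl.l_whence = SEEK_SET;		/* l_start is an absolute offset */
	fl.l_start = 65536;		/* free everything from 64K ... */
	fl.l_len = 0;			/* ... through end of file */

	if (fcntl(fd, F_FREESP, &fl) != 0) {
		(void) close(fd);
		return (1);
	}
	(void) close(fd);
	return (0);
}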
4804 4911
4805 4912 /*ARGSUSED*/
4806 4913 static int
4807 4914 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4808 4915 {
4809 4916 znode_t *zp = VTOZ(vp);
4810 4917 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4811 4918 uint32_t gen;
4812 4919 uint64_t gen64;
4813 4920 uint64_t object = zp->z_id;
4814 4921 zfid_short_t *zfid;
4815 4922 int size, i, error;
4816 4923
4817 4924 ZFS_ENTER(zfsvfs);
4818 4925 ZFS_VERIFY_ZP(zp);
4819 4926
4820 4927 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4821 4928 &gen64, sizeof (uint64_t))) != 0) {
4822 4929 ZFS_EXIT(zfsvfs);
4823 4930 return (error);
4824 4931 }
4825 4932
4826 4933 gen = (uint32_t)gen64;
4827 4934
4828 4935 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4829 4936 if (fidp->fid_len < size) {
4830 4937 fidp->fid_len = size;
4831 4938 ZFS_EXIT(zfsvfs);
4832 4939 return (SET_ERROR(ENOSPC));
4833 4940 }
4834 4941
4835 4942 zfid = (zfid_short_t *)fidp;
4836 4943
4837 4944 zfid->zf_len = size;
4838 4945
4839 4946 for (i = 0; i < sizeof (zfid->zf_object); i++)
4840 4947 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4841 4948
4842 4949 /* Must have a non-zero generation number to distinguish from .zfs */
4843 4950 if (gen == 0)
4844 4951 gen = 1;
4845 4952 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4846 4953 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4847 4954
4848 4955 if (size == LONG_FID_LEN) {
4849 4956 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4850 4957 zfid_long_t *zlfid;
4851 4958
4852 4959 zlfid = (zfid_long_t *)fidp;
4853 4960
4854 4961 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4855 4962 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4856 4963
4857 4964 /* XXX - this should be the generation number for the objset */
4858 4965 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4859 4966 zlfid->zf_setgen[i] = 0;
4860 4967 }
4861 4968
4862 4969 ZFS_EXIT(zfsvfs);
4863 4970 return (0);
4864 4971 }
4865 4972
4866 4973 static int
4867 4974 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4868 4975 caller_context_t *ct)
4869 4976 {
4870 4977 znode_t *zp, *xzp;
4871 4978 zfsvfs_t *zfsvfs;
4872 4979 zfs_dirlock_t *dl;
4873 4980 int error;
4874 4981
4875 4982 switch (cmd) {
4876 4983 case _PC_LINK_MAX:
4877 4984 *valp = ULONG_MAX;
4878 4985 return (0);
4879 4986
4880 4987 case _PC_FILESIZEBITS:
4881 4988 *valp = 64;
4882 4989 return (0);
4883 4990
4884 4991 case _PC_XATTR_EXISTS:
4885 4992 zp = VTOZ(vp);
4886 4993 zfsvfs = zp->z_zfsvfs;
4887 4994 ZFS_ENTER(zfsvfs);
4888 4995 ZFS_VERIFY_ZP(zp);
4889 4996 *valp = 0;
4890 4997 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4891 4998 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4892 4999 if (error == 0) {
4893 5000 zfs_dirent_unlock(dl);
4894 5001 if (!zfs_dirempty(xzp))
4895 5002 *valp = 1;
4896 5003 VN_RELE(ZTOV(xzp));
4897 5004 } else if (error == ENOENT) {
4898 5005 /*
4899 5006 * If there aren't extended attributes, it's the
4900 5007 * same as having zero of them.
4901 5008 */
4902 5009 error = 0;
4903 5010 }
4904 5011 ZFS_EXIT(zfsvfs);
4905 5012 return (error);
4906 5013
4907 5014 case _PC_SATTR_ENABLED:
4908 5015 case _PC_SATTR_EXISTS:
4909 5016 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4910 5017 (vp->v_type == VREG || vp->v_type == VDIR);
4911 5018 return (0);
4912 5019
4913 5020 case _PC_ACCESS_FILTERING:
4914 5021 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4915 5022 vp->v_type == VDIR;
4916 5023 return (0);
4917 5024
4918 5025 case _PC_ACL_ENABLED:
4919 5026 *valp = _ACL_ACE_ENABLED;
4920 5027 return (0);
4921 5028
4922 5029 case _PC_MIN_HOLE_SIZE:
4923 5030 *valp = (ulong_t)SPA_MINBLOCKSIZE;
4924 5031 return (0);
4925 5032
4926 5033 case _PC_TIMESTAMP_RESOLUTION:
4927 5034 /* nanosecond timestamp resolution */
4928 5035 *valp = 1L;
4929 5036 return (0);
4930 5037
4931 5038 default:
4932 5039 return (fs_pathconf(vp, cmd, valp, cr, ct));
4933 5040 }
4934 5041 }
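/*
 * Illustrative userland sketch (not part of this change): querying
 * two of the properties served by zfs_pathconf() above through
 * pathconf(2).  "file" is a hypothetical path.
 */
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	long bits = pathconf("file", _PC_FILESIZEBITS);
	long xattr = pathconf("file", _PC_XATTR_EXISTS);

	(void) printf("file size bits: %ld\n", bits);	/* 64 on ZFS */
	(void) printf("extended attrs present: %ld\n", xattr);
	return (0);
}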
4935 5042
4936 5043 /*ARGSUSED*/
4937 5044 static int
4938 5045 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4939 5046 caller_context_t *ct)
4940 5047 {
4941 5048 znode_t *zp = VTOZ(vp);
4942 5049 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4943 5050 int error;
4944 5051 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4945 5052
4946 5053 ZFS_ENTER(zfsvfs);
4947 5054 ZFS_VERIFY_ZP(zp);
4948 5055 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4949 5056 ZFS_EXIT(zfsvfs);
4950 5057
4951 5058 return (error);
4952 5059 }
4953 5060
4954 5061 /*ARGSUSED*/
4955 5062 static int
4956 5063 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4957 5064 caller_context_t *ct)
4958 5065 {
4959 5066 znode_t *zp = VTOZ(vp);
4960 5067 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4961 5068 int error;
4962 5069 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4963 5070 zilog_t *zilog = zfsvfs->z_log;
4964 5071
4965 5072 ZFS_ENTER(zfsvfs);
4966 5073 ZFS_VERIFY_ZP(zp);
4967 5074
4968 5075 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4969 5076
4970 5077 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4971 5078 zil_commit(zilog, 0);
4972 5079
4973 5080 ZFS_EXIT(zfsvfs);
4974 5081 return (error);
4975 5082 }
4976 5083
4977 5084 /*
4978 5085  * The smallest read for which we may consider loaning out an arcbuf.
4979 5086 * This must be a power of 2.
4980 5087 */
4981 5088 int zcr_blksz_min = (1 << 10); /* 1K */
4982 5089 /*
4983 5090 * If set to less than the file block size, allow loaning out of an
4984 5091 * arcbuf for a partial block read. This must be a power of 2.
4985 5092 */
4986 5093 int zcr_blksz_max = (1 << 17); /* 128K */
4987 5094
4988 5095 /*ARGSUSED*/
4989 5096 static int
4990 5097 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
4991 5098 caller_context_t *ct)
4992 5099 {
4993 5100 znode_t *zp = VTOZ(vp);
4994 5101 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4995 5102 int max_blksz = zfsvfs->z_max_blksz;
4996 5103 uio_t *uio = &xuio->xu_uio;
4997 5104 ssize_t size = uio->uio_resid;
4998 5105 offset_t offset = uio->uio_loffset;
4999 5106 int blksz;
5000 5107 int fullblk, i;
5001 5108 arc_buf_t *abuf;
5002 5109 ssize_t maxsize;
5003 5110 int preamble, postamble;
5004 5111
5005 5112 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5006 5113 return (SET_ERROR(EINVAL));
5007 5114
5008 5115 ZFS_ENTER(zfsvfs);
5009 5116 ZFS_VERIFY_ZP(zp);
5010 5117 switch (ioflag) {
5011 5118 case UIO_WRITE:
5012 5119 /*
5013 5120 * Loan out an arc_buf for write if write size is bigger than
5014 5121 * max_blksz, and the file's block size is also max_blksz.
5015 5122 */
5016 5123 blksz = max_blksz;
5017 5124 if (size < blksz || zp->z_blksz != blksz) {
5018 5125 ZFS_EXIT(zfsvfs);
5019 5126 return (SET_ERROR(EINVAL));
5020 5127 }
5021 5128 /*
5022 5129 * Caller requests buffers for write before knowing where the
5023 5130 * write offset might be (e.g. NFS TCP write).
5024 5131 */
5025 5132 if (offset == -1) {
5026 5133 preamble = 0;
5027 5134 } else {
5028 5135 preamble = P2PHASE(offset, blksz);
5029 5136 if (preamble) {
5030 5137 preamble = blksz - preamble;
5031 5138 size -= preamble;
5032 5139 }
5033 5140 }
5034 5141
5035 5142 postamble = P2PHASE(size, blksz);
5036 5143 size -= postamble;
5037 5144
5038 5145 fullblk = size / blksz;
5039 5146 (void) dmu_xuio_init(xuio,
5040 5147 (preamble != 0) + fullblk + (postamble != 0));
5041 5148 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5042 5149 int, postamble, int,
5043 5150 (preamble != 0) + fullblk + (postamble != 0));
5044 5151
5045 5152 /*
5046 5153 * Have to fix iov base/len for partial buffers. They
5047 5154 * currently represent full arc_buf's.
5048 5155 */
5049 5156 if (preamble) {
5050 5157 /* data begins in the middle of the arc_buf */
5051 5158 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5052 5159 blksz);
5053 5160 ASSERT(abuf);
5054 5161 (void) dmu_xuio_add(xuio, abuf,
5055 5162 blksz - preamble, preamble);
5056 5163 }
5057 5164
5058 5165 for (i = 0; i < fullblk; i++) {
5059 5166 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5060 5167 blksz);
5061 5168 ASSERT(abuf);
5062 5169 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5063 5170 }
5064 5171
5065 5172 if (postamble) {
5066 5173 /* data ends in the middle of the arc_buf */
5067 5174 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5068 5175 blksz);
5069 5176 ASSERT(abuf);
5070 5177 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5071 5178 }
5072 5179 break;
5073 5180 case UIO_READ:
5074 5181 /*
5075 5182 * Loan out an arc_buf for read if the read size is larger than
5076 5183 * the current file block size. Block alignment is not
5077 5184 * considered. Partial arc_buf will be loaned out for read.
5078 5185 */
5079 5186 blksz = zp->z_blksz;
5080 5187 if (blksz < zcr_blksz_min)
5081 5188 blksz = zcr_blksz_min;
5082 5189 if (blksz > zcr_blksz_max)
5083 5190 blksz = zcr_blksz_max;
5084 5191 /* avoid potential complexity of dealing with it */
5085 5192 if (blksz > max_blksz) {
5086 5193 ZFS_EXIT(zfsvfs);
5087 5194 return (SET_ERROR(EINVAL));
5088 5195 }
5089 5196
5090 5197 maxsize = zp->z_size - uio->uio_loffset;
5091 5198 if (size > maxsize)
5092 5199 size = maxsize;
5093 5200
5094 5201 if (size < blksz || vn_has_cached_data(vp)) {
5095 5202 ZFS_EXIT(zfsvfs);
5096 5203 return (SET_ERROR(EINVAL));
5097 5204 }
5098 5205 break;
5099 5206 default:
5100 5207 ZFS_EXIT(zfsvfs);
5101 5208 return (SET_ERROR(EINVAL));
5102 5209 }
5103 5210
5104 5211 uio->uio_extflg = UIO_XUIO;
5105 5212 XUIO_XUZC_RW(xuio) = ioflag;
5106 5213 ZFS_EXIT(zfsvfs);
5107 5214 return (0);
5108 5215 }
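/*
 * Worked example (not part of this change) of the UIO_WRITE carve-up
 * in zfs_reqzcbuf() above, with hypothetical values: blksz = 128K,
 * offset = 96K, size = 512K.
 *
 *	preamble  = P2PHASE(96K, 128K) = 96K; nonzero, so
 *	preamble  = 128K - 96K = 32K, and size becomes 480K
 *	postamble = P2PHASE(480K, 128K) = 96K, and size becomes 384K
 *	fullblk   = 384K / 128K = 3
 *
 * dmu_xuio_init() is therefore asked for 1 + 3 + 1 = 5 arc_bufs: a
 * partial buffer for the last 32K of the first block, three full
 * 128K buffers, and a partial buffer for the first 96K of the last
 * block.
 */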
5109 5216
5110 5217 /*ARGSUSED*/
5111 5218 static int
5112 5219 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5113 5220 {
5114 5221 int i;
5115 5222 arc_buf_t *abuf;
5116 5223 int ioflag = XUIO_XUZC_RW(xuio);
5117 5224
5118 5225 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5119 5226
5120 5227 i = dmu_xuio_cnt(xuio);
5121 5228 while (i-- > 0) {
5122 5229 abuf = dmu_xuio_arcbuf(xuio, i);
5123 5230 /*
5124 5231 * if abuf == NULL, it must be a write buffer
5125 5232 * that has been returned in zfs_write().
5126 5233 */
5127 5234 if (abuf)
5128 5235 dmu_return_arcbuf(abuf);
5129 5236 ASSERT(abuf || ioflag == UIO_WRITE);
5130 5237 }
5131 5238
5132 5239 dmu_xuio_fini(xuio);
5133 5240 return (0);
5134 5241 }
5135 5242
5136 5243 /*
5137 5244 * Predeclare these here so that the compiler assumes that
5138 5245 * this is an "old style" function declaration that does
5139 5246 * not include arguments => we won't get type mismatch errors
5140 5247 * in the initializations that follow.
5141 5248 */
5142 5249 static int zfs_inval();
5143 5250 static int zfs_isdir();
5144 5251
5145 5252 static int
5146 5253 zfs_inval()
5147 5254 {
5148 5255 return (SET_ERROR(EINVAL));
5149 5256 }
5150 5257
5151 5258 static int
5152 5259 zfs_isdir()
5153 5260 {
5154 5261 return (SET_ERROR(EISDIR));
5155 5262 }
5156 5263 /*
5157 5264 * Directory vnode operations template
5158 5265 */
5159 5266 vnodeops_t *zfs_dvnodeops;
5160 5267 const fs_operation_def_t zfs_dvnodeops_template[] = {
5161 5268 VOPNAME_OPEN, { .vop_open = zfs_open },
5162 5269 VOPNAME_CLOSE, { .vop_close = zfs_close },
5163 5270 VOPNAME_READ, { .error = zfs_isdir },
5164 5271 VOPNAME_WRITE, { .error = zfs_isdir },
5165 5272 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5166 5273 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5167 5274 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5168 5275 VOPNAME_ACCESS, { .vop_access = zfs_access },
5169 5276 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5170 5277 VOPNAME_CREATE, { .vop_create = zfs_create },
5171 5278 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5172 5279 VOPNAME_LINK, { .vop_link = zfs_link },
5173 5280 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5174 5281 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
5175 5282 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5176 5283 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5177 5284 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
5178 5285 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5179 5286 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5180 5287 VOPNAME_FID, { .vop_fid = zfs_fid },
5181 5288 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5182 5289 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5183 5290 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5184 5291 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5185 5292 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5186 5293 NULL, NULL
5187 5294 };
5188 5295
5189 5296 /*
5190 5297 * Regular file vnode operations template
5191 5298 */
5192 5299 vnodeops_t *zfs_fvnodeops;
5193 5300 const fs_operation_def_t zfs_fvnodeops_template[] = {
5194 5301 VOPNAME_OPEN, { .vop_open = zfs_open },
5195 5302 VOPNAME_CLOSE, { .vop_close = zfs_close },
5196 5303 VOPNAME_READ, { .vop_read = zfs_read },
5197 5304 VOPNAME_WRITE, { .vop_write = zfs_write },
5198 5305 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5199 5306 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5200 5307 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5201 5308 VOPNAME_ACCESS, { .vop_access = zfs_access },
5202 5309 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5203 5310 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5204 5311 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5205 5312 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5206 5313 VOPNAME_FID, { .vop_fid = zfs_fid },
5207 5314 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5208 5315 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
5209 5316 VOPNAME_SPACE, { .vop_space = zfs_space },
5210 5317 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
5211 5318 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
5212 5319 VOPNAME_MAP, { .vop_map = zfs_map },
5213 5320 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
5214 5321 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
5215 5322 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5216 5323 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5217 5324 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5218 5325 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5219 5326 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
5220 5327 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
5221 5328 NULL, NULL
5222 5329 };
5223 5330
5224 5331 /*
5225 5332 * Symbolic link vnode operations template
5226 5333 */
5227 5334 vnodeops_t *zfs_symvnodeops;
5228 5335 const fs_operation_def_t zfs_symvnodeops_template[] = {
5229 5336 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5230 5337 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5231 5338 VOPNAME_ACCESS, { .vop_access = zfs_access },
5232 5339 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5233 5340 VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
5234 5341 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5235 5342 VOPNAME_FID, { .vop_fid = zfs_fid },
5236 5343 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5237 5344 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5238 5345 NULL, NULL
5239 5346 };
5240 5347
5241 5348 /*
5242 5349 * special share hidden files vnode operations template
5243 5350 */
5244 5351 vnodeops_t *zfs_sharevnodeops;
5245 5352 const fs_operation_def_t zfs_sharevnodeops_template[] = {
5246 5353 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5247 5354 VOPNAME_ACCESS, { .vop_access = zfs_access },
5248 5355 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5249 5356 VOPNAME_FID, { .vop_fid = zfs_fid },
5250 5357 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5251 5358 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5252 5359 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5253 5360 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5254 5361 NULL, NULL
5255 5362 };
5256 5363
5257 5364 /*
5258 5365 * Extended attribute directory vnode operations template
5259 5366 *
5260 5367 * This template is identical to the directory vnodes
5261 5368 * operation template except for restricted operations:
5262 5369 * VOP_MKDIR()
5263 5370 * VOP_SYMLINK()
5264 5371 *
5265 5372 * Note that there are other restrictions embedded in:
5266 5373 * zfs_create() - restrict type to VREG
5267 5374 * zfs_link() - no links into/out of attribute space
5268 5375 * zfs_rename() - no moves into/out of attribute space
5269 5376 */
5270 5377 vnodeops_t *zfs_xdvnodeops;
5271 5378 const fs_operation_def_t zfs_xdvnodeops_template[] = {
5272 5379 VOPNAME_OPEN, { .vop_open = zfs_open },
5273 5380 VOPNAME_CLOSE, { .vop_close = zfs_close },
5274 5381 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5275 5382 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5276 5383 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5277 5384 VOPNAME_ACCESS, { .vop_access = zfs_access },
5278 5385 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5279 5386 VOPNAME_CREATE, { .vop_create = zfs_create },
5280 5387 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5281 5388 VOPNAME_LINK, { .vop_link = zfs_link },
5282 5389 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5283 5390 VOPNAME_MKDIR, { .error = zfs_inval },
5284 5391 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5285 5392 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5286 5393 VOPNAME_SYMLINK, { .error = zfs_inval },
5287 5394 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5288 5395 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5289 5396 VOPNAME_FID, { .vop_fid = zfs_fid },
5290 5397 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5291 5398 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5292 5399 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5293 5400 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5294 5401 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5295 5402 NULL, NULL
5296 5403 };
5297 5404
5298 5405 /*
5299 5406 * Error vnode operations template
5300 5407 */
5301 5408 vnodeops_t *zfs_evnodeops;
5302 5409 const fs_operation_def_t zfs_evnodeops_template[] = {
5303 5410 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5304 5411 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5305 5412 NULL, NULL
5306 5413 };
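/*
 * Editor's sketch -- not part of this diff.  The templates above only pair
 * operation names with handler functions; they become live vnodeops_t
 * tables when the module hands each template to vn_make_ops() (in illumos
 * this registration lives in zfs_vfsops.c, which is not shown on this
 * page).  A minimal sketch of that registration, assuming the stock
 * vn_make_ops() interface and using a hypothetical helper name:
 */
static int
zfs_make_op_tables_sketch(void)
{
	int error;

	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error == 0)
		error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
		    &zfs_fvnodeops);
	if (error == 0)
		error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
		    &zfs_symvnodeops);
	if (error == 0)
		error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
		    &zfs_sharevnodeops);
	if (error == 0)
		error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
		    &zfs_xdvnodeops);
	if (error == 0)
		error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
		    &zfs_evnodeops);
	return (error);
}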
[ 4338 lines elided ]
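Editor's note: the VOPNAME_SPACE entry in the regular-file template above is
what ultimately services space-reservation requests. On illumos, a userland
caller reaches it through fcntl(F_ALLOCSP) (or posix_fallocate(3C), which
wraps it); the VFS dispatches that request to VOP_SPACE() and thus to
zfs_space(). Below is a minimal userland sketch of such a request. The
program itself is hypothetical; it assumes only the standard illumos
fcntl(2) F_ALLOCSP interface, and whether ZFS honors the reservation depends
on the zfs_space() changes elided from this page.

#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct flock fl;
	int fd;

	if (argc != 3) {
		(void) fprintf(stderr, "usage: %s <file> <bytes>\n", argv[0]);
		return (1);
	}

	if ((fd = open(argv[1], O_RDWR | O_CREAT, 0644)) == -1) {
		perror("open");
		return (1);
	}

	/* Reserve <bytes> of space starting at offset 0 of the file. */
	(void) memset(&fl, 0, sizeof (fl));
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = (off_t)strtoll(argv[2], NULL, 10);

	/* Routed by the VFS to VOP_SPACE() -> zfs_space() on a ZFS file. */
	if (fcntl(fd, F_ALLOCSP, &fl) == -1) {
		perror("fcntl(F_ALLOCSP)");
		(void) close(fd);
		return (1);
	}

	(void) close(fd);
	return (0);
}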