illumos-gate Wdiff usr/src/uts/common/fs/nfs/nfs3_vnops.c

Print this page

3484 enhance and document tail follow support
Reviewed by: Joshua M. Clulow <jmc@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/nfs/nfs3_vnops.c
          +++ new/usr/src/uts/common/fs/nfs/nfs3_vnops.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */

↓ open down ↓

20 lines elided

↑ open up ↑

  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  28   28   *      All rights reserved.
  29   29   */
  30   30  
       31 +/*
       32 + * Copyright (c) 2013, Joyent, Inc. All rights reserved.
       33 + */
       34 +
  31   35  #include <sys/param.h>
  32   36  #include <sys/types.h>
  33   37  #include <sys/systm.h>
  34   38  #include <sys/cred.h>
  35   39  #include <sys/time.h>
  36   40  #include <sys/vnode.h>
  37   41  #include <sys/vfs.h>
  38   42  #include <sys/vfs_opreg.h>
  39   43  #include <sys/file.h>
  40   44  #include <sys/filio.h>

  41   45  #include <sys/uio.h>
  42   46  #include <sys/buf.h>
  43   47  #include <sys/mman.h>
  44   48  #include <sys/pathname.h>
  45   49  #include <sys/dirent.h>
  46   50  #include <sys/debug.h>
  47   51  #include <sys/vmsystm.h>
  48   52  #include <sys/fcntl.h>
  49   53  #include <sys/flock.h>
  50   54  #include <sys/swap.h>
  51   55  #include <sys/errno.h>
  52   56  #include <sys/strsubr.h>
  53   57  #include <sys/sysmacros.h>
  54   58  #include <sys/kmem.h>
  55   59  #include <sys/cmn_err.h>
  56   60  #include <sys/pathconf.h>
  57   61  #include <sys/utsname.h>
  58   62  #include <sys/dnlc.h>
  59   63  #include <sys/acl.h>
  60   64  #include <sys/systeminfo.h>
  61   65  #include <sys/atomic.h>
  62   66  #include <sys/policy.h>
  63   67  #include <sys/sdt.h>
  64   68  #include <sys/zone.h>
  65   69  
  66   70  #include <rpc/types.h>
  67   71  #include <rpc/auth.h>
  68   72  #include <rpc/clnt.h>
  69   73  #include <rpc/rpc_rdma.h>
  70   74  
  71   75  #include <nfs/nfs.h>
  72   76  #include <nfs/nfs_clnt.h>
  73   77  #include <nfs/rnode.h>
  74   78  #include <nfs/nfs_acl.h>
  75   79  #include <nfs/lm.h>
  76   80  
  77   81  #include <vm/hat.h>
  78   82  #include <vm/as.h>
  79   83  #include <vm/page.h>
  80   84  #include <vm/pvn.h>
  81   85  #include <vm/seg.h>
  82   86  #include <vm/seg_map.h>
  83   87  #include <vm/seg_kpm.h>
  84   88  #include <vm/seg_vn.h>
  85   89  
  86   90  #include <fs/fs_subr.h>
  87   91  
  88   92  #include <sys/ddi.h>
  89   93  
           /*
            * Forward declarations of the static helper routines used within this
            * file.  None of them is entered in nfs3_vnodeops_template; the routines
            * that are installed as vnode operations are declared separately, after
            * the ALIGN64 macro below.
            */
  90   94  static int      nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
  91   95                          cred_t *);
  92   96  static int      nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
  93   97                          stable_how *);
  94   98  static int      nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
  95   99  static int      nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
  96  100  static int      nfs3_accessx(void *, int, cred_t *);
  97  101  static int      nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
  98  102  static int      nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
  99  103  static int      nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
 100  104                          int, vnode_t **, cred_t *, int);
 101  105  static int      nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
 102  106  static int      nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
 103  107                          int, vnode_t **, cred_t *);
 104  108  static int      nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 105  109                          caller_context_t *);
 106  110  static int      do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 107  111  static void     nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
 108  112  static void     nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
 109  113  static int      nfs3_bio(struct buf *, stable_how *, cred_t *);
 110  114  static int      nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
 111  115                          page_t *[], size_t, struct seg *, caddr_t,
 112  116                          enum seg_rw, cred_t *);
 113  117  static void     nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
 114  118                          cred_t *);
 115  119  static int      nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
 116  120                          int, cred_t *);
 117  121  static int      nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
 118  122                          int, cred_t *);
 119  123  static int      nfs3_commit(vnode_t *, offset3, count3, cred_t *);
 120  124  static void     nfs3_set_mod(vnode_t *);
 121  125  static void     nfs3_get_commit(vnode_t *);
 122  126  static void     nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
 123  127  static int      nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
 124  128  static int      nfs3_commit_vp(vnode_t *, u_offset_t, size_t,  cred_t *);
 125  129  static int      nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
 126  130                          cred_t *);
 127  131  static void     nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
 128  132                          cred_t *);
 129  133  static void     nfs3_delmap_callback(struct as *, void *, uint_t);
 130  134  
 131  135  /*
 132  136   * Error flags used to pass information about certain special errors
 133  137   * which need to be handled specially.
            *
            * NOTE(review): both values are negative, presumably so that they can
            * never be confused with the positive errno values returned by the
            * routines in this file -- confirm against the users of these flags.
 134  138   */
 135  139  #define NFS_EOF                 -98
 136  140  #define NFS_VERF_MISMATCH       -97
 137  141  
 138  142  /*
            * ALIGN64 advances ptr to the next 64-bit (sizeof (uint64_t)) boundary
            * and reduces sz by the number of bytes that were skipped.
            *
            * x is scratch storage supplied by the caller.  On completion it holds
            * the number of bytes skipped, or 0 if ptr was already aligned, in which
            * case ptr and sz are left untouched.
            *
            * ptr is advanced with "ptr += x", so the skip is only a byte count when
            * ptr points to a byte-sized type.
            *
            * The expansion is an assignment followed by an if statement and is not
            * wrapped in do { } while (0), so it must not be used as the sole body
            * of an unbraced if/else/loop.  x, ptr and sz must be modifiable lvalues;
            * x and ptr are evaluated more than once.
            */
 139  143  #define ALIGN64(x, ptr, sz)                                             \
 140  144          x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);               \
 141  145          if (x) {                                                        \
 142  146                  x = sizeof (uint64_t) - (x);                            \
 143  147                  sz -= (x);                                              \
 144  148                  ptr += (x);                                             \
 145  149          }
 146  150  
 147  151  /*
 148  152   * These are the vnode ops routines which implement the vnode interface to
 149  153   * the networked file system.  These routines just take their parameters,
 150  154   * make them look networkish by putting the right info into interface structs,
 151  155   * and then calling the appropriate remote routine(s) to do the work.
 152  156   *
 153  157   * Note on directory name lookup caching:  If we detect a stale fhandle,
 154  158   * we purge the directory cache relative to that vnode.  This way, the
 155  159   * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 156  160   * more details on rnode locking.
            *
            * Every routine declared below is installed in nfs3_vnodeops_template,
            * which follows these declarations.
 157  161   */
 158  162  
 159  163  static int      nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
 160  164  static int      nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
 161  165                          caller_context_t *);
 162  166  static int      nfs3_read(vnode_t *, struct uio *, int, cred_t *,
 163  167                          caller_context_t *);
 164  168  static int      nfs3_write(vnode_t *, struct uio *, int, cred_t *,
 165  169                          caller_context_t *);
 166  170  static int      nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
 167  171                          caller_context_t *);
 168  172  static int      nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
 169  173                          caller_context_t *);
 170  174  static int      nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
 171  175                          caller_context_t *);
 172  176  static int      nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
 173  177  static int      nfs3_readlink(vnode_t *, struct uio *, cred_t *,
 174  178                          caller_context_t *);
 175  179  static int      nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
 176  180  static void     nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
 177  181  static int      nfs3_lookup(vnode_t *, char *, vnode_t **,
 178  182                          struct pathname *, int, vnode_t *, cred_t *,
 179  183                          caller_context_t *, int *, pathname_t *);
 180  184  static int      nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
 181  185                          int, vnode_t **, cred_t *, int, caller_context_t *,
 182  186                          vsecattr_t *);
 183  187  static int      nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 184  188                          int);
 185  189  static int      nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
 186  190                          caller_context_t *, int);
 187  191  static int      nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 188  192                          caller_context_t *, int);
 189  193  static int      nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
 190  194                          cred_t *, caller_context_t *, int, vsecattr_t *);
 191  195  static int      nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 192  196                          caller_context_t *, int);
 193  197  static int      nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
 194  198                          cred_t *, caller_context_t *, int);
 195  199  static int      nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
 196  200                          caller_context_t *, int);
 197  201  static int      nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
 198  202  static int      nfs3_rwlock(vnode_t *, int, caller_context_t *);
 199  203  static void     nfs3_rwunlock(vnode_t *, int, caller_context_t *);
 200  204  static int      nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
 201  205  static int      nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
 202  206                          page_t *[], size_t, struct seg *, caddr_t,
 203  207                          enum seg_rw, cred_t *, caller_context_t *);
 204  208  static int      nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
 205  209                          caller_context_t *);
 206  210  static int      nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
 207  211                          uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 208  212  static int      nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 209  213                          uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 210  214  static int      nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
 211  215                          struct flk_callback *, cred_t *, caller_context_t *);
 212  216  static int      nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
 213  217                          cred_t *, caller_context_t *);
 214  218  static int      nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
 215  219  static int      nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 216  220                          uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
 217  221  static int      nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 218  222                          caller_context_t *);
 219  223  static int      nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
 220  224                          cred_t *, caller_context_t *);
 221  225  static void     nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
 222  226                          caller_context_t *);
 223  227  static int      nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 224  228                          caller_context_t *);
 225  229  static int      nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 226  230                          caller_context_t *);
 227  231  static int      nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 228  232                          caller_context_t *);
 229  233  
           /*
            * Pointer to the vnode operations vector; handed out by
            * nfs3_getvnodeops().
            *
            * NOTE(review): the code that assigns this pointer is not in this part
            * of the file; presumably it is built from nfs3_vnodeops_template during
            * initialisation -- confirm.
            */
 230  234  struct vnodeops *nfs3_vnodeops;
 231  235  
           /*
            * Table pairing each VOPNAME_* operation name with the routine that
            * implements it.  All entries point at static nfs3_* routines from this
            * file except VOPNAME_DUMP, which uses nfs_dump, and VOPNAME_VNEVENT,
            * which uses fs_vnevent_support.  The table is terminated by a NULL,
            * NULL pair.
            */
 232  236  const fs_operation_def_t nfs3_vnodeops_template[] = {
 233  237          VOPNAME_OPEN,           { .vop_open = nfs3_open },
 234  238          VOPNAME_CLOSE,          { .vop_close = nfs3_close },
 235  239          VOPNAME_READ,           { .vop_read = nfs3_read },
 236  240          VOPNAME_WRITE,          { .vop_write = nfs3_write },
 237  241          VOPNAME_IOCTL,          { .vop_ioctl = nfs3_ioctl },
 238  242          VOPNAME_GETATTR,        { .vop_getattr = nfs3_getattr },
 239  243          VOPNAME_SETATTR,        { .vop_setattr = nfs3_setattr },
 240  244          VOPNAME_ACCESS,         { .vop_access = nfs3_access },
 241  245          VOPNAME_LOOKUP,         { .vop_lookup = nfs3_lookup },
 242  246          VOPNAME_CREATE,         { .vop_create = nfs3_create },
 243  247          VOPNAME_REMOVE,         { .vop_remove = nfs3_remove },
 244  248          VOPNAME_LINK,           { .vop_link = nfs3_link },
 245  249          VOPNAME_RENAME,         { .vop_rename = nfs3_rename },
 246  250          VOPNAME_MKDIR,          { .vop_mkdir = nfs3_mkdir },
 247  251          VOPNAME_RMDIR,          { .vop_rmdir = nfs3_rmdir },
 248  252          VOPNAME_READDIR,        { .vop_readdir = nfs3_readdir },
 249  253          VOPNAME_SYMLINK,        { .vop_symlink = nfs3_symlink },
 250  254          VOPNAME_READLINK,       { .vop_readlink = nfs3_readlink },
 251  255          VOPNAME_FSYNC,          { .vop_fsync = nfs3_fsync },
 252  256          VOPNAME_INACTIVE,       { .vop_inactive = nfs3_inactive },
 253  257          VOPNAME_FID,            { .vop_fid = nfs3_fid },
 254  258          VOPNAME_RWLOCK,         { .vop_rwlock = nfs3_rwlock },
 255  259          VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs3_rwunlock },
 256  260          VOPNAME_SEEK,           { .vop_seek = nfs3_seek },
 257  261          VOPNAME_FRLOCK,         { .vop_frlock = nfs3_frlock },
 258  262          VOPNAME_SPACE,          { .vop_space = nfs3_space },
 259  263          VOPNAME_REALVP,         { .vop_realvp = nfs3_realvp },
 260  264          VOPNAME_GETPAGE,        { .vop_getpage = nfs3_getpage },
 261  265          VOPNAME_PUTPAGE,        { .vop_putpage = nfs3_putpage },
 262  266          VOPNAME_MAP,            { .vop_map = nfs3_map },
 263  267          VOPNAME_ADDMAP,         { .vop_addmap = nfs3_addmap },
 264  268          VOPNAME_DELMAP,         { .vop_delmap = nfs3_delmap },
 265  269          /* no separate nfs3_dump */
 266  270          VOPNAME_DUMP,           { .vop_dump = nfs_dump },
 267  271          VOPNAME_PATHCONF,       { .vop_pathconf = nfs3_pathconf },
 268  272          VOPNAME_PAGEIO,         { .vop_pageio = nfs3_pageio },
 269  273          VOPNAME_DISPOSE,        { .vop_dispose = nfs3_dispose },
 270  274          VOPNAME_SETSECATTR,     { .vop_setsecattr = nfs3_setsecattr },
 271  275          VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs3_getsecattr },
 272  276          VOPNAME_SHRLOCK,        { .vop_shrlock = nfs3_shrlock },
 273  277          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 274  278          NULL,                   NULL
 275  279  };
 276  280  
 277  281  /*
            * nfs3_getvnodeops: return the current value of the nfs3_vnodeops
            * pointer.  Takes no arguments and has no side effects.
            *
 278  282   * XXX:  This is referenced in modstubs.s
 279  283   */
 280  284  struct vnodeops *
 281  285  nfs3_getvnodeops(void)
 282  286  {
 283  287          return (nfs3_vnodeops);
 284  288  }
 285  289  
           /*
            * nfs3_open: open entry point (installed as .vop_open).
            *
            * Fails with EIO when called from a zone other than the one recorded in
            * the mount (mi_zone).  Otherwise, if the rnode has no credential
            * recorded yet, a held reference to the caller's cred is stored in
            * rp->r_cred.  The routine then decides how much cache validation the
            * open requires; see the block comment in the body.
            *
            * Returns 0, or the error from nfs3_validate_caches() or
            * nfs3_getattr_otw().  The flag and ct arguments are not used.
            */
 286  290  /* ARGSUSED */
 287  291  static int
 288  292  nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 289  293  {
 290  294          int error;
 291  295          struct vattr va;
 292  296          rnode_t *rp;
 293  297          vnode_t *vp;
 294  298  
 295  299          vp = *vpp;
 296  300          if (nfs_zone() != VTOMI(vp)->mi_zone)
 297  301                  return (EIO);
 298  302          rp = VTOR(vp);
 299  303          mutex_enter(&rp->r_statelock);
 300  304          if (rp->r_cred == NULL) {
                           /* Take a hold for the rnode before storing the cred. */
 301  305                  crhold(cr);
 302  306                  rp->r_cred = cr;
 303  307          }
 304  308          mutex_exit(&rp->r_statelock);
 305  309  
 306  310          /*
 307  311           * If there is no cached data or if close-to-open
 308  312           * consistency checking is turned off, we can avoid
 309  313           * the over the wire getattr.  Otherwise, if the
 310  314           * file system is mounted readonly, then just verify
 311  315           * the caches are up to date using the normal mechanism.
 312  316           * Else, if the file is not mmap'd, then just mark
 313  317           * the attributes as timed out.  They will be refreshed
 314  318           * and the caches validated prior to being used.
 315  319           * Else, the file system is mounted writeable so
 316  320           * force an over the wire GETATTR in order to ensure
 317  321           * that all cached data is valid.
                    *
                    * Note that a vnode with more than one reference
                    * (v_count > 1) always takes the validation path,
                    * whatever the cache and MI_NOCTO state, and that the
                    * "just mark the attributes as timed out" case also
                    * requires v_count == 1.
 318  322           */
 319  323          if (vp->v_count > 1 ||
 320  324              ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
 321  325              !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
 322  326                  if (vn_is_readonly(vp))
 323  327                          error = nfs3_validate_caches(vp, cr);
 324  328                  else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
 325  329                          PURGE_ATTRCACHE(vp);
 326  330                          error = 0;
 327  331                  } else {
 328  332                          va.va_mask = AT_ALL;
 329  333                          error = nfs3_getattr_otw(vp, &va, cr);
 330  334                  }
 331  335          } else
 332  336                  error = 0;
 333  337  
 334  338          return (error);
 335  339  }
 336  340  
           /*
            * nfs3_close: close entry point (installed as .vop_close).
            *
            * In order:
            *   1. Fail with EIO if called from a zone other than the mount's zone.
            *   2. Release the calling process's locks on the file, using either
            *      the local or the network lock manager depending on MI_LLOCK.
            *   3. Return 0 immediately if count > 1.
            *   4. Purge the DNLC for this vnode if the file has been `unlinked'
            *      (r_unldvp is set).
            *   5. If the file was open for writing and has cached pages, flush
            *      them: asynchronously under MI_NOCTO (EAGAIN is ignored),
            *      otherwise synchronously with a commit.
            *   6. Unless the flush itself failed, pick up and clear any error
            *      recorded in rp->r_error.
            *   7. If RWRITEATTR is set, issue an over the wire GETATTR whose
            *      result is ignored.
            *
            * Returns the flush error if the flush failed, otherwise the value that
            * was found in rp->r_error (0 if none).
            */
 337  341  /* ARGSUSED */
 338  342  static int
 339  343  nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 340  344                  caller_context_t *ct)
 341  345  {
 342  346          rnode_t *rp;
 343  347          int error;
 344  348          struct vattr va;
 345  349  
 346  350          /*
 347  351           * zone_enter(2) prevents processes from changing zones with NFS files
 348  352           * open; if we happen to get here from the wrong zone we can't do
 349  353           * anything over the wire.
 350  354           */
 351  355          if (VTOMI(vp)->mi_zone != nfs_zone()) {
 352  356                  /*
 353  357                   * We could attempt to clean up locks, except we're sure
 354  358                   * that the current process didn't acquire any locks on
 355  359                   * the file: any attempt to lock a file belonging to another zone
 356  360                   * will fail, and one can't lock an NFS file and then change
 357  361                   * zones, as that fails too.
 358  362                   *
 359  363                   * Returning an error here is the sane thing to do.  A
 360  364                   * subsequent call to VN_RELE() which translates to a
 361  365                   * nfs3_inactive() will clean up state: if the zone of the
 362  366                   * vnode's origin is still alive and kicking, an async worker
 363  367                   * thread will handle the request (from the correct zone), and
 364  368                   * everything (minus the commit and final nfs3_getattr_otw()
 365  369                   * call) should be OK. If the zone is going away
 366  370                   * nfs_async_inactive() will throw away cached pages inline.
 367  371                   */
 368  372                  return (EIO);
 369  373          }
 370  374  
 371  375          /*
 372  376           * If we are using local locking for this filesystem, then
 373  377           * release all of the SYSV style record locks.  Otherwise,
 374  378           * we are doing network locking and we need to release all
 375  379           * of the network locks.  All of the locks held by this
 376  380           * process on this file are released no matter what the
 377  381           * incoming reference count is.
 378  382           */
 379  383          if (VTOMI(vp)->mi_flags & MI_LLOCK) {
 380  384                  cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 381  385                  cleanshares(vp, ttoproc(curthread)->p_pid);
 382  386          } else
 383  387                  nfs_lockrelease(vp, flag, offset, cr);
 384  388  
                   /* Everything below is done only when count is 1 or less. */
 385  389          if (count > 1)
 386  390                  return (0);
 387  391  
 388  392          /*
 389  393           * If the file has been `unlinked', then purge the
 390  394           * DNLC so that this vnode will get recycled quicker
 391  395           * and the .nfs* file on the server will get removed.
 392  396           */
 393  397          rp = VTOR(vp);
 394  398          if (rp->r_unldvp != NULL)
 395  399                  dnlc_purge_vp(vp);
 396  400  
 397  401          /*
 398  402           * If the file was open for write and there are pages,
 399  403           * then if the file system was mounted using the "no-close-
 400  404           *      to-open" semantics, then start an asynchronous flush
 401  405           *      of all of the pages in the file.
 402  406           * else the file system was not mounted using the "no-close-
 403  407           *      to-open" semantics, then do a synchronous flush and
 404  408           *      commit of all of the dirty and uncommitted pages.
 405  409           *
 406  410           * The asynchronous flush of the pages in the "nocto" path
 407  411           * mostly just associates a cred pointer with the rnode so
 408  412           * writes which happen later will have a better chance of
 409  413           * working.  It also starts the data being written to the
 410  414           * server, but without unnecessarily delaying the application.
 411  415           */
 412  416          if ((flag & FWRITE) && vn_has_cached_data(vp)) {
 413  417                  if (VTOMI(vp)->mi_flags & MI_NOCTO) {
 414  418                          error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC,
 415  419                              cr, ct);
 416  420                          if (error == EAGAIN)
 417  421                                  error = 0;
 418  422                  } else
 419  423                          error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
                           /*
                            * The flush succeeded: report, and reset, any error that
                            * was recorded in the rnode.  If the flush failed, its
                            * error is returned and r_error is left as it is.
                            */
 420  424                  if (!error) {
 421  425                          mutex_enter(&rp->r_statelock);
 422  426                          error = rp->r_error;
 423  427                          rp->r_error = 0;
 424  428                          mutex_exit(&rp->r_statelock);
 425  429                  }
 426  430          } else {
                           /* Nothing to flush: just report and reset r_error. */
 427  431                  mutex_enter(&rp->r_statelock);
 428  432                  error = rp->r_error;
 429  433                  rp->r_error = 0;
 430  434                  mutex_exit(&rp->r_statelock);
 431  435          }
 432  436  
 433  437          /*
 434  438           * If RWRITEATTR is set, then issue an over the wire GETATTR to
 435  439           * refresh the attribute cache with a set of attributes which
 436  440           * weren't returned from a WRITE.  This will enable the close-
 437  441           * to-open processing to work.
                    *
                    * NOTE(review): unlike nfs3_open(), va.va_mask is not set
                    * before this call -- confirm that nfs3_getattr_otw() does
                    * not read it.
 438  442           */
 439  443          if (rp->r_flags & RWRITEATTR)
 440  444                  (void) nfs3_getattr_otw(vp, &va, cr);
 441  445  
 442  446          return (error);
 443  447  }
 444  448  
           /*
            * nfs3_directio_read: satisfy a read with NFSPROC3_READ calls whose
            * results are decoded straight into the caller's uio (res.uiop), without
            * going through the page cache.  Called from nfs3_read() when caching is
            * bypassed.
            *
            * The caller must be in the mount's zone (asserted).
            *
            * The request is issued in chunks of at most mi_tsize bytes, starting at
            * uiop->uio_loffset, until uiop->uio_resid bytes have been read or the
            * server reports end of file.
            *
            * Returns 0 on success, the error from rfs3call(), the errno mapped from
            * the NFS status by geterrno3(), or EIO (after logging a warning) when
            * res.count and res.size disagree after a call.
            *
            * NOTE(review): offset and count are tracked in locals only; the uio is
            * presumably advanced by xdr_READ3uiores as the data is decoded --
            * confirm.
            */
 445  449  /* ARGSUSED */
 446  450  static int
 447  451  nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
 448  452  {
 449  453          mntinfo_t *mi;
 450  454          READ3args args;
 451  455          READ3uiores res;
 452  456          int tsize;
 453  457          offset_t offset;
 454  458          ssize_t count;
 455  459          int error;
 456  460          int douprintf;
 457  461          failinfo_t fi;
 458  462          char *sv_hostname;
 459  463  
 460  464          mi = VTOMI(vp);
 461  465          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 462  466          sv_hostname = VTOR(vp)->r_server->sv_hostname;
 463  467  
 464  468          douprintf = 1;
 465  469          args.file = *VTOFH3(vp);
                   /*
                    * failinfo_t handed to rfs3call() with every request.
                    * NOTE(review): presumably used to re-establish the file handle
                    * on failover -- confirm.
                    */
 466  470          fi.vp = vp;
 467  471          fi.fhp = (caddr_t)&args.file;
 468  472          fi.copyproc = nfs3copyfh;
 469  473          fi.lookupproc = nfs3lookup;
 470  474          fi.xattrdirproc = acl_getxattrdir3;
 471  475  
 472  476          res.uiop = uiop;
 473  477  
 474  478          res.wlist = NULL;
 475  479  
 476  480          offset = uiop->uio_loffset;
 477  481          count = uiop->uio_resid;
 478  482  
 479  483          do {
                           /* Account for this request in the mount's I/O kstats. */
 480  484                  if (mi->mi_io_kstats) {
 481  485                          mutex_enter(&mi->mi_lock);
 482  486                          kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 483  487                          mutex_exit(&mi->mi_lock);
 484  488                  }
 485  489  
                           /* Reissue the same chunk for as long as we are told to. */
 486  490                  do {
 487  491                          tsize = MIN(mi->mi_tsize, count);
 488  492                          args.offset = (offset3)offset;
 489  493                          args.count = (count3)tsize;
 490  494                          res.size = (uint_t)tsize;
 491  495                          args.res_uiop = uiop;
 492  496                          args.res_data_val_alt = NULL;
 493  497  
 494  498                          error = rfs3call(mi, NFSPROC3_READ,
 495  499                              xdr_READ3args, (caddr_t)&args,
 496  500                              xdr_READ3uiores, (caddr_t)&res, cr,
 497  501                              &douprintf, &res.status, 0, &fi);
 498  502                  } while (error == ENFS_TRYAGAIN);
 499  503  
 500  504                  if (mi->mi_io_kstats) {
 501  505                          mutex_enter(&mi->mi_lock);
 502  506                          kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 503  507                          mutex_exit(&mi->mi_lock);
 504  508                  }
 505  509  
                           /* RPC-level failure. */
 506  510                  if (error)
 507  511                          return (error);
 508  512  
                           /* NFS-level failure reported in the reply status. */
 509  513                  error = geterrno3(res.status);
 510  514                  if (error)
 511  515                          return (error);
 512  516  
 513  517                  if (res.count != res.size) {
 514  518                          zcmn_err(getzoneid(), CE_WARN,
 515  519  "nfs3_directio_read: server %s returned incorrect amount",
 516  520                              sv_hostname);
 517  521                          return (EIO);
 518  522                  }
                           /* Advance past the bytes just read and record the I/O. */
 519  523                  count -= res.count;
 520  524                  offset += res.count;
 521  525                  if (mi->mi_io_kstats) {
 522  526                          mutex_enter(&mi->mi_lock);
 523  527                          KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
 524  528                          KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
 525  529                          mutex_exit(&mi->mi_lock);
 526  530                  }
 527  531                  lwp_stat_update(LWP_STAT_INBLK, 1);
 528  532          } while (count && !res.eof);
 529  533  
 530  534          return (0);
 531  535  }
 532  536  
           /*
            * nfs3_read: read entry point (installed as .vop_read).
            *
            * The caller must already hold rp->r_rwlock as a reader (asserted).
            *
            * Early exits:
            *   EIO     - called from a zone other than the mount's zone
            *   EISDIR  - vp is not a regular file (any type other than VREG)
            *   0       - nothing was requested (uio_resid == 0)
            *   EINVAL  - the starting offset, or offset + resid, is negative
            *
            * When caching is bypassed (see the comment in the body) the request is
            * handed to nfs3_directio_read().  Otherwise the data is copied out one
            * MAXBSIZE-aligned window at a time, until the request is satisfied,
            * the cached file size (r_size) is reached, or an error occurs.
            *
            * Returns 0 or an errno.  EINTR is returned if cv_wait_sig() returns 0
            * while waiting for RINCACHEPURGE to clear.  The ioflag and ct arguments
            * are not used.
            */
 533  537  /* ARGSUSED */
 534  538  static int
 535  539  nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 536  540          caller_context_t *ct)
 537  541  {
 538  542          rnode_t *rp;
 539  543          u_offset_t off;
 540  544          offset_t diff;
 541  545          int on;
 542  546          size_t n;
 543  547          caddr_t base;
 544  548          uint_t flags;
 545  549          int error = 0;
 546  550          mntinfo_t *mi;
 547  551  
 548  552          rp = VTOR(vp);
 549  553          mi = VTOMI(vp);
 550  554  
 551  555          ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
 552  556  
 553  557          if (nfs_zone() != mi->mi_zone)
 554  558                  return (EIO);
 555  559  
 556  560          if (vp->v_type != VREG)
 557  561                  return (EISDIR);
 558  562  
 559  563          if (uiop->uio_resid == 0)
 560  564                  return (0);
 561  565  
                   /*
                    * NOTE(review): the second test detects overflow by expecting
                    * the sum to come out negative, which relies on signed
                    * wraparound -- confirm this is safe with the compiler flags in
                    * use.
                    */
 562  566          if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
 563  567                  return (EINVAL);
 564  568  
 565  569          /*
 566  570           * Bypass VM if caching has been disabled (e.g., locking) or if
 567  571           * using client-side direct I/O and the file is not mmap'd and
 568  572           * there are no cached pages.
 569  573           */
 570  574          if ((vp->v_flag & VNOCACHE) ||
 571  575              (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 572  576              rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 573  577              !vn_has_cached_data(vp))) {
 574  578                  return (nfs3_directio_read(vp, uiop, cr));
 575  579          }
 576  580  
 577  581          do {
 578  582                  off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 579  583                  on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
                           /* Take no more than what is left of this window. */
 580  584                  n = MIN(MAXBSIZE - on, uiop->uio_resid);
 581  585  
 582  586                  error = nfs3_validate_caches(vp, cr);
 583  587                  if (error)
 584  588                          break;
 585  589  
 586  590                  mutex_enter(&rp->r_statelock);
                           /* Wait for RINCACHEPURGE to clear before using r_size. */
 587  591                  while (rp->r_flags & RINCACHEPURGE) {
 588  592                          if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 589  593                                  mutex_exit(&rp->r_statelock);
 590  594                                  return (EINTR);
 591  595                          }
 592  596                  }
 593  597                  diff = rp->r_size - uiop->uio_loffset;
 594  598                  mutex_exit(&rp->r_statelock);
                           /* At or past r_size: stop; error is still 0 here. */
 595  599                  if (diff <= 0)
 596  600                          break;
                           /* diff is positive here; clamp n to the bytes before r_size. */
 597  601                  if (diff < n)
 598  602                          n = (size_t)diff;
 599  603  
 600  604                  if (vpm_enable) {
 601  605                          /*
 602  606                           * Copy data.
 603  607                           */
 604  608                          error = vpm_data_copy(vp, off + on, n, uiop,
 605  609                              1, NULL, 0, S_READ);
 606  610                  } else {
 607  611                          base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
 608  612                              S_READ);
 609  613  
 610  614                          error = uiomove(base + on, n, UIO_READ, uiop);
 611  615                  }
 612  616  
 613  617                  if (!error) {
 614  618                          /*
 615  619                           * If read a whole block or read to eof,
 616  620                           * won't need this buffer again soon.
 617  621                           */
 618  622                          mutex_enter(&rp->r_statelock);
 619  623                          if (n + on == MAXBSIZE ||
 620  624                              uiop->uio_loffset == rp->r_size)
 621  625                                  flags = SM_DONTNEED;
 622  626                          else
 623  627                                  flags = 0;
 624  628                          mutex_exit(&rp->r_statelock);
 625  629                          if (vpm_enable) {
 626  630                                  error = vpm_sync_pages(vp, off, n, flags);
 627  631                          } else {
 628  632                                  error = segmap_release(segkmap, base, flags);
 629  633                          }
 630  634                  } else {
                                   /*
                                    * The copy failed: still release the window, but
                                    * ignore the release status so that the copy
                                    * error is the one returned.
                                    */
 631  635                          if (vpm_enable) {
 632  636                                  (void) vpm_sync_pages(vp, off, n, 0);
 633  637                          } else {
 634  638                                  (void) segmap_release(segkmap, base, 0);
 635  639                          }
 636  640                  }
 637  641          } while (!error && uiop->uio_resid > 0);
 638  642  
 639  643          return (error);
 640  644  }
 641  645  
 642  646  /* ARGSUSED */
           /*
            * Write entry point for regular files.
            *
            * Early exits: EISDIR when vp is not VREG, EIO when the calling zone
            * is not the mount's zone, and 0 when there is nothing to write.
            *
            * With FAPPEND the starting offset is replaced by the size reported
            * by nfs3getattr(); if r_rwlock is held as reader it is dropped and
            * retaken as writer first.
            *
            * The request is then clipped to the file size limit taken from
            * uio_llimit.  Bytes beyond the limit are remembered in "remainder"
            * and added back to uio_resid on the way out; a request that starts
            * at or past the limit fails with EFBIG after rctl_action() is run.
            *
            * Data is moved while holding r_lkserlock as reader, by one of two
            * paths:
            *  - direct: when VNOCACHE is set, or direct I/O is enabled and the
            *    file has no mappings or cached pages, data is staged through a
            *    kmem buffer and sent with nfs3write();
            *  - cached: otherwise data is copied through vpm or segmap in
            *    chunks of at most MAXBSIZE using writerp().
            *
            * On error uio_resid and uio_loffset are rewound to the values saved
            * at the top of the iteration that failed.
            *
            * NOTE(review): remainder and resid are int while they are assigned
            * from uio_resid/offset arithmetic; confirm the values cannot
            * exceed INT_MAX on this path.
            */
 643  647  static int
 644  648  nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 645  649          caller_context_t *ct)
 646  650  {
 647  651          rlim64_t limit = uiop->uio_llimit;
 648  652          rnode_t *rp;
 649  653          u_offset_t off;
 650  654          caddr_t base;
 651  655          uint_t flags;
 652  656          int remainder;
 653  657          size_t n;
 654  658          int on;
 655  659          int error;
 656  660          int resid;
 657  661          offset_t offset;
 658  662          mntinfo_t *mi;
 659  663          uint_t bsize;
 660  664  
 661  665          rp = VTOR(vp);
 662  666  
 663  667          if (vp->v_type != VREG)
 664  668                  return (EISDIR);
 665  669  
 666  670          mi = VTOMI(vp);
 667  671          if (nfs_zone() != mi->mi_zone)
 668  672                  return (EIO);
 669  673          if (uiop->uio_resid == 0)
 670  674                  return (0);
 671  675  
 672  676          if (ioflag & FAPPEND) {
 673  677                  struct vattr va;
 674  678  
 675  679                  /*
 676  680                   * Must serialize if appending.
 677  681                   */
 678  682                  if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 679  683                          nfs_rw_exit(&rp->r_rwlock);
 680  684                          if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 681  685                              INTR(vp)))
 682  686                                  return (EINTR);
 683  687                  }
 684  688  
                   /*
                    * Append at the size reported by nfs3getattr().
                    */
 685  689                  va.va_mask = AT_SIZE;
 686  690                  error = nfs3getattr(vp, &va, cr);
 687  691                  if (error)
 688  692                          return (error);
 689  693                  uiop->uio_loffset = va.va_size;
 690  694          }
 691  695  
           /*
            * "offset" is the end of the requested range at this point; it is
            * reused further down to remember the starting offset of each
            * chunk.  A negative value here means the sum wrapped.
            */
 692  696          offset = uiop->uio_loffset + uiop->uio_resid;
 693  697  
 694  698          if (uiop->uio_loffset < 0 || offset < 0)
 695  699                  return (EINVAL);
 696  700  
 697  701          if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 698  702                  limit = MAXOFFSET_T;
 699  703  
 700  704          /*
 701  705           * Check to make sure that the process will not exceed
 702  706           * its limit on file size.  It is okay to write up to
 703  707           * the limit, but not beyond.  Thus, the write which
 704  708           * reaches the limit will be short and the next write
 705  709           * will return an error.
 706  710           */
 707  711          remainder = 0;
 708  712          if (offset > limit) {
 709  713                  remainder = offset - limit;
 710  714                  uiop->uio_resid = limit - uiop->uio_loffset;
 711  715                  if (uiop->uio_resid <= 0) {
 712  716                          proc_t *p = ttoproc(curthread);
 713  717  
                           /*
                            * Nothing can be written: restore the caller's
                            * original resid before failing.
                            */
 714  718                          uiop->uio_resid += remainder;
 715  719                          mutex_enter(&p->p_lock);
 716  720                          (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 717  721                              p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 718  722                          mutex_exit(&p->p_lock);
 719  723                          return (EFBIG);
 720  724                  }
 721  725          }
 722  726  
 723  727          if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 724  728                  return (EINTR);
 725  729  
 726  730          /*
 727  731           * Bypass VM if caching has been disabled (e.g., locking) or if
 728  732           * using client-side direct I/O and the file is not mmap'd and
 729  733           * there are no cached pages.
 730  734           */
 731  735          if ((vp->v_flag & VNOCACHE) ||
 732  736              (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 733  737              rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 734  738              !vn_has_cached_data(vp))) {
 735  739                  size_t bufsize;
 736  740                  int count;
 737  741                  u_offset_t org_offset;
 738  742                  stable_how stab_comm;
 739  743  
                   /*
                    * Direct write path.  This label is also the target of the
                    * goto in the cached path below, taken when writerp() fails
                    * with EACCES.
                    */
 740  744  nfs3_fwrite:
 741  745                  if (rp->r_flags & RSTALE) {
 742  746                          resid = uiop->uio_resid;
 743  747                          offset = uiop->uio_loffset;
 744  748                          error = rp->r_error;
 745  749                          /*
 746  750                           * A close may have cleared r_error, if so,
 747  751                           * propagate ESTALE error return properly
 748  752                           */
 749  753                          if (error == 0)
 750  754                                  error = ESTALE;
 751  755                          goto bottom;
 752  756                  }
                   /*
                    * Stage the data through a kernel buffer no larger than
                    * mi_stsize and push each bufferful with nfs3write().
                    */
 753  757                  bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
 754  758                  base = kmem_alloc(bufsize, KM_SLEEP);
 755  759                  do {
 756  760                          if (ioflag & FDSYNC)
 757  761                                  stab_comm = DATA_SYNC;
 758  762                          else
 759  763                                  stab_comm = FILE_SYNC;
                           /*
                            * Save resid/offset so the exit code can rewind
                            * the uio if this chunk fails.
                            */
 760  764                          resid = uiop->uio_resid;
 761  765                          offset = uiop->uio_loffset;
 762  766                          count = MIN(uiop->uio_resid, bufsize);
 763  767                          org_offset = uiop->uio_loffset;
 764  768                          error = uiomove(base, count, UIO_WRITE, uiop);
 765  769                          if (!error) {
 766  770                                  error = nfs3write(vp, base, org_offset,
 767  771                                      count, cr, &stab_comm);
 768  772                          }
 769  773                  } while (!error && uiop->uio_resid > 0);
 770  774                  kmem_free(base, bufsize);
 771  775                  goto bottom;
 772  776          }
 773  777  
 774  778  
 775  779          bsize = vp->v_vfsp->vfs_bsize;
 776  780  
 777  781          do {
 778  782                  off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 779  783                  on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 780  784                  n = MIN(MAXBSIZE - on, uiop->uio_resid);
 781  785  
                   /*
                    * Save resid/offset so the exit code can rewind the uio
                    * if this chunk fails.
                    */
 782  786                  resid = uiop->uio_resid;
 783  787                  offset = uiop->uio_loffset;
 784  788  
 785  789                  if (rp->r_flags & RSTALE) {
 786  790                          error = rp->r_error;
 787  791                          /*
 788  792                           * A close may have cleared r_error, if so,
 789  793                           * propagate ESTALE error return properly
 790  794                           */
 791  795                          if (error == 0)
 792  796                                  error = ESTALE;
 793  797                          break;
 794  798                  }
 795  799  
 796  800                  /*
 797  801                   * Don't create dirty pages faster than they
 798  802                   * can be cleaned so that the system doesn't
 799  803                   * get imbalanced.  If the async queue is
 800  804                   * maxed out, then wait for it to drain before
 801  805                   * creating more dirty pages.  Also, wait for
 802  806                   * any threads doing pagewalks in the vop_getattr
 803  807                   * entry points so that they don't block for
 804  808                   * long periods.
 805  809                   */
 806  810                  mutex_enter(&rp->r_statelock);
 807  811                  while ((mi->mi_max_threads != 0 &&
 808  812                      rp->r_awcount > 2 * mi->mi_max_threads) ||
 809  813                      rp->r_gcount > 0) {
 810  814                          if (INTR(vp)) {
 811  815                                  klwp_t *lwp = ttolwp(curthread);
 812  816  
                                   /*
                                    * lwp_nostop is raised around the
                                    * interruptible wait and lowered again
                                    * on both the signalled and the normal
                                    * wakeup path.
                                    */
 813  817                                  if (lwp != NULL)
 814  818                                          lwp->lwp_nostop++;
 815  819                                  if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 816  820                                          mutex_exit(&rp->r_statelock);
 817  821                                          if (lwp != NULL)
 818  822                                                  lwp->lwp_nostop--;
 819  823                                          error = EINTR;
 820  824                                          goto bottom;
 821  825                                  }
 822  826                                  if (lwp != NULL)
 823  827                                          lwp->lwp_nostop--;
 824  828                          } else
 825  829                                  cv_wait(&rp->r_cv, &rp->r_statelock);
 826  830                  }
 827  831                  mutex_exit(&rp->r_statelock);
 828  832  
 829  833                  /*
 830  834                   * Touch the page and fault it in if it is not in core
 831  835                   * before segmap_getmapflt or vpm_data_copy can lock it.
 832  836                   * This is to avoid the deadlock if the buffer is mapped
 833  837                   * to the same file through mmap which we want to write.
 834  838                   */
 835  839                  uio_prefaultpages((long)n, uiop);
 836  840  
 837  841                  if (vpm_enable) {
 838  842                          /*
 839  843                           * It will use kpm mappings, so no need to
 840  844                           * pass an address.
 841  845                           */
 842  846                          error = writerp(rp, NULL, n, uiop, 0);
 843  847                  } else  {
 844  848                          if (segmap_kpm) {
 845  849                                  int pon = uiop->uio_loffset & PAGEOFFSET;
 846  850                                  size_t pn = MIN(PAGESIZE - pon,
 847  851                                      uiop->uio_resid);
 848  852                                  int pagecreate;
 849  853  
                                   /*
                                    * pagecreate is set when the write
                                    * starts on a page boundary and either
                                    * fills the page or ends at or beyond
                                    * r_size.  Its inverse is handed to
                                    * segmap_getmapflt() and the flag itself
                                    * to writerp().
                                    */
 850  854                                  mutex_enter(&rp->r_statelock);
 851  855                                  pagecreate = (pon == 0) && (pn == PAGESIZE ||
 852  856                                      uiop->uio_loffset + pn >= rp->r_size);
 853  857                                  mutex_exit(&rp->r_statelock);
 854  858  
 855  859                                  base = segmap_getmapflt(segkmap, vp, off + on,
 856  860                                      pn, !pagecreate, S_WRITE);
 857  861  
 858  862                                  error = writerp(rp, base + pon, n, uiop,
 859  863                                      pagecreate);
 860  864  
 861  865                          } else {
 862  866                                  base = segmap_getmapflt(segkmap, vp, off + on,
 863  867                                      n, 0, S_READ);
 864  868                                  error = writerp(rp, base + on, n, uiop, 0);
 865  869                          }
 866  870                  }
 867  871  
 868  872                  if (!error) {
                           /*
                            * Choose the release flags: write through
                            * immediately for MI_NOAC mounts, start an async
                            * write when a block boundary was reached (or for
                            * swap vnodes), otherwise leave the pages dirty.
                            */
 869  873                          if (mi->mi_flags & MI_NOAC)
 870  874                                  flags = SM_WRITE;
 871  875                          else if ((uiop->uio_loffset % bsize) == 0 ||
 872  876                              IS_SWAPVP(vp)) {
 873  877                                  /*
 874  878                                   * Have written a whole block.
 875  879                                   * Start an asynchronous write
 876  880                                   * and mark the buffer to
 877  881                                   * indicate that it won't be
 878  882                                   * needed again soon.
 879  883                                   */
 880  884                                  flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 881  885                          } else
 882  886                                  flags = 0;
                           /*
                            * Synchronous requests and files flagged
                            * ROUTOFSPACE are always written without SM_ASYNC.
                            */
 883  887                          if ((ioflag & (FSYNC|FDSYNC)) ||
 884  888                              (rp->r_flags & ROUTOFSPACE)) {
 885  889                                  flags &= ~SM_ASYNC;
 886  890                                  flags |= SM_WRITE;
 887  891                          }
 888  892                          if (vpm_enable) {
 889  893                                  error = vpm_sync_pages(vp, off, n, flags);
 890  894                          } else {
 891  895                                  error = segmap_release(segkmap, base, flags);
 892  896                          }
 893  897                  } else {
 894  898                          if (vpm_enable) {
 895  899                                  (void) vpm_sync_pages(vp, off, n, 0);
 896  900                          } else {
 897  901                                  (void) segmap_release(segkmap, base, 0);
 898  902                          }
 899  903                          /*
 900  904                           * In the event that we got an access error while
 901  905                           * faulting in a page for a write-only file just
 902  906                           * force a write.
 903  907                           */
 904  908                          if (error == EACCES)
 905  909                                  goto nfs3_fwrite;
 906  910                  }
 907  911          } while (!error && uiop->uio_resid > 0);
 908  912  
 909  913  bottom:
           /*
            * Common exit for both paths; r_lkserlock is still held here.
            * On failure rewind the uio to the start of the failed chunk.  In
            * both cases give back the bytes that were clipped off by the
            * file size limit.
            */
 910  914          if (error) {
 911  915                  uiop->uio_resid = resid + remainder;
 912  916                  uiop->uio_loffset = offset;
 913  917          } else
 914  918                  uiop->uio_resid += remainder;
 915  919  
 916  920          nfs_rw_exit(&rp->r_lkserlock);
 917  921  
 918  922          return (error);
 919  923  }
 920  924  
 921  925  /*
            * Perform I/O on the page list pp for len bytes at file offset off.
            * A buf is built with pageio_setup(), mapped with bp_mapin(), handed
            * to nfs3_bio() and then torn down again.  The return value is the
            * result of nfs3_bio().
            *
            * The caller must be running in the zone of the mount (asserted).
            *
 922  926   * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 923  927   */
 924  928  static int
 925  929  nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
 926  930          int flags, cred_t *cr)
 927  931  {
 928  932          struct buf *bp;
 929  933          int error;
 930  934          page_t *savepp;
 931  935          uchar_t fsdata;
 932  936          stable_how stab_comm;
 933  937  
 934  938          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 935  939          bp = pageio_setup(pp, len, vp, flags);
 936  940          ASSERT(bp != NULL);
 937  941  
 938  942          /*
 939  943           * pageio_setup should have set b_addr to 0.  This
 940  944           * is correct since we want to do I/O on a page
 941  945           * boundary.  bp_mapin will use this addr to calculate
 942  946           * an offset, and then set b_addr to the kernel virtual
 943  947           * address it allocated for us.
 944  948           */
 945  949          ASSERT(bp->b_un.b_addr == 0);
 946  950  
 947  951          bp->b_edev = 0;
 948  952          bp->b_dev = 0;
 949  953          bp->b_lblkno = lbtodb(off);
 950  954          bp->b_file = vp;
 951  955          bp->b_offset = (offset_t)off;
 952  956          bp_mapin(bp);
 953  957  
 954  958          /*
 955  959           * Calculate the desired level of stability to write data
 956  960           * on the server and then mark all of the pages to reflect
 957  961           * this.
           *
           * Only an asynchronous write issued while freemem is above
           * desfree asks for UNSTABLE (commit later); everything else
           * uses FILE_SYNC and needs no commit.
 958  962           */
 959  963          if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
 960  964              freemem > desfree) {
 961  965                  stab_comm = UNSTABLE;
 962  966                  fsdata = C_DELAYCOMMIT;
 963  967          } else {
 964  968                  stab_comm = FILE_SYNC;
 965  969                  fsdata = C_NOCOMMIT;
 966  970          }
 967  971  
           /*
            * Walk the circular p_next list once.  The loop stops when pp is
            * back at savepp, so pp is valid for the second walk below.
            */
 968  972          savepp = pp;
 969  973          do {
 970  974                  pp->p_fsdata = fsdata;
 971  975          } while ((pp = pp->p_next) != savepp);
 972  976  
           /* stab_comm is passed by reference and may be changed by the call. */
 973  977          error = nfs3_bio(bp, &stab_comm, cr);
 974  978  
 975  979          bp_mapout(bp);
 976  980          pageio_done(bp);
 977  981  
 978  982          /*
 979  983           * If the server wrote pages in a more stable fashion than
 980  984           * was requested, then clear all of the marks in the pages
 981  985           * indicating that COMMIT operations were required.
 982  986           */
 983  987          if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
 984  988                  do {
 985  989                          pp->p_fsdata = C_NOCOMMIT;
 986  990                  } while ((pp = pp->p_next) != savepp);
 987  991          }
 988  992  
 989  993          return (error);
 990  994  }
 991  995  
 992  996  /*
 993  997   * Write to file.  Writes to remote server in largest size
 994  998   * chunks that the server can handle.  Write is synchronous.
            *
            * Sends count bytes starting at base to file offset "offset" using
            * one or more NFSPROC3_WRITE calls.
            *
            * stab_comm is in/out: on entry it holds the stability level to
            * request; it is then reset to FILE_SYNC and changed to UNSTABLE if
            * any reply reports the data as UNSTABLE.
            *
            * Returns 0 on success, the error from rfs3call() or the errno
            * mapped from the reply status, or EIO when the server claims to
            * have written more than requested or answers UNSTABLE to a
            * DATA_SYNC/FILE_SYNC request.
 995  999   */
 996 1000  static int
 997 1001  nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
 998 1002          stable_how *stab_comm)
 999 1003  {
1000 1004          mntinfo_t *mi;
1001 1005          WRITE3args args;
1002 1006          WRITE3res res;
1003 1007          int error;
1004 1008          int tsize;
1005 1009          rnode_t *rp;
1006 1010          int douprintf;
1007 1011  
1008 1012          rp = VTOR(vp);
1009 1013          mi = VTOMI(vp);
1010 1014  
1011 1015          ASSERT(nfs_zone() == mi->mi_zone);
1012 1016  
1013 1017          args.file = *VTOFH3(vp);
1014 1018          args.stable = *stab_comm;
1015 1019  
1016 1020          *stab_comm = FILE_SYNC;
1017 1021  
1018 1022          douprintf = 1;
1019 1023  
1020 1024          do {
                   /*
                    * Uncached and direct I/O writes are sized by mi_stsize,
                    * all others by mi_curwrite.
                    */
1021 1025                  if ((vp->v_flag & VNOCACHE) ||
1022 1026                      (rp->r_flags & RDIRECTIO) ||
1023 1027                      (mi->mi_flags & MI_DIRECTIO))
1024 1028                          tsize = MIN(mi->mi_stsize, count);
1025 1029                  else
1026 1030                          tsize = MIN(mi->mi_curwrite, count);
1027 1031                  args.offset = (offset3)offset;
1028 1032                  args.count = (count3)tsize;
1029 1033                  args.data.data_len = (uint_t)tsize;
1030 1034                  args.data.data_val = base;
1031 1035  
1032 1036                  if (mi->mi_io_kstats) {
1033 1037                          mutex_enter(&mi->mi_lock);
1034 1038                          kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1035 1039                          mutex_exit(&mi->mi_lock);
1036 1040                  }
1037 1041                  args.mblk = NULL;
                   /* Reissue the call for as long as it asks to be retried. */
1038 1042                  do {
1039 1043                          error = rfs3call(mi, NFSPROC3_WRITE,
1040 1044                              xdr_WRITE3args, (caddr_t)&args,
1041 1045                              xdr_WRITE3res, (caddr_t)&res, cr,
1042 1046                              &douprintf, &res.status, 0, NULL);
1043 1047                  } while (error == ENFS_TRYAGAIN);
1044 1048                  if (mi->mi_io_kstats) {
1045 1049                          mutex_enter(&mi->mi_lock);
1046 1050                          kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1047 1051                          mutex_exit(&mi->mi_lock);
1048 1052                  }
1049 1053  
1050 1054                  if (error)
1051 1055                          return (error);
1052 1056                  error = geterrno3(res.status);
1053 1057                  if (!error) {
1054 1058                          if (res.resok.count > args.count) {
1055 1059                                  zcmn_err(getzoneid(), CE_WARN,
1056 1060                                      "nfs3write: server %s wrote %u, "
1057 1061                                      "requested was %u",
1058 1062                                      rp->r_server->sv_hostname,
1059 1063                                      res.resok.count, args.count);
1060 1064                                  return (EIO);
1061 1065                          }
1062 1066                          if (res.resok.committed == UNSTABLE) {
1063 1067                                  *stab_comm = UNSTABLE;
1064 1068                                  if (args.stable == DATA_SYNC ||
1065 1069                                      args.stable == FILE_SYNC) {
1066 1070                                          zcmn_err(getzoneid(), CE_WARN,
1067 1071                          "nfs3write: server %s did not commit to stable storage",
1068 1072                                              rp->r_server->sv_hostname);
1069 1073                                          return (EIO);
1070 1074                                  }
1071 1075                          }
                           /*
                            * Advance by what the server reports it wrote,
                            * which may be less than what was sent.
                            */
1072 1076                          tsize = (int)res.resok.count;
1073 1077                          count -= tsize;
1074 1078                          base += tsize;
1075 1079                          offset += tsize;
1076 1080                          if (mi->mi_io_kstats) {
1077 1081                                  mutex_enter(&mi->mi_lock);
1078 1082                                  KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1079 1083                                  KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1080 1084                                      tsize;
1081 1085                                  mutex_exit(&mi->mi_lock);
1082 1086                          }
1083 1087                          lwp_stat_update(LWP_STAT_OUBLK, 1);
                           /*
                            * Track the write verifier from the reply: record
                            * it the first time, and on a change call
                            * nfs3_set_mod() and remember the new value.
                            */
1084 1088                          mutex_enter(&rp->r_statelock);
1085 1089                          if (rp->r_flags & RHAVEVERF) {
1086 1090                                  if (rp->r_verf != res.resok.verf) {
1087 1091                                          nfs3_set_mod(vp);
1088 1092                                          rp->r_verf = res.resok.verf;
1089 1093                                          /*
1090 1094                                           * If the data was written UNSTABLE,
1091 1095                                           * then might as well stop because
1092 1096                                           * the whole block will have to get
1093 1097                                           * rewritten anyway.
                                           *
                                           * error is 0 here, so the
                                           * function returns 0 even if
                                           * count is still non-zero.
1094 1098                                           */
1095 1099                                          if (*stab_comm == UNSTABLE) {
1096 1100                                                  mutex_exit(&rp->r_statelock);
1097 1101                                                  break;
1098 1102                                          }
1099 1103                                  }
1100 1104                          } else {
1101 1105                                  rp->r_verf = res.resok.verf;
1102 1106                                  rp->r_flags |= RHAVEVERF;
1103 1107                          }
1104 1108                          /*
1105 1109                           * Mark the attribute cache as timed out and
1106 1110                           * set RWRITEATTR to indicate that the file
1107 1111                           * was modified with a WRITE operation and
1108 1112                           * that the attributes can not be trusted.
1109 1113                           */
1110 1114                          PURGE_ATTRCACHE_LOCKED(rp);
1111 1115                          rp->r_flags |= RWRITEATTR;
1112 1116                          mutex_exit(&rp->r_statelock);
1113 1117                  }
1114 1118          } while (!error && count);
1115 1119  
1116 1120          return (error);
1117 1121  }
1118 1122  
1119 1123  /*
1120 1124   * Read from a file.  Reads data in largest chunks our interface can handle.
            *
            * Reads up to count bytes at file offset "offset" into base using
            * one or more NFSPROC3_READ calls, stopping when count is exhausted
            * or the server reports end of file.
            *
            * *residp is set to count on entry and updated to the number of
            * bytes still unread after every successful call.
            *
            * Returns 0 on success, the error from rfs3call() or the errno
            * mapped from the reply status, or EIO when the reply's count does
            * not match the length of the returned data.
1121 1125   */
1122 1126  static int
1123 1127  nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1124 1128          size_t *residp, cred_t *cr)
1125 1129  {
1126 1130          mntinfo_t *mi;
1127 1131          READ3args args;
1128 1132          READ3vres res;
1129 1133          int tsize;
1130 1134          int error;
1131 1135          int douprintf;
1132 1136          failinfo_t fi;
1133 1137          rnode_t *rp;
1134 1138          struct vattr va;
1135 1139          hrtime_t t;
1136 1140  
1137 1141          rp = VTOR(vp);
1138 1142          mi = VTOMI(vp);
1139 1143          ASSERT(nfs_zone() == mi->mi_zone);
1140 1144          douprintf = 1;
1141 1145  
           /* fi is handed to rfs3call() together with the file handle. */
1142 1146          args.file = *VTOFH3(vp);
1143 1147          fi.vp = vp;
1144 1148          fi.fhp = (caddr_t)&args.file;
1145 1149          fi.copyproc = nfs3copyfh;
1146 1150          fi.lookupproc = nfs3lookup;
1147 1151          fi.xattrdirproc = acl_getxattrdir3;
1148 1152  
           /* Attributes returned with the reply are decoded into va. */
1149 1153          res.pov.fres.vp = vp;
1150 1154          res.pov.fres.vap = &va;
1151 1155  
1152 1156          res.wlist = NULL;
1153 1157          *residp = count;
1154 1158          do {
1155 1159                  if (mi->mi_io_kstats) {
1156 1160                          mutex_enter(&mi->mi_lock);
1157 1161                          kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1158 1162                          mutex_exit(&mi->mi_lock);
1159 1163                  }
1160 1164  
1161 1165                  do {
                           /*
                            * Uncached and direct I/O reads are sized by
                            * mi_tsize, all others by mi_curread.
                            */
1162 1166                          if ((vp->v_flag & VNOCACHE) ||
1163 1167                              (rp->r_flags & RDIRECTIO) ||
1164 1168                              (mi->mi_flags & MI_DIRECTIO))
1165 1169                                  tsize = MIN(mi->mi_tsize, count);
1166 1170                          else
1167 1171                                  tsize = MIN(mi->mi_curread, count);
1168 1172                          res.data.data_val = base;
1169 1173                          res.data.data_len = tsize;
1170 1174                          args.offset = (offset3)offset;
1171 1175                          args.count = (count3)tsize;
1172 1176                          args.res_uiop = NULL;
1173 1177                          args.res_data_val_alt = base;
1174 1178  
                           /*
                            * t is sampled just before every call; after the
                            * loop it holds the time of the last call issued.
                            */
1175 1179                          t = gethrtime();
1176 1180                          error = rfs3call(mi, NFSPROC3_READ,
1177 1181                              xdr_READ3args, (caddr_t)&args,
1178 1182                              xdr_READ3vres, (caddr_t)&res, cr,
1179 1183                              &douprintf, &res.status, 0, &fi);
1180 1184                  } while (error == ENFS_TRYAGAIN);
1181 1185  
1182 1186                  if (mi->mi_io_kstats) {
1183 1187                          mutex_enter(&mi->mi_lock);
1184 1188                          kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1185 1189                          mutex_exit(&mi->mi_lock);
1186 1190                  }
1187 1191  
1188 1192                  if (error)
1189 1193                          return (error);
1190 1194  
1191 1195                  error = geterrno3(res.status);
1192 1196                  if (error)
1193 1197                          return (error);
1194 1198  
1195 1199                  if (res.count != res.data.data_len) {
1196 1200                          zcmn_err(getzoneid(), CE_WARN,
1197 1201                              "nfs3read: server %s returned incorrect amount",
1198 1202                              rp->r_server->sv_hostname);
1199 1203                          return (EIO);
1200 1204                  }
1201 1205  
                   /* Advance past the bytes the server returned. */
1202 1206                  count -= res.count;
1203 1207                  *residp = count;
1204 1208                  base += res.count;
1205 1209                  offset += res.count;
1206 1210                  if (mi->mi_io_kstats) {
1207 1211                          mutex_enter(&mi->mi_lock);
1208 1212                          KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1209 1213                          KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1210 1214                          mutex_exit(&mi->mi_lock);
1211 1215                  }
1212 1216                  lwp_stat_update(LWP_STAT_INBLK, 1);
1213 1217          } while (count && !res.eof);
1214 1218  
           /*
            * If the last reply carried attributes, either purge the attribute
            * cache (when CACHE_VALID() fails for the returned mtime and size)
            * or cache them, the latter only when r_mtime is not later than
            * the time the last call was issued.  r_statelock is dropped
            * before PURGE_ATTRCACHE() but held across nfs_attrcache_va().
            */
1215 1219          if (res.pov.attributes) {
1216 1220                  mutex_enter(&rp->r_statelock);
1217 1221                  if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1218 1222                          mutex_exit(&rp->r_statelock);
1219 1223                          PURGE_ATTRCACHE(vp);
1220 1224                  } else {
1221 1225                          if (rp->r_mtime <= t)
1222 1226                                  nfs_attrcache_va(vp, &va);
1223 1227                          mutex_exit(&rp->r_statelock);
1224 1228                  }
1225 1229          }
1226 1230  
1227 1231          return (0);
1228 1232  }
1229 1233  
1230 1234  /* ARGSUSED */
           /*
            * ioctl entry point.  Fails with EIO when called from a zone other
            * than the mount's.  The only command handled is _FIODIRECTIO, which
            * is forwarded to nfs_directio() with arg narrowed to int; every
            * other command returns ENOTTY.
            */
1231 1235  static int
1232 1236  nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1233 1237          caller_context_t *ct)
1234 1238  {
1235 1239  
1236 1240          if (nfs_zone() != VTOMI(vp)->mi_zone)
1237 1241                  return (EIO);
1238 1242          switch (cmd) {
1239 1243                  case _FIODIRECTIO:
1240 1244                          return (nfs_directio(vp, (int)arg, cr));
1241 1245                  default:
1242 1246                          return (ENOTTY);
1243 1247          }
1244 1248  }
1245 1249  
           /*
            * Return the attributes selected by vap->va_mask for vp.
            *
            * Fails with EIO when the caller's zone is not the mount's zone.
            * With ATTR_HINT set and a mask made up only of AT_SIZE, AT_FSID
            * and AT_RDEV, the requested fields are copied from the rnode under
            * r_statelock and 0 is returned without calling nfs3getattr().
            * Otherwise, if AT_MTIME is requested and the vnode has cached data
            * that is dirty or has asynchronous writes outstanding, the pages
            * are pushed with nfs3_putpage() first, and the result is whatever
            * nfs3getattr() returns.
            */
1246 1250  /* ARGSUSED */
1247 1251  static int
1248 1252  nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1249 1253          caller_context_t *ct)
1250 1254  {
1251 1255          int error;
1252 1256          rnode_t *rp;
1253 1257  
1254 1258          if (nfs_zone() != VTOMI(vp)->mi_zone)
1255 1259                  return (EIO);
1256 1260          /*
1257 1261           * If it has been specified that the return value will
1258 1262           * just be used as a hint, and we are only being asked
1259 1263           * for size, fsid or rdevid, then return the client's
1260 1264           * notion of these values without checking to make sure
1261 1265           * that the attribute cache is up to date.
1262 1266           * The whole point is to avoid an over the wire GETATTR
1263 1267           * call.
1264 1268           */
1265 1269          rp = VTOR(vp);
1266 1270          if (flags & ATTR_HINT) {
1267 1271                  if (vap->va_mask ==
1268 1272                      (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1269 1273                          mutex_enter(&rp->r_statelock);
                                   /*
                                    * Fill in only the fields that were asked
                                    * for.  These tests must be bitwise AND; an
                                    * OR with a non-zero flag is always true.
                                    */
1270 1274                          if (vap->va_mask & AT_SIZE)
1271 1275                                  vap->va_size = rp->r_size;
1272 1276                          if (vap->va_mask & AT_FSID)
1273 1277                                  vap->va_fsid = rp->r_attr.va_fsid;
1274 1278                          if (vap->va_mask & AT_RDEV)
1275 1279                                  vap->va_rdev = rp->r_attr.va_rdev;
1276 1280                          mutex_exit(&rp->r_statelock);
1277 1281                          return (0);
1278 1282                  }
1279 1283          }
1280 1284  
1281 1285          /*
1282 1286           * Only need to flush pages if asking for the mtime
1283 1287           * and if there are any dirty pages or any outstanding
1284 1288           * asynchronous (write) requests for this file.
1285 1289           */
1286 1290          if (vap->va_mask & AT_MTIME) {
1287 1291                  if (vn_has_cached_data(vp) &&
1288 1292                      ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
                                   /*
                                    * r_gcount is raised around the flush and
                                    * waiters on r_cv are woken when it drops
                                    * back to zero.
                                    */
1289 1293                          mutex_enter(&rp->r_statelock);
1290 1294                          rp->r_gcount++;
1291 1295                          mutex_exit(&rp->r_statelock);
1292 1296                          error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1293 1297                          mutex_enter(&rp->r_statelock);
                                   /*
                                    * Only ENOSPC and EDQUOT are remembered, and
                                    * only if no earlier error is recorded.  The
                                    * flush result is not returned to the caller.
                                    */
1294 1298                          if (error && (error == ENOSPC || error == EDQUOT)) {
1295 1299                                  if (!rp->r_error)
1296 1300                                          rp->r_error = error;
1297 1301                          }
1298 1302                          if (--rp->r_gcount == 0)
1299 1303                                  cv_broadcast(&rp->r_cv);
1300 1304                          mutex_exit(&rp->r_statelock);
1301 1305                  }
1302 1306          }
1303 1307  
1304 1308          return (nfs3getattr(vp, vap, cr));
1305 1309  }
1306 1310  
           /*
            * Set the attributes selected by vap->va_mask on vp, after a
            * permission check.
            *
            * Returns EINVAL if the mask contains any AT_NOSET bit and EIO if
            * the caller's zone is not the mount's zone.  The current uid and
            * mode are fetched with nfs3getattr() and passed, together with
            * nfs3_accessx as the access callback, to secpolicy_vnode_setattr().
            * Only when that succeeds is the request handed to nfs3setattr(),
            * whose result is returned.
            */
1307 1311  /*ARGSUSED4*/
1308 1312  static int
1309 1313  nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1310 1314                  caller_context_t *ct)
1311 1315  {
1312 1316          int error;
1313 1317          struct vattr va;
1314 1318  
1315 1319          if (vap->va_mask & AT_NOSET)
1316 1320                  return (EINVAL);
1317 1321          if (nfs_zone() != VTOMI(vp)->mi_zone)
1318 1322                  return (EIO);
1319 1323  
                   /* Current owner and mode, needed by the policy check below. */
1320 1324          va.va_mask = AT_UID | AT_MODE;
1321 1325          error = nfs3getattr(vp, &va, cr);
1322 1326          if (error)
1323 1327                  return (error);
1324 1328  
1325 1329          error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1326 1330              vp);
1327 1331          if (error)
1328 1332                  return (error);
1329 1333  
1330 1334          return (nfs3setattr(vp, vap, flags, cr));
1331 1335  }
1332 1336  
           /*
            * Issue an NFSPROC3_SETATTR call for vp using the attributes in vap.
            *
            * Cached dirty pages are pushed first.  When AT_SIZE is being set
            * the request carries a guard built from the ctime returned by
            * nfs3getattr(); if the server answers NFS3ERR_NOT_SYNC the ctime is
            * fetched again and the call is retried.  Returns 0 on success, or
            * an error from vattr_to_sattr3(), nfs3getattr(), rfs3call() or
            * geterrno3().  The caller must be in the mount's zone (asserted).
            */
1333 1337  static int
1334 1338  nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1335 1339  {
1336 1340          int error;
1337 1341          uint_t mask;
1338 1342          SETATTR3args args;
1339 1343          SETATTR3res res;
1340 1344          int douprintf;
1341 1345          rnode_t *rp;
1342 1346          struct vattr va;
1343 1347          mode_t omode;
1344 1348          vsecattr_t *vsp;
1345 1349          hrtime_t t;
1346 1350  
1347 1351          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1348 1352          mask = vap->va_mask;
1349 1353  
1350 1354          rp = VTOR(vp);
1351 1355  
1352 1356          /*
1353 1357           * Only need to flush pages if there are any pages and
1354 1358           * if the file is marked as dirty in some fashion.  The
1355 1359           * file must be flushed so that we can accurately
1356 1360           * determine the size of the file and the cached data
1357 1361           * after the SETATTR returns.  A file is considered to
1358 1362           * be dirty if it is either marked with RDIRTY, has
1359 1363           * outstanding i/o's active, or is mmap'd.  In this
1360 1364           * last case, we can't tell whether there are dirty
1361 1365           * pages, so we flush just to be sure.
1362 1366           */
1363 1367          if (vn_has_cached_data(vp) &&
1364 1368              ((rp->r_flags & RDIRTY) ||
1365 1369              rp->r_count > 0 ||
1366 1370              rp->r_mapcnt > 0)) {
1367 1371                  ASSERT(vp->v_type != VCHR);
1368 1372                  error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
                           /*
                            * Only ENOSPC and EDQUOT are remembered in r_error,
                            * and only if no error is recorded yet.  The flush
                            * result does not stop the SETATTR; error is
                            * overwritten below.
                            */
1369 1373                  if (error && (error == ENOSPC || error == EDQUOT)) {
1370 1374                          mutex_enter(&rp->r_statelock);
1371 1375                          if (!rp->r_error)
1372 1376                                  rp->r_error = error;
1373 1377                          mutex_exit(&rp->r_statelock);
1374 1378                  }
1375 1379          }
1376 1380  
1377 1381          args.object = *RTOFH3(rp);
1378 1382          /*
1379 1383           * If the intent is for the server to set the times,
1380 1384           * there is no point in having the mask indicating set mtime or
1381 1385           * atime, because the vap values may be junk, and so result
1382 1386           * in an overflow error. Remove these flags from the vap mask
1383 1387           * before calling in this case, and restore them afterwards.
1384 1388           */
1385 1389          if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
1386 1390                  /* Use server times, so don't set the args time fields */
1387 1391                  vap->va_mask &= ~(AT_ATIME | AT_MTIME);
1388 1392                  error = vattr_to_sattr3(vap, &args.new_attributes);
1389 1393                  vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
1390 1394                  if (mask & AT_ATIME) {
1391 1395                          args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1392 1396                  }
1393 1397                  if (mask & AT_MTIME) {
1394 1398                          args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1395 1399                  }
1396 1400          } else {
1397 1401                  /* Either do not set times or use the client specified times */
1398 1402                  error = vattr_to_sattr3(vap, &args.new_attributes);
1399 1403          }
1400 1404  
1401 1405          if (error) {
1402 1406                  /* req time field(s) overflow - return immediately */
1403 1407                  return (error);
1404 1408          }
1405 1409  
                   /*
                    * Fetch the current mode (saved in omode for the setuid/setgid
                    * fix-up further down) and the ctime used as the guard value.
                    */
1406 1410          va.va_mask = AT_MODE | AT_CTIME;
1407 1411          error = nfs3getattr(vp, &va, cr);
1408 1412          if (error)
1409 1413                  return (error);
1410 1414          omode = va.va_mode;
1411 1415  
1412 1416  tryagain:
                   /* The ctime guard is only requested when the size is changing. */
1413 1417          if (mask & AT_SIZE) {
1414 1418                  args.guard.check = TRUE;
1415 1419                  args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1416 1420                  args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1417 1421          } else
1418 1422                  args.guard.check = FALSE;
1419 1423  
1420 1424          douprintf = 1;
1421 1425  
                   /*
                    * Timestamp taken before the call; it is passed along with the
                    * reply's wcc data to nfs3_cache_wcc_data() below.
                    */
1422 1426          t = gethrtime();
1423 1427  
1424 1428          error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1425 1429              xdr_SETATTR3args, (caddr_t)&args,
1426 1430              xdr_SETATTR3res, (caddr_t)&res, cr,
1427 1431              &douprintf, &res.status, 0, NULL);
1428 1432  
1429 1433          /*
1430 1434           * Purge the access cache and ACL cache if changing either the
1431 1435           * owner of the file, the group owner, or the mode.  These may
1432 1436           * change the access permissions of the file, so purge old
1433 1437           * information and start over again.
1434 1438           */
1435 1439          if (mask & (AT_UID | AT_GID | AT_MODE)) {
1436 1440                  (void) nfs_access_purge_rp(rp);
                           /*
                            * r_secattr is tested without the lock, then detached
                            * under r_statelock.  vsp is checked again because the
                            * field may have been cleared in between.
                            */
1437 1441                  if (rp->r_secattr != NULL) {
1438 1442                          mutex_enter(&rp->r_statelock);
1439 1443                          vsp = rp->r_secattr;
1440 1444                          rp->r_secattr = NULL;
1441 1445                          mutex_exit(&rp->r_statelock);
1442 1446                          if (vsp != NULL)
1443 1447                                  nfs_acl_free(vsp);
1444 1448                  }
1445 1449          }
1446 1450  
                   /* The call itself failed: drop cached attributes and give up. */
1447 1451          if (error) {
1448 1452                  PURGE_ATTRCACHE(vp);
1449 1453                  return (error);
1450 1454          }
1451 1455  
1452 1456          error = geterrno3(res.status);
1453 1457          if (!error) {
1454 1458                  /*
1455 1459                   * If changing the size of the file, invalidate
1456 1460                   * any local cached data which is no longer part
1457 1461                   * of the file.  We also possibly invalidate the
1458 1462                   * last page in the file.  We could use
1459 1463                   * pvn_vpzero(), but this would mark the page as
1460 1464                   * modified and require it to be written back to
1461 1465                   * the server for no particularly good reason.
1462 1466                   * This way, if we access it, then we bring it
1463 1467                   * back in.  A read should be cheaper than a
1464 1468                   * write.
1465 1469                   */
1466 1470                  if (mask & AT_SIZE) {
1467 1471                          nfs_invalidate_pages(vp,
1468 1472                              (vap->va_size & PAGEMASK), cr);
1469 1473                  }
1470 1474                  nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1471 1475                  /*
1472 1476                   * Some servers will change the mode to clear the setuid
1473 1477                   * and setgid bits when changing the uid or gid.  The
1474 1478                   * client needs to compensate appropriately.
1475 1479                   */
1476 1480                  if (mask & (AT_UID | AT_GID)) {
1477 1481                          int terror;
1478 1482  
1479 1483                          va.va_mask = AT_MODE;
1480 1484                          terror = nfs3getattr(vp, &va, cr);
                                   /*
                                    * If the mode now differs from what it should
                                    * be (the requested mode, or the old mode when
                                    * none was requested), set it back with a
                                    * second SETATTR.  This is best effort: flags
                                    * are 0 and the result is ignored.
                                    */
1481 1485                          if (!terror &&
1482 1486                              (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
1483 1487                              (!(mask & AT_MODE) && va.va_mode != omode))) {
1484 1488                                  va.va_mask = AT_MODE;
1485 1489                                  if (mask & AT_MODE)
1486 1490                                          va.va_mode = vap->va_mode;
1487 1491                                  else
1488 1492                                          va.va_mode = omode;
1489 1493                                  (void) nfs3setattr(vp, &va, 0, cr);
1490 1494                          }
1491 1495                  }
1492 1496          } else {
1493 1497                  nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1494 1498                  /*
1495 1499                   * If we got back a "not synchronized" error, then
1496 1500                   * we need to retry with a new guard value.  The
1497 1501                   * guard value used is the change time.  If the
1498 1502                   * server returned post_op_attr, then we can just
1499 1503                   * retry because we have the latest attributes.
1500 1504                   * Otherwise, we issue a GETATTR to get the latest
1501 1505                   * attributes and then retry.  If we couldn't get
1502 1506                   * the attributes this way either, then we give
1503 1507                   * up because we can't complete the operation as
1504 1508                   * required.
1505 1509                   */
1506 1510                  if (res.status == NFS3ERR_NOT_SYNC) {
1507 1511                          va.va_mask = AT_CTIME;
1508 1512                          if (nfs3getattr(vp, &va, cr) == 0)
1509 1513                                  goto tryagain;
1510 1514                  }
1511 1515                  PURGE_STALE_FH(error, vp, cr);
1512 1516          }
1513 1517  
1514 1518          return (error);
1515 1519  }
1516 1520  
           /*
            * Access-check callback taking the vnode as a void pointer; it is the
            * function nfs3_setattr() passes to secpolicy_vnode_setattr().
            * Asserts that the caller is in the mount's zone and forwards to
            * nfs3_access() with flags 0 and no caller context.
            */
1517 1521  static int
1518 1522  nfs3_accessx(void *vp, int mode, cred_t *cr)
1519 1523  {
1520 1524          ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1521 1525          return (nfs3_access(vp, mode, 0, cr, NULL));
1522 1526  }
1523 1527  
           /*
            * Check whether cr may access vp in the ways given by mode (VREAD,
            * VWRITE, VEXEC).
            *
            * Returns 0 when every requested right is granted, EACCES when it is
            * not, EROFS for a write check on a read-only vnode that is not a
            * device, EIO when the caller's zone is not the mount's zone, or an
            * error from rfs3call() or geterrno3().  The rnode's access cache is
            * consulted first when one exists; otherwise an NFSPROC3_ACCESS call
            * is made and its answer is cached.  If the check fails for cr and
            * crnetadjust() returned an alternate credential, the check is
            * repeated once with that credential.
            */
1524 1528  /* ARGSUSED */
1525 1529  static int
1526 1530  nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1527 1531  {
1528 1532          int error;
1529 1533          ACCESS3args args;
1530 1534          ACCESS3res res;
1531 1535          int douprintf;
1532 1536          uint32 acc;
1533 1537          rnode_t *rp;
1534 1538          cred_t *cred, *ncr, *ncrfree = NULL;
1535 1539          failinfo_t fi;
1536 1540          nfs_access_type_t cacc;
1537 1541          hrtime_t t;
1538 1542  
                   /* acc collects the ACCESS3_* bits that correspond to mode. */
1539 1543          acc = 0;
1540 1544          if (nfs_zone() != VTOMI(vp)->mi_zone)
1541 1545                  return (EIO);
1542 1546          if (mode & VREAD)
1543 1547                  acc |= ACCESS3_READ;
1544 1548          if (mode & VWRITE) {
1545 1549                  if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1546 1550                          return (EROFS);
1547 1551                  if (vp->v_type == VDIR)
1548 1552                          acc |= ACCESS3_DELETE;
1549 1553                  acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1550 1554          }
1551 1555          if (mode & VEXEC) {
1552 1556                  if (vp->v_type == VDIR)
1553 1557                          acc |= ACCESS3_LOOKUP;
1554 1558                  else
1555 1559                          acc |= ACCESS3_EXECUTE;
1556 1560          }
1557 1561  
1558 1562          rp = VTOR(vp);
1559 1563          args.object = *VTOFH3(vp);
                   /*
                    * The request always asks for the full set of rights that apply
                    * to this vnode type, not just acc; the whole answer is handed
                    * to nfs_access_cache() below.
                    */
1560 1564          if (vp->v_type == VDIR) {
1561 1565                  args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1562 1566                      ACCESS3_EXTEND | ACCESS3_LOOKUP;
1563 1567          } else {
1564 1568                  args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1565 1569                      ACCESS3_EXECUTE;
1566 1570          }
1567 1571          fi.vp = vp;
1568 1572          fi.fhp = (caddr_t)&args.object;
1569 1573          fi.copyproc = nfs3copyfh;
1570 1574          fi.lookupproc = nfs3lookup;
1571 1575          fi.xattrdirproc = acl_getxattrdir3;
1572 1576  
1573 1577          cred = cr;
1574 1578          /*
1575 1579           * ncr and ncrfree both initially
1576 1580           * point to the memory area returned
1577 1581           * by crnetadjust();
1578 1582           * ncrfree not NULL when exiting means
1579 1583           * that we need to release it
1580 1584           */
1581 1585          ncr = crnetadjust(cred);
1582 1586          ncrfree = ncr;
1583 1587  tryagain:
                   /*
                    * Cached answer, if any.  A result that is neither ALLOWED nor
                    * DENIED falls through to the over-the-wire call.
                    */
1584 1588          if (rp->r_acache != NULL) {
1585 1589                  cacc = nfs_access_check(rp, acc, cred);
1586 1590                  if (cacc == NFS_ACCESS_ALLOWED) {
1587 1591                          if (ncrfree != NULL)
1588 1592                                  crfree(ncrfree);
1589 1593                          return (0);
1590 1594                  }
1591 1595                  if (cacc == NFS_ACCESS_DENIED) {
1592 1596                          /*
1593 1597                           * If the cred can be adjusted, try again
1594 1598                           * with the new cred.
1595 1599                           */
1596 1600                          if (ncr != NULL) {
1597 1601                                  cred = ncr;
                                           /* Clearing ncr limits this to one retry. */
1598 1602                                  ncr = NULL;
1599 1603                                  goto tryagain;
1600 1604                          }
1601 1605                          if (ncrfree != NULL)
1602 1606                                  crfree(ncrfree);
1603 1607                          return (EACCES);
1604 1608                  }
1605 1609          }
1606 1610  
1607 1611          douprintf = 1;
1608 1612  
1609 1613          t = gethrtime();
1610 1614  
1611 1615          error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1612 1616              xdr_ACCESS3args, (caddr_t)&args,
1613 1617              xdr_ACCESS3res, (caddr_t)&res, cred,
1614 1618              &douprintf, &res.status, 0, &fi);
1615 1619  
1616 1620          if (error) {
1617 1621                  if (ncrfree != NULL)
1618 1622                          crfree(ncrfree);
1619 1623                  return (error);
1620 1624          }
1621 1625  
1622 1626          error = geterrno3(res.status);
1623 1627          if (!error) {
1624 1628                  nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1625 1629                  nfs_access_cache(rp, args.access, res.resok.access, cred);
1626 1630                  /*
1627 1631                   * we just cached results with cred; if cred is the
1628 1632                   * adjusted credentials from crnetadjust, we do not want
1629 1633                   * to release them before exiting: hence setting ncrfree
1630 1634                   * to NULL
1631 1635                   */
1632 1636                  if (cred != cr)
1633 1637                          ncrfree = NULL;
                           /* Granted only if every bit in acc came back set. */
1634 1638                  if ((acc & res.resok.access) != acc) {
1635 1639                          /*
1636 1640                           * If the cred can be adjusted, try again
1637 1641                           * with the new cred.
1638 1642                           */
1639 1643                          if (ncr != NULL) {
1640 1644                                  cred = ncr;
1641 1645                                  ncr = NULL;
1642 1646                                  goto tryagain;
1643 1647                          }
1644 1648                          error = EACCES;
1645 1649                  }
1646 1650          } else {
1647 1651                  nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1648 1652                  PURGE_STALE_FH(error, vp, cr);
1649 1653          }
1650 1654  
1651 1655          if (ncrfree != NULL)
1652 1656                  crfree(ncrfree);
1653 1657  
1654 1658          return (error);
1655 1659  }
1656 1660  
           /*
            * When non-zero, nfs3_readlink() answers from, and fills in, the
            * symlink contents kept in the rnode (r_symlink).
            */
1657 1661  static int nfs3_do_symlink_cache = 1;
1658 1662  
           /*
            * Copy the target of the symbolic link vp into uiop.
            *
            * Returns EINVAL if vp is not a VLNK, EIO if the caller's zone is
            * not the mount's zone, and otherwise 0 or an error from
            * nfs3_validate_caches(), rfs3call(), geterrno3() or uiomove().  A
            * server error of ENXIO is reported as EINVAL.  With
            * nfs3_do_symlink_cache set, cached contents are returned once
            * nfs3_validate_caches() succeeds, and a link read from the server
            * is stored in the rnode if nothing is cached yet.
            */
1659 1663  /* ARGSUSED */
1660 1664  static int
1661 1665  nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1662 1666  {
1663 1667          int error;
1664 1668          READLINK3args args;
1665 1669          READLINK3res res;
1666 1670          nfspath3 resdata_backup;
1667 1671          rnode_t *rp;
1668 1672          int douprintf;
1669 1673          int len;
1670 1674          failinfo_t fi;
1671 1675          hrtime_t t;
1672 1676  
1673 1677          /*
1674 1678           * Can't readlink anything other than a symbolic link.
1675 1679           */
1676 1680          if (vp->v_type != VLNK)
1677 1681                  return (EINVAL);
1678 1682          if (nfs_zone() != VTOMI(vp)->mi_zone)
1679 1683                  return (EIO);
1680 1684  
1681 1685          rp = VTOR(vp);
                   /*
                    * The cached contents are tested without the lock first, then
                    * again under r_statelock before they are copied out.
                    */
1682 1686          if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1683 1687                  error = nfs3_validate_caches(vp, cr);
1684 1688                  if (error)
1685 1689                          return (error);
1686 1690                  mutex_enter(&rp->r_statelock);
1687 1691                  if (rp->r_symlink.contents != NULL) {
1688 1692                          error = uiomove(rp->r_symlink.contents,
1689 1693                              rp->r_symlink.len, UIO_READ, uiop);
1690 1694                          mutex_exit(&rp->r_statelock);
1691 1695                          return (error);
1692 1696                  }
1693 1697                  mutex_exit(&rp->r_statelock);
1694 1698          }
1695 1699  
1696 1700          args.symlink = *VTOFH3(vp);
1697 1701          fi.vp = vp;
1698 1702          fi.fhp = (caddr_t)&args.symlink;
1699 1703          fi.copyproc = nfs3copyfh;
1700 1704          fi.lookupproc = nfs3lookup;
1701 1705          fi.xattrdirproc = acl_getxattrdir3;
1702 1706  
1703 1707          res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1704 1708  
                   /*
                    * Remember the buffer: res.resok.data can come back pointing at
                    * nfs3nametoolong instead, and the buffer must still be freed.
                    */
1705 1709          resdata_backup = res.resok.data;
1706 1710  
1707 1711          douprintf = 1;
1708 1712  
1709 1713          t = gethrtime();
1710 1714  
1711 1715          error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1712 1716              xdr_READLINK3args, (caddr_t)&args,
1713 1717              xdr_READLINK3res, (caddr_t)&res, cr,
1714 1718              &douprintf, &res.status, 0, &fi);
1715 1719  
                   /*
                    * NOTE(review): presumably the XDR decode substitutes
                    * nfs3nametoolong when the returned path does not fit --
                    * confirm against the xdr routine.
                    */
1716 1720          if (res.resok.data == nfs3nametoolong)
1717 1721                  error = EINVAL;
1718 1722  
1719 1723          if (error) {
1720 1724                  kmem_free(resdata_backup, MAXPATHLEN);
1721 1725                  return (error);
1722 1726          }
1723 1727  
1724 1728          error = geterrno3(res.status);
1725 1729          if (!error) {
1726 1730                  nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1727 1731                      cr);
1728 1732                  len = strlen(res.resok.data);
1729 1733                  error = uiomove(res.resok.data, len, UIO_READ, uiop);
                           /*
                            * Either the buffer is handed to the rnode as the new
                            * cached contents (and not freed here), or it is freed
                            * because caching is off or another thread cached a
                            * copy first.
                            */
1730 1734                  if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1731 1735                          mutex_enter(&rp->r_statelock);
1732 1736                                  if (rp->r_symlink.contents == NULL) {
1733 1737                                  rp->r_symlink.contents = res.resok.data;
1734 1738                                  rp->r_symlink.len = len;
1735 1739                                  rp->r_symlink.size = MAXPATHLEN;
1736 1740                                  mutex_exit(&rp->r_statelock);
1737 1741                          } else {
1738 1742                                  mutex_exit(&rp->r_statelock);
1739 1743  
1740 1744                                  kmem_free((void *)res.resok.data, MAXPATHLEN);
1741 1745                          }
1742 1746                  } else {
1743 1747                          kmem_free((void *)res.resok.data, MAXPATHLEN);
1744 1748                  }
1745 1749          } else {
1746 1750                  nfs3_cache_post_op_attr(vp,
1747 1751                      &res.resfail.symlink_attributes, t, cr);
1748 1752                  PURGE_STALE_FH(error, vp, cr);
1749 1753  
1750 1754                  kmem_free((void *)res.resok.data, MAXPATHLEN);
1751 1755  
1752 1756          }
1753 1757  
1754 1758          /*
1755 1759           * The over the wire error for attempting to readlink something
1756 1760           * other than a symbolic link is ENXIO.  However, we need to
1757 1761           * return EINVAL instead of ENXIO, so we map it here.
1758 1762           */
1759 1763          return (error == ENXIO ? EINVAL : error);
1760 1764  }
1761 1765  
1762 1766  /*
1763 1767   * Flush local dirty pages to stable storage on the server.
1764 1768   *
1765 1769   * If FNODSYNC is specified, then there is nothing to do because
1766 1770   * metadata changes are not cached on the client before being
1767 1771   * sent to the server.
            *
            * Swap vnodes (IS_SWAPVP) likewise return 0 without doing anything.
            * Otherwise returns EIO when the caller's zone is not the mount's
            * zone, the error from nfs3_putpage_commit() if it fails, and
            * failing that the error currently recorded in the rnode's r_error
            * (0 if none).
1768 1772   */
1769 1773  /* ARGSUSED */
1770 1774  static int
1771 1775  nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1772 1776  {
1773 1777          int error;
1774 1778  
1775 1779          if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1776 1780                  return (0);
1777 1781          if (nfs_zone() != VTOMI(vp)->mi_zone)
1778 1782                  return (EIO);
1779 1783  
1780 1784          error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
1781 1785          if (!error)
1782 1786                  error = VTOR(vp)->r_error;
1783 1787          return (error);
1784 1788  }
1785 1789  
1786 1790  /*
1787 1791   * Weirdness: if the file was removed or the target of a rename
1788 1792   * operation while it was open, it got renamed instead.  Here we
1789 1793   * remove the renamed file.
            *
            * Once no pending remove is left (r_unldvp is NULL) the rnode is
            * passed to rp_addfree().  A call from the wrong zone is handed to
            * nfs_async_inactive() instead.
1790 1794   */
1791 1795  /* ARGSUSED */
1792 1796  static void
1793 1797  nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1794 1798  {
1795 1799          rnode_t *rp;
1796 1800  
1797 1801          ASSERT(vp != DNLC_NO_VNODE);
1798 1802  
1799 1803          /*
1800 1804           * If this is coming from the wrong zone, we let someone in the right
1801 1805           * zone take care of it asynchronously.  We can get here due to
1802 1806           * VN_RELE() being called from pageout() or fsflush().  This call may
1803 1807           * potentially turn into an expensive no-op if, for instance, v_count
1804 1808           * gets incremented in the meantime, but it's still correct.
1805 1809           */
1806 1810          if (nfs_zone() != VTOMI(vp)->mi_zone) {
1807 1811                  nfs_async_inactive(vp, cr, nfs3_inactive);
1808 1812                  return;
1809 1813          }
1810 1814  
1811 1815          rp = VTOR(vp);
1812 1816  redo:
1813 1817          if (rp->r_unldvp != NULL) {
1814 1818                  /*
1815 1819                   * Save the vnode pointer for the directory where the
1816 1820                   * unlinked-open file got renamed, then set it to NULL
1817 1821                   * to prevent another thread from getting here before
1818 1822                   * we're done with the remove.  While we have the
1819 1823                   * statelock, make local copies of the pertinent rnode
1820 1824                   * fields.  If we weren't to do this in an atomic way, the
1821 1825                   * unl* fields could become inconsistent with respect
1822 1826                   * to each other due to a race condition between this
1823 1827                   * code and nfs_remove().  See bug report 1034328.
1824 1828                   */
1825 1829                  mutex_enter(&rp->r_statelock);
1826 1830                  if (rp->r_unldvp != NULL) {
1827 1831                          vnode_t *unldvp;
1828 1832                          char *unlname;
1829 1833                          cred_t *unlcred;
1830 1834                          REMOVE3args args;
1831 1835                          REMOVE3res res;
1832 1836                          int douprintf;
1833 1837                          int error;
1834 1838                          hrtime_t t;
1835 1839  
1836 1840                          unldvp = rp->r_unldvp;
1837 1841                          rp->r_unldvp = NULL;
1838 1842                          unlname = rp->r_unlname;
1839 1843                          rp->r_unlname = NULL;
1840 1844                          unlcred = rp->r_unlcred;
1841 1845                          rp->r_unlcred = NULL;
1842 1846                          mutex_exit(&rp->r_statelock);
1843 1847  
1844 1848                          /*
1845 1849                           * If there are any dirty pages left, then flush
1846 1850                           * them.  This is unfortunate because they just
1847 1851                           * may get thrown away during the remove operation,
1848 1852                           * but we have to do this for correctness.
1849 1853                           */
1850 1854                          if (vn_has_cached_data(vp) &&
1851 1855                              ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1852 1856                                  ASSERT(vp->v_type != VCHR);
1853 1857                                  error = nfs3_putpage(vp, (offset_t)0, 0, 0,
1854 1858                                      cr, ct);
                                           /*
                                            * Any flush error is remembered here
                                            * (not just ENOSPC/EDQUOT), unless
                                            * one is already recorded.
                                            */
1855 1859                                  if (error) {
1856 1860                                          mutex_enter(&rp->r_statelock);
1857 1861                                          if (!rp->r_error)
1858 1862                                                  rp->r_error = error;
1859 1863                                          mutex_exit(&rp->r_statelock);
1860 1864                                  }
1861 1865                          }
1862 1866  
1863 1867                          /*
1864 1868                           * Do the remove operation on the renamed file
1865 1869                           */
1866 1870                          setdiropargs3(&args.object, unlname, unldvp);
1867 1871  
1868 1872                          douprintf = 1;
1869 1873  
1870 1874                          t = gethrtime();
1871 1875  
                                   /* The remove is issued with the saved credential. */
1872 1876                          error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1873 1877                              xdr_diropargs3, (caddr_t)&args,
1874 1878                              xdr_REMOVE3res, (caddr_t)&res, unlcred,
1875 1879                              &douprintf, &res.status, 0, NULL);
1876 1880  
                                   /*
                                    * The outcome only drives cache maintenance
                                    * for the directory; nothing is reported to
                                    * the caller.
                                    */
1877 1881                          if (error) {
1878 1882                                  PURGE_ATTRCACHE(unldvp);
1879 1883                          } else {
1880 1884                                  error = geterrno3(res.status);
1881 1885                                  if (!error) {
1882 1886                                          nfs3_cache_wcc_data(unldvp,
1883 1887                                              &res.resok.dir_wcc, t, cr);
1884 1888                                          if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1885 1889                                                  nfs_purge_rddir_cache(unldvp);
1886 1890                                  } else {
1887 1891                                          nfs3_cache_wcc_data(unldvp,
1888 1892                                              &res.resfail.dir_wcc, t, cr);
1889 1893                                          PURGE_STALE_FH(error, unldvp, cr);
1890 1894                                  }
1891 1895                          }
1892 1896  
1893 1897                          /*
1894 1898                           * Release stuff held for the remove
1895 1899                           */
1896 1900                          VN_RELE(unldvp);
1897 1901                          kmem_free(unlname, MAXNAMELEN);
1898 1902                          crfree(unlcred);
                                   /* Check again in case r_unldvp was set meanwhile. */
1899 1903                          goto redo;
1900 1904                  }
1901 1905                  mutex_exit(&rp->r_statelock);
1902 1906          }
1903 1907  
1904 1908          rp_addfree(rp, cr);
1905 1909  }
1906 1910  
1907 1911  /*
1908 1912   * Remote file system operations having to do with directory manipulation.
1909 1913   */
1910 1914  
           /*
            * Look up the name nm in directory dvp and return the result in *vpp.
            *
            * Returns EPERM when the caller's zone is not the mount's zone, EINTR
            * when taking the directory's r_rwlock is interrupted, and otherwise
            * the result of nfs3lookup().  With LOOKUP_XATTR set, dvp is first
            * replaced by its hidden extended attribute directory; this fails
            * with EINVAL when the mount lacks MI_EXTATTR.  A device vnode found
            * by the lookup is replaced by the vnode returned from specvp().
            */
1911 1915  /* ARGSUSED */
1912 1916  static int
1913 1917  nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1914 1918          int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1915 1919          int *direntflags, pathname_t *realpnp)
1916 1920  {
1917 1921          int error;
1918 1922          vnode_t *vp;
1919 1923          vnode_t *avp = NULL;
1920 1924          rnode_t *drp;
1921 1925  
1922 1926          if (nfs_zone() != VTOMI(dvp)->mi_zone)
1923 1927                  return (EPERM);
1924 1928  
1925 1929          drp = VTOR(dvp);
1926 1930  
1927 1931          /*
1928 1932           * Are we looking up extended attributes?  If so, "dvp" is
1929 1933           * the file or directory for which we want attributes, and
1930 1934           * we need a lookup of the hidden attribute directory
1931 1935           * before we lookup the rest of the path.
1932 1936           */
1933 1937          if (flags & LOOKUP_XATTR) {
1934 1938                  bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1935 1939                  mntinfo_t *mi;
1936 1940  
1937 1941                  mi = VTOMI(dvp);
1938 1942                  if (!(mi->mi_flags & MI_EXTATTR))
1939 1943                          return (EINVAL);
1940 1944  
1941 1945                  if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1942 1946                          return (EINTR);
1943 1947  
                           /*
                            * Try nfs3lookup_dnlc() first; if it leaves avp NULL,
                            * fall back to acl_getxattrdir3().
                            */
1944 1948                  (void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1945 1949                  if (avp == NULL)
1946 1950                          error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1947 1951                  else
1948 1952                          error = 0;
1949 1953  
1950 1954                  nfs_rw_exit(&drp->r_rwlock);
1951 1955  
                           /*
                            * NOTE(review): MI_EXTATTR is tested again here, so it
                            * can presumably be cleared while the lookup above
                            * runs; if it was, the error is reported as EINVAL.
                            */
1952 1956                  if (error) {
1953 1957                          if (mi->mi_flags & MI_EXTATTR)
1954 1958                                  return (error);
1955 1959                          return (EINVAL);
1956 1960                  }
                           /* Continue the lookup inside the attribute directory. */
1957 1961                  dvp = avp;
1958 1962                  drp = VTOR(dvp);
1959 1963          }
1960 1964  
1961 1965          if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1962 1966                  error = EINTR;
1963 1967                  goto out;
1964 1968          }
1965 1969  
1966 1970          error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1967 1971  
1968 1972          nfs_rw_exit(&drp->r_rwlock);
1969 1973  
1970 1974          /*
1971 1975           * If vnode is a device, create special vnode.
1972 1976           */
1973 1977          if (!error && IS_DEVVP(*vpp)) {
1974 1978                  vp = *vpp;
1975 1979                  *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1976 1980                  VN_RELE(vp);
1977 1981          }
1978 1982  
1979 1983  out:
                   /* Drop the reference on the attribute directory, if one was taken. */
1980 1984          if (avp != NULL)
1981 1985                  VN_RELE(avp);
1982 1986  
1983 1987          return (error);
1984 1988  }
1985 1989  
           /*
            * When nonzero, an over-the-wire LOOKUP that fails with ENOENT is
            * entered into the DNLC as a negative entry (see nfs3lookup_otw()).
            */
1986 1990  static int nfs3_lookup_neg_cache = 1;
1987 1991  
           /*
            * DEBUG-only counters incremented by nfs3lookup_dnlc(): positive hits,
            * misses, negative-entry hits, entries that vanished between the first
            * DNLC lookup and the recheck after cache validation, and the total
            * number of lookups.
            */
1988 1992  #ifdef DEBUG
1989 1993  static int nfs3_lookup_dnlc_hits = 0;
1990 1994  static int nfs3_lookup_dnlc_misses = 0;
1991 1995  static int nfs3_lookup_dnlc_neg_hits = 0;
1992 1996  static int nfs3_lookup_dnlc_disappears = 0;
1993 1997  static int nfs3_lookup_dnlc_lookups = 0;
1994 1998  #endif
1995 1999  
           /*
            * Look up "nm" in directory "dvp" and store the result in *vpp.
            *
            * "" yields dvp itself with no checks; a non-directory dvp yields
            * ENOTDIR; "." yields dvp after a VEXEC access check.  Any other name
            * is tried in the DNLC and, if the DNLC has no answer, sent over the
            * wire.  When rfscall_flags contains RFSCALL_SOFT the "." and DNLC
            * steps are skipped and the request goes straight over the wire.
            *
            * Returns 0 on success or an errno value.
            */
1996 2000  /* ARGSUSED */
1997 2001  int
1998 2002  nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1999 2003          int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
2000 2004  {
2001 2005          int error;
2002 2006          rnode_t *drp;
2003 2007  
2004 2008          ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2005 2009          /*
2006 2010           * If lookup is for "", just return dvp.  Don't need
2007 2011           * to send it over the wire, look it up in the dnlc,
2008 2012           * or perform any access checks.
2009 2013           */
2010 2014          if (*nm == '\0') {
2011 2015                  VN_HOLD(dvp);
2012 2016                  *vpp = dvp;
2013 2017                  return (0);
2014 2018          }
2015 2019  
2016 2020          /*
2017 2021           * Can't do lookups in non-directories.
2018 2022           */
2019 2023          if (dvp->v_type != VDIR)
2020 2024                  return (ENOTDIR);
2021 2025  
2022 2026          /*
2023 2027           * If we're called with RFSCALL_SOFT, it's important that
2024 2028           * the only rfscall is one we make directly; if we permit
2025 2029           * an access call because we're looking up "." or validating
2026 2030           * a dnlc hit, we'll deadlock because that rfscall will not
2027 2031           * have the RFSCALL_SOFT set.
2028 2032           */
2029 2033          if (rfscall_flags & RFSCALL_SOFT)
2030 2034                  goto callit;
2031 2035  
2032 2036          /*
2033 2037           * If lookup is for ".", just return dvp.  Don't need
2034 2038           * to send it over the wire or look it up in the dnlc,
2035 2039           * just need to check access.
2036 2040           */
2037 2041          if (strcmp(nm, ".") == 0) {
2038 2042                  error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2039 2043                  if (error)
2040 2044                          return (error);
2041 2045                  VN_HOLD(dvp);
2042 2046                  *vpp = dvp;
2043 2047                  return (0);
2044 2048          }
2045 2049  
                   /*
                    * Set RLOOKUP on the directory's rnode if it is not set yet.
                    * The flag is tested without the lock and only modified under
                    * r_statelock.
                    */
2046 2050          drp = VTOR(dvp);
2047 2051          if (!(drp->r_flags & RLOOKUP)) {
2048 2052                  mutex_enter(&drp->r_statelock);
2049 2053                  drp->r_flags |= RLOOKUP;
2050 2054                  mutex_exit(&drp->r_statelock);
2051 2055          }
2052 2056  
2053 2057          /*
2054 2058           * Lookup this name in the DNLC.  If there was a valid entry,
2055 2059           * then return the results of the lookup.
2056 2060           */
2057 2061          error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2058 2062          if (error || *vpp != NULL)
2059 2063                  return (error);
2060 2064  
2061 2065  callit:
2062 2066          error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2063 2067  
2064 2068          return (error);
2065 2069  }
2066 2070  
           /*
            * Try to satisfy a lookup of "nm" in "dvp" from the DNLC.
            *
            * Returns 0 with *vpp set to the cached vnode on a hit, 0 with *vpp
            * set to NULL when the DNLC has no usable entry, ENOENT when the
            * entry found after validation is the negative marker DNLC_NO_VNODE,
            * or the error from nfs3_validate_caches() / nfs3_access().  *vpp is
            * not written on the error paths.
            */
2067 2071  static int
2068 2072  nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2069 2073  {
2070 2074          int error;
2071 2075          vnode_t *vp;
2072 2076  
2073 2077          ASSERT(*nm != '\0');
2074 2078          ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2075 2079          /*
2076 2080           * Lookup this name in the DNLC.  If successful, then validate
2077 2081           * the caches and then recheck the DNLC.  The DNLC is rechecked
2078 2082           * just in case this entry got invalidated during the call
2079 2083           * to nfs3_validate_caches.
2080 2084           *
2081 2085           * An assumption is being made that it is safe to say that a
2082 2086           * file exists which may not exist on the server.  Any operations to
2083 2087           * the server will fail with ESTALE.
2084 2088           */
2085 2089  #ifdef DEBUG
2086 2090          nfs3_lookup_dnlc_lookups++;
2087 2091  #endif
2088 2092          vp = dnlc_lookup(dvp, nm);
2089 2093          if (vp != NULL) {
                           /*
                            * Release the result of the first lookup right away; vp
                            * is only compared against DNLC_NO_VNODE below before it
                            * is reassigned by the second lookup.
                            */
2090 2094                  VN_RELE(vp);
                           /*
                            * Negative entry and dvp is not read-only: drop the
                            * cached directory attributes first, presumably so the
                            * validation below consults the server -- confirm.
                            */
2091 2095                  if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2092 2096                          PURGE_ATTRCACHE(dvp);
2093 2097                  }
2094 2098                  error = nfs3_validate_caches(dvp, cr);
2095 2099                  if (error)
2096 2100                          return (error);
2097 2101                  vp = dnlc_lookup(dvp, nm);
2098 2102                  if (vp != NULL) {
2099 2103                          error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2100 2104                          if (error) {
2101 2105                                  VN_RELE(vp);
2102 2106                                  return (error);
2103 2107                          }
2104 2108                          if (vp == DNLC_NO_VNODE) {
2105 2109                                  VN_RELE(vp);
2106 2110  #ifdef DEBUG
2107 2111                                  nfs3_lookup_dnlc_neg_hits++;
2108 2112  #endif
2109 2113                                  return (ENOENT);
2110 2114                          }
2111 2115                          *vpp = vp;
2112 2116  #ifdef DEBUG
2113 2117                          nfs3_lookup_dnlc_hits++;
2114 2118  #endif
2115 2119                          return (0);
2116 2120                  }
2117 2121  #ifdef DEBUG
2118 2122                  nfs3_lookup_dnlc_disappears++;
2119 2123  #endif
2120 2124          }
2121 2125  #ifdef DEBUG
2122 2126          else
2123 2127                  nfs3_lookup_dnlc_misses++;
2124 2128  #endif
2125 2129  
                   /* No usable DNLC entry: tell the caller to look elsewhere. */
2126 2130          *vpp = NULL;
2127 2131  
2128 2132          return (0);
2129 2133  }
2130 2134  
           /*
            * Perform an NFSPROC3_LOOKUP of "nm" in directory "dvp" over the wire.
            *
            * A transport-level error from rfs3call() is returned as is.
            * Otherwise the returned directory attributes are cached and the
            * server status is converted with geterrno3(); on ENOENT a negative
            * DNLC entry is added when nfs3_lookup_neg_cache is nonzero.  On
            * success the vnode is obtained from makenfs3node_va(); if no object
            * attributes came back and the vnode's type is still VNON, the type
            * is fetched with nfs3getattr().  The DNLC is updated unless
            * rfscall_flags contains RFSCALL_SOFT, and the vnode is stored in
            * *vpp.
            */
2131 2135  static int
2132 2136  nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2133 2137          int rfscall_flags)
2134 2138  {
2135 2139          int error;
2136 2140          LOOKUP3args args;
2137 2141          LOOKUP3vres res;
2138 2142          int douprintf;
2139 2143          struct vattr vattr;
2140 2144          struct vattr dvattr;
2141 2145          vnode_t *vp;
2142 2146          failinfo_t fi;
2143 2147          hrtime_t t;
2144 2148  
2145 2149          ASSERT(*nm != '\0');
2146 2150          ASSERT(dvp->v_type == VDIR);
2147 2151          ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2148 2152  
2149 2153          setdiropargs3(&args.what, nm, dvp);
2150 2154  
                   /*
                    * Fill in the failinfo_t handed to rfs3call() and point the
                    * result structure at local storage for the object and
                    * directory attributes.
                    */
2151 2155          fi.vp = dvp;
2152 2156          fi.fhp = (caddr_t)&args.what.dir;
2153 2157          fi.copyproc = nfs3copyfh;
2154 2158          fi.lookupproc = nfs3lookup;
2155 2159          fi.xattrdirproc = acl_getxattrdir3;
2156 2160          res.obj_attributes.fres.vp = dvp;
2157 2161          res.obj_attributes.fres.vap = &vattr;
2158 2162          res.dir_attributes.fres.vp = dvp;
2159 2163          res.dir_attributes.fres.vap = &dvattr;
2160 2164  
2161 2165          douprintf = 1;
2162 2166  
                   /*
                    * Timestamp taken before the call; it is passed to the
                    * attribute cache and to makenfs3node_va() below.
                    */
2163 2167          t = gethrtime();
2164 2168  
2165 2169          error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2166 2170              xdr_diropargs3, (caddr_t)&args,
2167 2171              xdr_LOOKUP3vres, (caddr_t)&res, cr,
2168 2172              &douprintf, &res.status, rfscall_flags, &fi);
2169 2173  
2170 2174          if (error)
2171 2175                  return (error);
2172 2176  
2173 2177          nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2174 2178  
2175 2179          error = geterrno3(res.status);
2176 2180          if (error) {
2177 2181                  PURGE_STALE_FH(error, dvp, cr);
2178 2182                  if (error == ENOENT && nfs3_lookup_neg_cache)
2179 2183                          dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2180 2184                  return (error);
2181 2185          }
2182 2186  
2183 2187          if (res.obj_attributes.attributes) {
2184 2188                  vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2185 2189                      dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2186 2190          } else {
2187 2191                  vp = makenfs3node_va(&res.object, NULL,
2188 2192                      dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
                           /*
                            * No attributes were returned; if the vnode type is
                            * still unknown, ask the server for it.
                            */
2189 2193                  if (vp->v_type == VNON) {
2190 2194                          vattr.va_mask = AT_TYPE;
2191 2195                          error = nfs3getattr(vp, &vattr, cr);
2192 2196                          if (error) {
2193 2197                                  VN_RELE(vp);
2194 2198                                  return (error);
2195 2199                          }
2196 2200                          vp->v_type = vattr.va_type;
2197 2201                  }
2198 2202          }
2199 2203  
2200 2204          if (!(rfscall_flags & RFSCALL_SOFT))
2201 2205                  dnlc_update(dvp, nm, vp);
2202 2206  
2203 2207          *vpp = vp;
2204 2208  
2205 2209          return (error);
2206 2210  }
2207 2211  
           /*
            * DEBUG-only counter: number of times nfs3_create() restarted because
            * a non-exclusive nfs3create() call returned EEXIST.
            */
2208 2212  #ifdef DEBUG
2209 2213  static int nfs3_create_misses = 0;
2210 2214  #endif
2211 2215  
           /*
            * Create entry point: create (or open/truncate) "nm" in directory
            * "dvp" and return the vnode in *vpp.
            *
            * Returns EPERM when the caller's zone is not the zone of the mount
            * and EINTR when acquiring dvp's r_rwlock as writer is interrupted.
            * The lock is dropped on every return path.
            *
            * The name is first resolved ("" and "." map to dvp, anything else
            * goes over the wire).  If that succeeds the object already exists:
            * EEXIST for an exclusive create, EISDIR for a directory with VWRITE
            * requested, otherwise access is checked and, when AT_SIZE is set on
            * a regular file, the size is set via nfs3setattr() (EOVERFLOW if the
            * file exceeds MAXOFF32_T and the caller did not pass FOFFMAX).
            *
            * If the lookup returns any error, the DNLC entry is removed, the
            * group id is chosen with setdirgid(), and the object is created with
            * nfs3create() for VREG (EACCES if the mode requests mandatory
            * locking) or nfs3mknod() for other types.  EEXIST from a
            * non-exclusive nfs3create() restarts the function at "top".
            *
            * NOTE(review): on the new (+) side of this diff, vnevent_create() is
            * called right after nfs3setattr() without looking at its return
            * value, and no longer fires for an existing file that was opened
            * without truncation -- confirm both are intended.
            */
2212 2216  /* ARGSUSED */
2213 2217  static int
2214 2218  nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2215 2219          int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
2216 2220          vsecattr_t *vsecp)
2217 2221  {
2218 2222          int error;
2219 2223          vnode_t *vp;
2220 2224          rnode_t *rp;
2221 2225          struct vattr vattr;
2222 2226          rnode_t *drp;
2223 2227          vnode_t *tempvp;
2224 2228  
2225 2229          drp = VTOR(dvp);
2226 2230          if (nfs_zone() != VTOMI(dvp)->mi_zone)
2227 2231                  return (EPERM);
2228 2232          if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2229 2233                  return (EINTR);
2230 2234  
2231 2235  top:
2232 2236          /*
2233 2237           * We make a copy of the attributes because the caller does not
2234 2238           * expect us to change what va points to.
2235 2239           */
2236 2240          vattr = *va;
2237 2241  
2238 2242          /*
2239 2243           * If the pathname is "", just use dvp.  Don't need
2240 2244           * to send it over the wire, look it up in the dnlc,
2241 2245           * or perform any access checks.
2242 2246           */
2243 2247          if (*nm == '\0') {
2244 2248                  error = 0;
2245 2249                  VN_HOLD(dvp);
2246 2250                  vp = dvp;
2247 2251          /*
2248 2252           * If the pathname is ".", just use dvp.  Don't need
2249 2253           * to send it over the wire or look it up in the dnlc,
2250 2254           * just need to check access.
2251 2255           */
2252 2256          } else if (strcmp(nm, ".") == 0) {
2253 2257                  error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2254 2258                  if (error) {
2255 2259                          nfs_rw_exit(&drp->r_rwlock);
2256 2260                          return (error);
2257 2261                  }
2258 2262                  VN_HOLD(dvp);
2259 2263                  vp = dvp;
2260 2264          /*
2261 2265           * We need to go over the wire, just to be sure whether the
2262 2266           * file exists or not.  Using the DNLC can be dangerous in
2263 2267           * this case when making a decision regarding existence.
2264 2268           */
2265 2269          } else {
2266 2270                  error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2267 2271          }
                   /* The object already exists: handle open/truncate here. */
2268 2272          if (!error) {
2269 2273                  if (exclusive == EXCL)
2270 2274                          error = EEXIST;
2271 2275                  else if (vp->v_type == VDIR && (mode & VWRITE))
2272 2276                          error = EISDIR;
2273 2277                  else {
2274 2278                          /*
2275 2279                           * If vnode is a device, create special vnode.
2276 2280                           */
2277 2281                          if (IS_DEVVP(vp)) {
2278 2282                                  tempvp = vp;
2279 2283                                  vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2280 2284                                  VN_RELE(tempvp);
2281 2285                          }
2282 2286                          if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2283 2287                                  if ((vattr.va_mask & AT_SIZE) &&
2284 2288                                      vp->v_type == VREG) {
2285 2289                                          rp = VTOR(vp);
2286 2290                                          /*
2287 2291                                           * Check here for large file handled
2288 2292                                           * by LF-unaware process (as
2289 2293                                           * ufs_create() does)
2290 2294                                           */

↓ open down ↓

2250 lines elided

↑ open up ↑

2291 2295                                          if (!(lfaware & FOFFMAX)) {
2292 2296                                                  mutex_enter(&rp->r_statelock);
2293 2297                                                  if (rp->r_size > MAXOFF32_T)
2294 2298                                                          error = EOVERFLOW;
2295 2299                                                  mutex_exit(&rp->r_statelock);
2296 2300                                          }
2297 2301                                          if (!error) {
2298 2302                                                  vattr.va_mask = AT_SIZE;
2299 2303                                                  error = nfs3setattr(vp,
2300 2304                                                      &vattr, 0, cr);
     2305 +
     2306 +                                                /*
     2307 +                                                 * Existing file was truncated;
     2308 +                                                 * emit a create event.
     2309 +                                                 */
     2310 +                                                vnevent_create(vp, ct);
2301 2311                                          }
2302 2312                                  }
2303 2313                          }
2304 2314                  }
2305 2315                  nfs_rw_exit(&drp->r_rwlock);
2306 2316                  if (error) {
2307 2317                          VN_RELE(vp);
2308 2318                  } else {
2309      -                        /*
2310      -                         * existing file got truncated, notify.
2311      -                         */
2312      -                        vnevent_create(vp, ct);
2313 2319                          *vpp = vp;
2314 2320                  }
     2321 +
2315 2322                  return (error);
2316 2323          }
2317 2324  
                   /*
                    * The lookup failed, so fall through to creating the object;
                    * any DNLC entry for the name is discarded first.
                    */
2318 2325          dnlc_remove(dvp, nm);
2319 2326  
2320 2327          /*
2321 2328           * Decide what the group-id of the created file should be.
2322 2329           * Set it in attribute list as advisory...
2323 2330           */
2324 2331          error = setdirgid(dvp, &vattr.va_gid, cr);

2325 2332          if (error) {
2326 2333                  nfs_rw_exit(&drp->r_rwlock);
2327 2334                  return (error);
2328 2335          }
2329 2336          vattr.va_mask |= AT_GID;
2330 2337  
2331 2338          ASSERT(vattr.va_mask & AT_TYPE);
2332 2339          if (vattr.va_type == VREG) {
2333 2340                  ASSERT(vattr.va_mask & AT_MODE);
2334 2341                  if (MANDMODE(vattr.va_mode)) {
2335 2342                          nfs_rw_exit(&drp->r_rwlock);
2336 2343                          return (EACCES);
2337 2344                  }
2338 2345                  error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr,
2339 2346                      lfaware);
2340 2347                  /*
2341 2348                   * If this is not an exclusive create, then the CREATE
2342 2349                   * request will be made with the GUARDED mode set.  This
2343 2350                   * means that the server will return EEXIST if the file
2344 2351                   * exists.  The file could exist because of a retransmitted
2345 2352                   * request.  In this case, we recover by starting over and
2346 2353                   * checking to see whether the file exists.  This second
2347 2354                   * time through it should and a CREATE request will not be
2348 2355                   * sent.
2349 2356                   *
2350 2357                   * This handles the problem of a dangling CREATE request
2351 2358                   * which contains attributes which indicate that the file
2352 2359                   * should be truncated.  This retransmitted request could
2353 2360                   * possibly truncate valid data in the file if not caught
2354 2361                   * by the duplicate request mechanism on the server or if
2355 2362                   * not caught by other means.  The scenario is:
2356 2363                   *
2357 2364                   * Client transmits CREATE request with size = 0
2358 2365                   * Client times out, retransmits request.
2359 2366                   * Response to the first request arrives from the server
2360 2367                   *  and the client proceeds on.
2361 2368                   * Client writes data to the file.
2362 2369                   * The server now processes retransmitted CREATE request
2363 2370                   *  and truncates file.
2364 2371                   *
2365 2372                   * The use of the GUARDED CREATE request prevents this from
2366 2373                   * happening because the retransmitted CREATE would fail
2367 2374                   * with EEXIST and would not truncate the file.
2368 2375                   */
2369 2376                  if (error == EEXIST && exclusive == NONEXCL) {
2370 2377  #ifdef DEBUG
2371 2378                          nfs3_create_misses++;
2372 2379  #endif
2373 2380                          goto top;
2374 2381                  }
2375 2382                  nfs_rw_exit(&drp->r_rwlock);
2376 2383                  return (error);
2377 2384          }
                   /* Anything that is not a regular file is created via mknod. */
2378 2385          error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2379 2386          nfs_rw_exit(&drp->r_rwlock);
2380 2387          return (error);
2381 2388  }
2382 2389  
2383 2390  /* ARGSUSED */
2384 2391  static int
2385 2392  nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2386 2393          int mode, vnode_t **vpp, cred_t *cr, int lfaware)
2387 2394  {
2388 2395          int error;
2389 2396          CREATE3args args;
2390 2397          CREATE3res res;
2391 2398          int douprintf;
2392 2399          vnode_t *vp;
2393 2400          struct vattr vattr;
2394 2401          nfstime3 *verfp;
2395 2402          rnode_t *rp;
2396 2403          timestruc_t now;
2397 2404          hrtime_t t;
2398 2405  
2399 2406          ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2400 2407          setdiropargs3(&args.where, nm, dvp);
2401 2408          if (exclusive == EXCL) {
2402 2409                  args.how.mode = EXCLUSIVE;
2403 2410                  /*
2404 2411                   * Construct the create verifier.  This verifier needs
2405 2412                   * to be unique between different clients.  It also needs
2406 2413                   * to vary for each exclusive create request generated
2407 2414                   * from the client to the server.
2408 2415                   *
2409 2416                   * The first attempt is made to use the hostid and a
2410 2417                   * unique number on the client.  If the hostid has not
2411 2418                   * been set, the high resolution time that the exclusive
2412 2419                   * create request is being made is used.  This will work
2413 2420                   * unless two different clients, both with the hostid
2414 2421                   * not set, attempt an exclusive create request on the
2415 2422                   * same file, at exactly the same clock time.  The
2416 2423                   * chances of this happening seem small enough to be
2417 2424                   * reasonable.
2418 2425                   */
2419 2426                  verfp = (nfstime3 *)&args.how.createhow3_u.verf;
2420 2427                  verfp->seconds = zone_get_hostid(NULL);
2421 2428                  if (verfp->seconds != 0)
2422 2429                          verfp->nseconds = newnum();
2423 2430                  else {
2424 2431                          gethrestime(&now);
2425 2432                          verfp->seconds = now.tv_sec;
2426 2433                          verfp->nseconds = now.tv_nsec;
2427 2434                  }
2428 2435                  /*
2429 2436                   * Since the server will use this value for the mtime,
2430 2437                   * make sure that it can't overflow. Zero out the MSB.
2431 2438                   * The actual value does not matter here, only its uniqueness.
2432 2439                   */
2433 2440                  verfp->seconds %= INT32_MAX;
2434 2441          } else {
2435 2442                  /*
2436 2443                   * Issue the non-exclusive create in guarded mode.  This
2437 2444                   * may result in some false EEXIST responses for
2438 2445                   * retransmitted requests, but these will be handled at
2439 2446                   * a higher level.  By using GUARDED, duplicate requests
2440 2447                   * to do file truncation and possible access problems
2441 2448                   * can be avoided.
2442 2449                   */
2443 2450                  args.how.mode = GUARDED;
2444 2451                  error = vattr_to_sattr3(va,
2445 2452                      &args.how.createhow3_u.obj_attributes);
2446 2453                  if (error) {
2447 2454                          /* req time field(s) overflow - return immediately */
2448 2455                          return (error);
2449 2456                  }
2450 2457          }
2451 2458  
2452 2459          douprintf = 1;
2453 2460  
2454 2461          t = gethrtime();
2455 2462  
2456 2463          error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
2457 2464              xdr_CREATE3args, (caddr_t)&args,
2458 2465              xdr_CREATE3res, (caddr_t)&res, cr,
2459 2466              &douprintf, &res.status, 0, NULL);
2460 2467  
2461 2468          if (error) {
2462 2469                  PURGE_ATTRCACHE(dvp);
2463 2470                  return (error);
2464 2471          }
2465 2472  
2466 2473          error = geterrno3(res.status);
2467 2474          if (!error) {
2468 2475                  nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2469 2476                  if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2470 2477                          nfs_purge_rddir_cache(dvp);
2471 2478  
2472 2479                  /*
2473 2480                   * On exclusive create the times need to be explicitly
2474 2481                   * set to clear any potential verifier that may be stored
2475 2482                   * in one of these fields (see comment below).  This
2476 2483                   * is done here to cover the case where no post op attrs
2477 2484                   * were returned or an 'invalid' time was returned in
2478 2485                   * the attributes.
2479 2486                   */
2480 2487                  if (exclusive == EXCL)
2481 2488                          va->va_mask |= (AT_MTIME | AT_ATIME);
2482 2489  
2483 2490                  if (!res.resok.obj.handle_follows) {
2484 2491                          error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2485 2492                          if (error)
2486 2493                                  return (error);
2487 2494                  } else {
2488 2495                          if (res.resok.obj_attributes.attributes) {
2489 2496                                  vp = makenfs3node(&res.resok.obj.handle,
2490 2497                                      &res.resok.obj_attributes.attr,
2491 2498                                      dvp->v_vfsp, t, cr, NULL, NULL);
2492 2499                          } else {
2493 2500                                  vp = makenfs3node(&res.resok.obj.handle, NULL,
2494 2501                                      dvp->v_vfsp, t, cr, NULL, NULL);
2495 2502  
2496 2503                                  /*
2497 2504                                   * On an exclusive create, it is possible
2498 2505                                   * that attributes were returned but those
2499 2506                                   * postop attributes failed to decode
2500 2507                                   * properly.  If this is the case,
2501 2508                                   * then most likely the atime or mtime
2502 2509                                   * were invalid for our client; this
2503 2510                                   * is caused by the server storing the
2504 2511                                   * create verifier in one of the time
2505 2512                                   * fields (most likely mtime).
2506 2513                                   * So... we are going to setattr just the
2507 2514                                   * atime/mtime to clear things up.
2508 2515                                   */
2509 2516                                  if (exclusive == EXCL) {
2510 2517                                          if (error =
2511 2518                                              nfs3excl_create_settimes(vp,
2512 2519                                              va, cr)) {
2513 2520                                                  /*
2514 2521                                                   * Setting the times failed.
2515 2522                                                   * Remove the file and return
2516 2523                                                   * the error.
2517 2524                                                   */
2518 2525                                                  VN_RELE(vp);
2519 2526                                                  (void) nfs3_remove(dvp,
2520 2527                                                      nm, cr, NULL, 0);
2521 2528                                                  return (error);
2522 2529                                          }
2523 2530                                  }
2524 2531  
2525 2532                                  /*
2526 2533                                   * This handles the non-exclusive case
2527 2534                                   * and the exclusive case where no post op
2528 2535                                   * attrs were returned.
2529 2536                                   */
2530 2537                                  if (vp->v_type == VNON) {
2531 2538                                          vattr.va_mask = AT_TYPE;
2532 2539                                          error = nfs3getattr(vp, &vattr, cr);
2533 2540                                          if (error) {
2534 2541                                                  VN_RELE(vp);
2535 2542                                                  return (error);
2536 2543                                          }
2537 2544                                          vp->v_type = vattr.va_type;
2538 2545                                  }
2539 2546                          }
2540 2547                          dnlc_update(dvp, nm, vp);
2541 2548                  }
2542 2549  
2543 2550                  rp = VTOR(vp);
2544 2551  
2545 2552                  /*
2546 2553                   * Check here for large file handled by
2547 2554                   * LF-unaware process (as ufs_create() does)
2548 2555                   */
2549 2556                  if ((va->va_mask & AT_SIZE) && vp->v_type == VREG &&
2550 2557                      !(lfaware & FOFFMAX)) {
2551 2558                          mutex_enter(&rp->r_statelock);
2552 2559                          if (rp->r_size > MAXOFF32_T) {
2553 2560                                  mutex_exit(&rp->r_statelock);
2554 2561                                  VN_RELE(vp);
2555 2562                                  return (EOVERFLOW);
2556 2563                          }
2557 2564                          mutex_exit(&rp->r_statelock);
2558 2565                  }
2559 2566  
2560 2567                  if (exclusive == EXCL &&
2561 2568                      (va->va_mask & ~(AT_GID | AT_SIZE))) {
2562 2569                          /*
2563 2570                           * If doing an exclusive create, then generate
2564 2571                           * a SETATTR to set the initial attributes.
2565 2572                           * Try to set the mtime and the atime to the
2566 2573                           * server's current time.  It is somewhat
2567 2574                           * expected that these fields will be used to
2568 2575                           * store the exclusive create cookie.  If not,
2569 2576                           * server implementors will need to know that
2570 2577                           * a SETATTR will follow an exclusive create
2571 2578                           * and the cookie should be destroyed if
2572 2579                           * appropriate. This work may have been done
2573 2580                           * earlier in this function if post op attrs
2574 2581                           * were not available.
2575 2582                           *
2576 2583                           * The AT_GID and AT_SIZE bits are turned off
2577 2584                           * so that the SETATTR request will not attempt
2578 2585                           * to process these.  The gid will be set
2579 2586                           * separately if appropriate.  The size is turned
2580 2587                           * off because it is assumed that a new file will
2581 2588                           * be created empty and if the file wasn't empty,
2582 2589                           * then the exclusive create will have failed
2583 2590                           * because the file must have existed already.
2584 2591                           * Therefore, no truncate operation is needed.
2585 2592                           */
2586 2593                          va->va_mask &= ~(AT_GID | AT_SIZE);
2587 2594                          error = nfs3setattr(vp, va, 0, cr);
2588 2595                          if (error) {
2589 2596                                  /*
2590 2597                                   * Couldn't correct the attributes of
2591 2598                                   * the newly created file and the
2592 2599                                   * attributes are wrong.  Remove the
2593 2600                                   * file and return an error to the
2594 2601                                   * application.
2595 2602                                   */
2596 2603                                  VN_RELE(vp);
2597 2604                                  (void) nfs3_remove(dvp, nm, cr, NULL, 0);
2598 2605                                  return (error);
2599 2606                          }
2600 2607                  }
2601 2608  
2602 2609                  if (va->va_gid != rp->r_attr.va_gid) {
2603 2610                          /*
2604 2611                           * If the gid on the file isn't right, then
2605 2612                           * generate a SETATTR to attempt to change
2606 2613                           * it.  This may or may not work, depending
2607 2614                           * upon the server's semantics for allowing
2608 2615                           * file ownership changes.
2609 2616                           */
2610 2617                          va->va_mask = AT_GID;
2611 2618                          (void) nfs3setattr(vp, va, 0, cr);
2612 2619                  }
2613 2620  
2614 2621                  /*
2615 2622                   * If vnode is a device create special vnode
2616 2623                   */
2617 2624                  if (IS_DEVVP(vp)) {
2618 2625                          *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2619 2626                          VN_RELE(vp);
2620 2627                  } else
2621 2628                          *vpp = vp;
2622 2629          } else {
2623 2630                  nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2624 2631                  PURGE_STALE_FH(error, dvp, cr);
2625 2632          }
2626 2633  
2627 2634          return (error);
2628 2635  }
2629 2636  
2630 2637  /*
2631 2638   * Special setattr function to take care of the rest of the atime/mtime
2632 2639   * work after a successful exclusive create.  This function exists to
2633 2640   * avoid handling attributes from the server: after an exclusive create
2634 2641   * the atime/mtime fields may be 'invalid' in the client's view and
            * therefore cannot be trusted.
            *
            * Sends one unguarded SETATTR for vp that asks the server to set both
            * atime and mtime to its own current time.  vap->va_mask is zeroed
            * before the arguments are built (presumably so that vattr_to_sattr3()
            * requests no other attribute change -- confirm against that helper).
            * The caller's va_mask is restored before every return, and on
            * success AT_ATIME and AT_MTIME are cleared from it so that the caller
            * does not repeat this work.
            *
            * Returns 0 on success, the error from rfs3call(), or the errno that
            * geterrno3() derives from the SETATTR status.
2635 2642   */
2636 2643  static int
2637 2644  nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2638 2645  {
2639 2646          int error;
2640 2647          uint_t mask;
2641 2648          SETATTR3args args;
2642 2649          SETATTR3res res;
2643 2650          int douprintf;
2644 2651          rnode_t *rp;
2645 2652          hrtime_t t;
2646 2653  
2647 2654          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2648 2655          /* save the caller's mask so that it can be reset later */
2649 2656          mask = vap->va_mask;
2650 2657  
2651 2658          rp = VTOR(vp);
2652 2659  
2653 2660          args.object = *RTOFH3(rp);
                   /* no guard check is requested for this SETATTR */
2654 2661          args.guard.check = FALSE;
2655 2662  
2656 2663          /*
                    * Use an empty mask to initialize the arguments.
                    *
                    * NOTE(review): the value stored in error here is overwritten by
                    * rfs3call() below without being examined; presumably
                    * vattr_to_sattr3() cannot fail with an empty mask -- confirm.
                    */
2657 2664          vap->va_mask = 0;
2658 2665          error = vattr_to_sattr3(vap, &args.new_attributes);
2659 2666  
2660 2667          /* We want to set just atime/mtime on this request */
2661 2668          args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2662 2669          args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2663 2670  
2664 2671          douprintf = 1;
2665 2672  
                   /* timestamp taken before the call; handed to the wcc caching below */
2666 2673          t = gethrtime();
2667 2674  
2668 2675          error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2669 2676              xdr_SETATTR3args, (caddr_t)&args,
2670 2677              xdr_SETATTR3res, (caddr_t)&res, cr,
2671 2678              &douprintf, &res.status, 0, NULL);
2672 2679  
2673 2680          if (error) {
                           /* the call itself failed: hand back the caller's mask untouched */
2674 2681                  vap->va_mask = mask;
2675 2682                  return (error);
2676 2683          }
2677 2684  
                   /* translate the SETATTR status from the reply into an errno */
2678 2685          error = geterrno3(res.status);
2679 2686          if (!error) {
2680 2687                  /*
2681 2688                   * It is important to pick up the attributes.
2682 2689                   * Since this is the exclusive create path, the
2683 2690                   * attributes on the initial create were ignored
2684 2691                   * and we need these to have the correct info.
2685 2692                   */
2686 2693                  nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2687 2694                  /*
2688 2695                   * No need to do the atime/mtime work again so clear
2689 2696                   * the bits.
2690 2697                   */
2691 2698                  mask &= ~(AT_ATIME | AT_MTIME);
2692 2699          } else {
                           /* the failure reply carries wcc data as well; cache it */
2693 2700                  nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2694 2701          }
2695 2702  
                   /* restore the caller's mask (minus the time bits on success) */
2696 2703          vap->va_mask = mask;
2697 2704  
2698 2705          return (error);
2699 2706  }
2700 2707  
           /*
            * Create a special file named nm in directory dvp with an NFS Version 3
            * MKNOD request.
            *
            * va->va_type selects what is created: VCHR/VBLK (the major and minor
            * numbers of va->va_rdev are sent as specdata1/specdata2), or
            * VFIFO/VSOCK.  Any other type returns EINVAL.  An error from
            * vattr_to_sattr3() is returned before any request is sent.
            *
            * On success the directory's wcc data is cached, its readdir cache is
            * purged if present, and a vnode for the new object is obtained: by
            * nfs3lookup() when the reply carries no file handle, otherwise by
            * makenfs3node() followed by a dnlc_update().  If the object's cached
            * gid differs from va->va_gid, a SETATTR of the gid is attempted and
            * its result ignored; note that this overwrites va->va_mask with
            * AT_GID.  *vpp receives the new vnode, or the vnode returned by
            * specvp() when IS_DEVVP() is true for it.
            *
            * The exclusive and mode arguments are not used.
            *
            * Returns 0 on success, otherwise an errno from argument conversion,
            * rfs3call(), the MKNOD status, nfs3lookup() or nfs3getattr().
            */
2701 2708  /* ARGSUSED */
2702 2709  static int
2703 2710  nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2704 2711          int mode, vnode_t **vpp, cred_t *cr)
2705 2712  {
2706 2713          int error;
2707 2714          MKNOD3args args;
2708 2715          MKNOD3res res;
2709 2716          int douprintf;
2710 2717          vnode_t *vp;
2711 2718          struct vattr vattr;
2712 2719          hrtime_t t;
2713 2720  
2714 2721          ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
                   /* build the type-specific part of the MKNOD arguments */
2715 2722          switch (va->va_type) {
2716 2723          case VCHR:
2717 2724          case VBLK:
2718 2725                  setdiropargs3(&args.where, nm, dvp);
2719 2726                  args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2720 2727                  error = vattr_to_sattr3(va,
2721 2728                      &args.what.mknoddata3_u.device.dev_attributes);
2722 2729                  if (error) {
2723 2730                          /* req time field(s) overflow - return immediately */
2724 2731                          return (error);
2725 2732                  }
                           /* device number is sent split into major and minor */
2726 2733                  args.what.mknoddata3_u.device.spec.specdata1 =
2727 2734                      getmajor(va->va_rdev);
2728 2735                  args.what.mknoddata3_u.device.spec.specdata2 =
2729 2736                      getminor(va->va_rdev);
2730 2737                  break;
2731 2738  
2732 2739          case VFIFO:
2733 2740          case VSOCK:
2734 2741                  setdiropargs3(&args.where, nm, dvp);
2735 2742                  args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2736 2743                  error = vattr_to_sattr3(va,
2737 2744                      &args.what.mknoddata3_u.pipe_attributes);
2738 2745                  if (error) {
2739 2746                          /* req time field(s) overflow - return immediately */
2740 2747                          return (error);
2741 2748                  }
2742 2749                  break;
2743 2750  
2744 2751          default:
                           /* only the four types above are handled here */
2745 2752                  return (EINVAL);
2746 2753          }
2747 2754  
2748 2755          douprintf = 1;
2749 2756  
                   /* timestamp taken before the call; used when caching the reply */
2750 2757          t = gethrtime();
2751 2758  
2752 2759          error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2753 2760              xdr_MKNOD3args, (caddr_t)&args,
2754 2761              xdr_MKNOD3res, (caddr_t)&res, cr,
2755 2762              &douprintf, &res.status, 0, NULL);
2756 2763  
2757 2764          if (error) {
                           /* no reply to cache from, so drop dvp's cached attributes */
2758 2765                  PURGE_ATTRCACHE(dvp);
2759 2766                  return (error);
2760 2767          }
2761 2768  
2762 2769          error = geterrno3(res.status);
2763 2770          if (!error) {
2764 2771                  nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2765 2772                  if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2766 2773                          nfs_purge_rddir_cache(dvp);
2767 2774  
2768 2775                  if (!res.resok.obj.handle_follows) {
                                   /* reply has no file handle: look the new name up */
2769 2776                          error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2770 2777                          if (error)
2771 2778                                  return (error);
2772 2779                  } else {
2773 2780                          if (res.resok.obj_attributes.attributes) {
2774 2781                                  vp = makenfs3node(&res.resok.obj.handle,
2775 2782                                      &res.resok.obj_attributes.attr,
2776 2783                                      dvp->v_vfsp, t, cr, NULL, NULL);
2777 2784                          } else {
                                           /*
                                            * No attributes in the reply.  If the vnode
                                            * comes back without a type, fetch the type
                                            * with an explicit getattr.
                                            */
2778 2785                                  vp = makenfs3node(&res.resok.obj.handle, NULL,
2779 2786                                      dvp->v_vfsp, t, cr, NULL, NULL);
2780 2787                                  if (vp->v_type == VNON) {
2781 2788                                          vattr.va_mask = AT_TYPE;
2782 2789                                          error = nfs3getattr(vp, &vattr, cr);
2783 2790                                          if (error) {
2784 2791                                                  VN_RELE(vp);
2785 2792                                                  return (error);
2786 2793                                          }
2787 2794                                          vp->v_type = vattr.va_type;
2788 2795                                  }
2789 2796  
2790 2797                          }
2791 2798                          dnlc_update(dvp, nm, vp);
2792 2799                  }
2793 2800  
                           /*
                            * If the gid on the new object isn't the requested one,
                            * attempt to change it.  The result is deliberately
                            * ignored, and va->va_mask is left set to AT_GID.
                            */
2794 2801                  if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2795 2802                          va->va_mask = AT_GID;
2796 2803                          (void) nfs3setattr(vp, va, 0, cr);
2797 2804                  }
2798 2805  
2799 2806                  /*
2800 2807                   * If vnode is a device create special vnode
2801 2808                   */
2802 2809                  if (IS_DEVVP(vp)) {
2803 2810                          *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2804 2811                          VN_RELE(vp);
2805 2812                  } else
2806 2813                          *vpp = vp;
2807 2814          } else {
                           /* MKNOD was refused: still cache the directory's wcc data */
2808 2815                  nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2809 2816                  PURGE_STALE_FH(error, dvp, cr);
2810 2817          }
2811 2818          return (error);
2812 2819  }
2813 2820  
2814 2821  /*
2815 2822   * Weirdness: if the vnode to be removed is open
2816 2823   * we rename it instead of removing it and nfs_inactive
2817 2824   * will remove the new name.
            *
            * Remove the entry nm from directory dvp.  The directory's r_rwlock is
            * held as a writer from before the lookup until just before returning.
            *
            * After the name cache has been purged, if the target vnode still has
            * more than one hold and either has no unlinked-name record
            * (r_unldvp == NULL) or is being removed under its recorded r_unlname,
            * it is renamed to a name obtained from newname() and that name is
            * recorded in the rnode (together with a held dvp and cr when no
            * record existed yet).  Otherwise cached dirty pages, if any, are
            * flushed and a REMOVE request is sent to the server.
            *
            * vnevent_remove() is called when the operation succeeded.
            *
            * Returns 0 on success; EPERM when called from a zone other than the
            * mount's, or when the target is a directory and
            * secpolicy_fs_linkdir() returns nonzero; EINTR when
            * nfs_rw_enter_sig() on the directory lock returns nonzero; otherwise
            * the error from the lookup, the rename or the REMOVE.
2818 2825   */
2819 2826  /* ARGSUSED */
2820 2827  static int
2821 2828  nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2822 2829  {
2823 2830          int error;
2824 2831          REMOVE3args args;
2825 2832          REMOVE3res res;
2826 2833          vnode_t *vp;
2827 2834          char *tmpname;
2828 2835          int douprintf;
2829 2836          rnode_t *rp;
2830 2837          rnode_t *drp;
2831 2838          hrtime_t t;
2832 2839  
2833 2840          if (nfs_zone() != VTOMI(dvp)->mi_zone)
2834 2841                  return (EPERM);
2835 2842          drp = VTOR(dvp);
2836 2843          if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2837 2844                  return (EINTR);
2838 2845  
                   /* get a held vnode for the name that is about to be removed */
2839 2846          error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2840 2847          if (error) {
2841 2848                  nfs_rw_exit(&drp->r_rwlock);
2842 2849                  return (error);
2843 2850          }
2844 2851  
                   /* removing a directory here is subject to a policy check */
2845 2852          if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2846 2853                  VN_RELE(vp);
2847 2854                  nfs_rw_exit(&drp->r_rwlock);
2848 2855                  return (EPERM);
2849 2856          }
2850 2857  
2851 2858          /*
2852 2859           * First just remove the entry from the name cache, as it
2853 2860           * is most likely the only entry for this vp.
2854 2861           */
2855 2862          dnlc_remove(dvp, nm);
2856 2863  
2857 2864          /*
2858 2865           * If the file has a v_count > 1 then there may be more than one
2859 2866           * entry in the name cache due to multiple links or an open file,
2860 2867           * but we don't have the real reference count so flush all
2861 2868           * possible entries.
2862 2869           */
2863 2870          if (vp->v_count > 1)
2864 2871                  dnlc_purge_vp(vp);
2865 2872  
2866 2873          /*
2867 2874           * Now we have the real reference count on the vnode
2868 2875           */
2869 2876          rp = VTOR(vp);
2870 2877          mutex_enter(&rp->r_statelock);
2871 2878          if (vp->v_count > 1 &&
2872 2879              (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
                           /*
                            * Still held by someone else: rename instead of remove.
                            * r_statelock is dropped across the rename and retaken
                            * to record the new name.
                            */
2873 2880                  mutex_exit(&rp->r_statelock);
2874 2881                  tmpname = newname();
2875 2882                  error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2876 2883                  if (error)
2877 2884                          kmem_free(tmpname, MAXNAMELEN);
2878 2885                  else {
2879 2886                          mutex_enter(&rp->r_statelock);
2880 2887                          if (rp->r_unldvp == NULL) {
                                           /* first record: hold dvp and cr for later */
2881 2888                                  VN_HOLD(dvp);
2882 2889                                  rp->r_unldvp = dvp;
2883 2890                                  if (rp->r_unlcred != NULL)
2884 2891                                          crfree(rp->r_unlcred);
2885 2892                                  crhold(cr);
2886 2893                                  rp->r_unlcred = cr;
2887 2894                                  rp->r_unlname = tmpname;
2888 2895                          } else {
                                           /* record exists: only swap in the new name */
2889 2896                                  kmem_free(rp->r_unlname, MAXNAMELEN);
2890 2897                                  rp->r_unlname = tmpname;
2891 2898                          }
2892 2899                          mutex_exit(&rp->r_statelock);
2893 2900                  }
2894 2901          } else {
2895 2902                  mutex_exit(&rp->r_statelock);
2896 2903                  /*
2897 2904                   * We need to flush any dirty pages which happen to
2898 2905                   * be hanging around before removing the file.  This
2899 2906                   * shouldn't happen very often and mostly on file
2900 2907                   * systems mounted "nocto".
2901 2908                   */
2902 2909                  if (vn_has_cached_data(vp) &&
2903 2910                      ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2904 2911                          error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
                                   /*
                                    * Only ENOSPC/EDQUOT are saved in r_error, and only
                                    * if no error is saved there already.  The value in
                                    * error is overwritten by rfs3call() below.
                                    */
2905 2912                          if (error && (error == ENOSPC || error == EDQUOT)) {
2906 2913                                  mutex_enter(&rp->r_statelock);
2907 2914                                  if (!rp->r_error)
2908 2915                                          rp->r_error = error;
2909 2916                                  mutex_exit(&rp->r_statelock);
2910 2917                          }
2911 2918                  }
2912 2919  
2913 2920                  setdiropargs3(&args.object, nm, dvp);
2914 2921  
2915 2922                  douprintf = 1;
2916 2923  
                           /* timestamp taken before the call; used when caching */
2917 2924                  t = gethrtime();
2918 2925  
2919 2926                  error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2920 2927                      xdr_diropargs3, (caddr_t)&args,
2921 2928                      xdr_REMOVE3res, (caddr_t)&res, cr,
2922 2929                      &douprintf, &res.status, 0, NULL);
2923 2930  
2924 2931                  /*
2925 2932                   * The xattr dir may be gone after last attr is removed,
2926 2933                   * so flush it from dnlc.
2927 2934                   */
2928 2935                  if (dvp->v_flag & V_XATTRDIR)
2929 2936                          dnlc_purge_vp(dvp);
2930 2937  
                           /* the target's cached attributes are dropped in every case */
2931 2938                  PURGE_ATTRCACHE(vp);
2932 2939  
2933 2940                  if (error) {
2934 2941                          PURGE_ATTRCACHE(dvp);
2935 2942                  } else {
2936 2943                          error = geterrno3(res.status);
2937 2944                          if (!error) {
2938 2945                                  nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2939 2946                                      cr);
2940 2947                                  if (HAVE_RDDIR_CACHE(drp))
2941 2948                                          nfs_purge_rddir_cache(dvp);
2942 2949                          } else {
2943 2950                                  nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2944 2951                                      t, cr);
2945 2952                                  PURGE_STALE_FH(error, dvp, cr);
2946 2953                          }
2947 2954                  }
2948 2955          }
2949 2956  
                   /* event is posted for both the rename path and the real remove */
2950 2957          if (error == 0) {
2951 2958                  vnevent_remove(vp, dvp, nm, ct);
2952 2959          }
2953 2960          VN_RELE(vp);
2954 2961  
2955 2962          nfs_rw_exit(&drp->r_rwlock);
2956 2963  
2957 2964          return (error);
2958 2965  }
2959 2966  
           /*
            * Create a hard link named tnm in directory tdvp that refers to svp,
            * using an NFS Version 3 LINK request.
            *
            * If VOP_REALVP() succeeds on svp, the vnode it returns is used as the
            * link source.  The request is sent through the mntinfo of the source
            * vnode.  tdvp's r_rwlock is held as a writer across the request.
            *
            * When the server answers with a status that maps to EOPNOTSUPP, the
            * MI_LINK flag is cleared in the mntinfo so that later calls return
            * EOPNOTSUPP from the flag check without sending a request.
            *
            * vnevent_link() is called for the source after a successful link.
            * The flags argument is not used.
            *
            * Returns 0 on success; EPERM when called from a zone other than the
            * mount's; EOPNOTSUPP when MI_LINK is clear; EINTR when
            * nfs_rw_enter_sig() on the directory lock returns nonzero; otherwise
            * the error from rfs3call() or the errno mapped from the LINK status.
            */
2960 2967  /* ARGSUSED */
2961 2968  static int
2962 2969  nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2963 2970          caller_context_t *ct, int flags)
2964 2971  {
2965 2972          int error;
2966 2973          LINK3args args;
2967 2974          LINK3res res;
2968 2975          vnode_t *realvp;
2969 2976          int douprintf;
2970 2977          mntinfo_t *mi;
2971 2978          rnode_t *tdrp;
2972 2979          hrtime_t t;
2973 2980  
2974 2981          if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2975 2982                  return (EPERM);
                   /* prefer the real vnode when svp has one */
2976 2983          if (VOP_REALVP(svp, &realvp, ct) == 0)
2977 2984                  svp = realvp;
2978 2985  
2979 2986          mi = VTOMI(svp);
2980 2987  
                   /* MI_LINK is cleared below once the server reports no LINK support */
2981 2988          if (!(mi->mi_flags & MI_LINK))
2982 2989                  return (EOPNOTSUPP);
2983 2990  
2984 2991          args.file = *VTOFH3(svp);
2985 2992          setdiropargs3(&args.link, tnm, tdvp);
2986 2993  
2987 2994          tdrp = VTOR(tdvp);
2988 2995          if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2989 2996                  return (EINTR);
2990 2997  
                   /* drop any name cache entry for the target name before the request */
2991 2998          dnlc_remove(tdvp, tnm);
2992 2999  
2993 3000          douprintf = 1;
2994 3001  
                   /* timestamp taken before the call; used when caching the reply */
2995 3002          t = gethrtime();
2996 3003  
2997 3004          error = rfs3call(mi, NFSPROC3_LINK,
2998 3005              xdr_LINK3args, (caddr_t)&args,
2999 3006              xdr_LINK3res, (caddr_t)&res, cr,
3000 3007              &douprintf, &res.status, 0, NULL);
3001 3008  
3002 3009          if (error) {
                           /* no reply to cache from: drop cached attributes of both */
3003 3010                  PURGE_ATTRCACHE(tdvp);
3004 3011                  PURGE_ATTRCACHE(svp);
3005 3012                  nfs_rw_exit(&tdrp->r_rwlock);
3006 3013                  return (error);
3007 3014          }
3008 3015  
3009 3016          error = geterrno3(res.status);
3010 3017  
3011 3018          if (!error) {
3012 3019                  nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
3013 3020                  nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
3014 3021                  if (HAVE_RDDIR_CACHE(tdrp))
3015 3022                          nfs_purge_rddir_cache(tdvp);
                           /* enter the new name for svp into the name cache */
3016 3023                  dnlc_update(tdvp, tnm, svp);
3017 3024          } else {
3018 3025                  nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
3019 3026                      cr);
3020 3027                  nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
3021 3028                  if (error == EOPNOTSUPP) {
                                   /* remember the lack of LINK support, under mi_lock */
3022 3029                          mutex_enter(&mi->mi_lock);
3023 3030                          mi->mi_flags &= ~MI_LINK;
3024 3031                          mutex_exit(&mi->mi_lock);
3025 3032                  }
3026 3033          }
3027 3034  
3028 3035          nfs_rw_exit(&tdrp->r_rwlock);
3029 3036  
3030 3037          if (!error) {
3031 3038                  /*
3032 3039                   * Notify the source file of this link operation.
3033 3040                   */
3034 3041                  vnevent_link(svp, ct);
3035 3042          }
3036 3043          return (error);
3037 3044  }
3038 3045  
           /*
            * Rename onm in directory odvp to nnm in directory ndvp.
            *
            * This is a thin front end for nfs3rename(): it rejects callers from a
            * zone other than the mount's with EPERM, substitutes the vnode
            * returned by VOP_REALVP() for ndvp when that call succeeds, and then
            * passes everything except flags on to nfs3rename(), whose result is
            * returned unchanged.  The flags argument is not used.
            */
3039 3046  /* ARGSUSED */
3040 3047  static int
3041 3048  nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3042 3049          caller_context_t *ct, int flags)
3043 3050  {
3044 3051          vnode_t *realvp;
3045 3052  
3046 3053          if (nfs_zone() != VTOMI(odvp)->mi_zone)
3047 3054                  return (EPERM);
                   /* prefer the real vnode when ndvp has one */
3048 3055          if (VOP_REALVP(ndvp, &realvp, ct) == 0)
3049 3056                  ndvp = realvp;
3050 3057  
3051 3058          return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3052 3059  }
3053 3060  
3054 3061  /*
3055 3062   * nfs3rename does the real work of renaming in NFS Version 3.
3056 3063   */
3057 3064  static int
3058 3065  nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3059 3066      caller_context_t *ct)
3060 3067  {
3061 3068          int error;
3062 3069          RENAME3args args;
3063 3070          RENAME3res res;
3064 3071          int douprintf;
3065 3072          vnode_t *nvp = NULL;
3066 3073          vnode_t *ovp = NULL;
3067 3074          char *tmpname;
3068 3075          rnode_t *rp;
3069 3076          rnode_t *odrp;
3070 3077          rnode_t *ndrp;
3071 3078          hrtime_t t;
3072 3079  
3073 3080          ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
3074 3081  
3075 3082          if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
3076 3083              strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
3077 3084                  return (EINVAL);
3078 3085  
3079 3086          odrp = VTOR(odvp);
3080 3087          ndrp = VTOR(ndvp);
3081 3088          if ((intptr_t)odrp < (intptr_t)ndrp) {
3082 3089                  if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
3083 3090                          return (EINTR);
3084 3091                  if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
3085 3092                          nfs_rw_exit(&odrp->r_rwlock);
3086 3093                          return (EINTR);
3087 3094                  }
3088 3095          } else {
3089 3096                  if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
3090 3097                          return (EINTR);
3091 3098                  if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
3092 3099                          nfs_rw_exit(&ndrp->r_rwlock);
3093 3100                          return (EINTR);
3094 3101                  }
3095 3102          }
3096 3103  
3097 3104          /*
3098 3105           * Lookup the target file.  If it exists, it needs to be
3099 3106           * checked to see whether it is a mount point and whether
3100 3107           * it is active (open).
3101 3108           */
3102 3109          error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
3103 3110          if (!error) {
3104 3111                  /*
3105 3112                   * If this file has been mounted on, then just
3106 3113                   * return busy because renaming to it would remove
3107 3114                   * the mounted file system from the name space.
3108 3115                   */
3109 3116                  if (vn_mountedvfs(nvp) != NULL) {
3110 3117                          VN_RELE(nvp);
3111 3118                          nfs_rw_exit(&odrp->r_rwlock);
3112 3119                          nfs_rw_exit(&ndrp->r_rwlock);
3113 3120                          return (EBUSY);
3114 3121                  }
3115 3122  
3116 3123                  /*
3117 3124                   * Purge the name cache of all references to this vnode
3118 3125                   * so that we can check the reference count to infer
3119 3126                   * whether it is active or not.
3120 3127                   */
3121 3128                  /*
3122 3129                   * First just remove the entry from the name cache, as it
3123 3130                   * is most likely the only entry for this vp.
3124 3131                   */
3125 3132                  dnlc_remove(ndvp, nnm);
3126 3133                  /*
3127 3134                   * If the file has a v_count > 1 then there may be more
3128 3135                   * than one entry in the name cache due to multiple links
3129 3136                   * or an open file, but we don't have the real reference
3130 3137                   * count so flush all possible entries.
3131 3138                   */
3132 3139                  if (nvp->v_count > 1)
3133 3140                          dnlc_purge_vp(nvp);
3134 3141  
3135 3142                  /*
3136 3143                   * If the vnode is active and is not a directory,
3137 3144                   * arrange to rename it to a
3138 3145                   * temporary file so that it will continue to be
3139 3146                   * accessible.  This implements the "unlink-open-file"
3140 3147                   * semantics for the target of a rename operation.
3141 3148                   * Before doing this though, make sure that the
3142 3149                   * source and target files are not already the same.
3143 3150                   */
3144 3151                  if (nvp->v_count > 1 && nvp->v_type != VDIR) {
3145 3152                          /*
3146 3153                           * Lookup the source name.
3147 3154                           */
3148 3155                          error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
3149 3156                              cr, 0);
3150 3157  
3151 3158                          /*
3152 3159                           * The source name *should* already exist.
3153 3160                           */
3154 3161                          if (error) {
3155 3162                                  VN_RELE(nvp);
3156 3163                                  nfs_rw_exit(&odrp->r_rwlock);
3157 3164                                  nfs_rw_exit(&ndrp->r_rwlock);
3158 3165                                  return (error);
3159 3166                          }
3160 3167  
3161 3168                          /*
3162 3169                           * Compare the two vnodes.  If they are the same,
3163 3170                           * just release all held vnodes and return success.
3164 3171                           */
3165 3172                          if (ovp == nvp) {
3166 3173                                  VN_RELE(ovp);
3167 3174                                  VN_RELE(nvp);
3168 3175                                  nfs_rw_exit(&odrp->r_rwlock);
3169 3176                                  nfs_rw_exit(&ndrp->r_rwlock);
3170 3177                                  return (0);
3171 3178                          }
3172 3179  
3173 3180                          /*
3174 3181                           * Can't mix and match directories and non-
3175 3182                           * directories in rename operations.  We already
3176 3183                           * know that the target is not a directory.  If
3177 3184                           * the source is a directory, return an error.
3178 3185                           */
3179 3186                          if (ovp->v_type == VDIR) {
3180 3187                                  VN_RELE(ovp);
3181 3188                                  VN_RELE(nvp);
3182 3189                                  nfs_rw_exit(&odrp->r_rwlock);
3183 3190                                  nfs_rw_exit(&ndrp->r_rwlock);
3184 3191                                  return (ENOTDIR);
3185 3192                          }
3186 3193  
3187 3194                          /*
3188 3195                           * The target file exists, is not the same as
3189 3196                           * the source file, and is active.  Link it
3190 3197                           * to a temporary filename to avoid having
3191 3198                           * the server removing the file completely.
3192 3199                           */
3193 3200                          tmpname = newname();
3194 3201                          error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
3195 3202                          if (error == EOPNOTSUPP) {
3196 3203                                  error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
3197 3204                                      cr, NULL, 0);
3198 3205                          }
3199 3206                          if (error) {
3200 3207                                  kmem_free(tmpname, MAXNAMELEN);
3201 3208                                  VN_RELE(ovp);
3202 3209                                  VN_RELE(nvp);
3203 3210                                  nfs_rw_exit(&odrp->r_rwlock);
3204 3211                                  nfs_rw_exit(&ndrp->r_rwlock);
3205 3212                                  return (error);
3206 3213                          }
3207 3214                          rp = VTOR(nvp);
3208 3215                          mutex_enter(&rp->r_statelock);
3209 3216                          if (rp->r_unldvp == NULL) {
3210 3217                                  VN_HOLD(ndvp);
3211 3218                                  rp->r_unldvp = ndvp;
3212 3219                                  if (rp->r_unlcred != NULL)
3213 3220                                          crfree(rp->r_unlcred);
3214 3221                                  crhold(cr);
3215 3222                                  rp->r_unlcred = cr;
3216 3223                                  rp->r_unlname = tmpname;
3217 3224                          } else {
3218 3225                                  kmem_free(rp->r_unlname, MAXNAMELEN);
3219 3226                                  rp->r_unlname = tmpname;
3220 3227                          }
3221 3228                          mutex_exit(&rp->r_statelock);
3222 3229                  }
3223 3230          }
3224 3231  
3225 3232          if (ovp == NULL) {
3226 3233                  /*
3227 3234                   * When renaming directories to be a subdirectory of a
3228 3235                   * different parent, the dnlc entry for ".." will no
3229 3236                   * longer be valid, so it must be removed.
3230 3237                   *
3231 3238                   * We do a lookup here to determine whether we are renaming
3232 3239                   * a directory and we need to check if we are renaming
3233 3240                   * an unlinked file.  This might have already been done
3234 3241                   * in previous code, so we check ovp == NULL to avoid
3235 3242                   * doing it twice.
3236 3243                   */
3237 3244  
3238 3245                  error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
3239 3246                  /*
3240 3247                   * The source name *should* already exist.
3241 3248                   */
3242 3249                  if (error) {
3243 3250                          nfs_rw_exit(&odrp->r_rwlock);
3244 3251                          nfs_rw_exit(&ndrp->r_rwlock);
3245 3252                          if (nvp) {
3246 3253                                  VN_RELE(nvp);
3247 3254                          }
3248 3255                          return (error);
3249 3256                  }
3250 3257                  ASSERT(ovp != NULL);
3251 3258          }
3252 3259  
3253 3260          dnlc_remove(odvp, onm);
3254 3261          dnlc_remove(ndvp, nnm);
3255 3262  
3256 3263          setdiropargs3(&args.from, onm, odvp);
3257 3264          setdiropargs3(&args.to, nnm, ndvp);
3258 3265  
3259 3266          douprintf = 1;
3260 3267  
3261 3268          t = gethrtime();
3262 3269  
3263 3270          error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
3264 3271              xdr_RENAME3args, (caddr_t)&args,
3265 3272              xdr_RENAME3res, (caddr_t)&res, cr,
3266 3273              &douprintf, &res.status, 0, NULL);
3267 3274  
3268 3275          if (error) {
3269 3276                  PURGE_ATTRCACHE(odvp);
3270 3277                  PURGE_ATTRCACHE(ndvp);
3271 3278                  VN_RELE(ovp);
3272 3279                  nfs_rw_exit(&odrp->r_rwlock);
3273 3280                  nfs_rw_exit(&ndrp->r_rwlock);
3274 3281                  if (nvp) {
3275 3282                          VN_RELE(nvp);
3276 3283                  }
3277 3284                  return (error);
3278 3285          }
3279 3286  
3280 3287          error = geterrno3(res.status);
3281 3288  
3282 3289          if (!error) {
3283 3290                  nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
3284 3291                  if (HAVE_RDDIR_CACHE(odrp))
3285 3292                          nfs_purge_rddir_cache(odvp);
3286 3293                  if (ndvp != odvp) {
3287 3294                          nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
3288 3295                          if (HAVE_RDDIR_CACHE(ndrp))
3289 3296                                  nfs_purge_rddir_cache(ndvp);
3290 3297                  }
3291 3298                  /*
3292 3299                   * when renaming directories to be a subdirectory of a
3293 3300                   * different parent, the dnlc entry for ".." will no
3294 3301                   * longer be valid, so it must be removed
3295 3302                   */
3296 3303                  rp = VTOR(ovp);
3297 3304                  if (ndvp != odvp) {
3298 3305                          if (ovp->v_type == VDIR) {
3299 3306                                  dnlc_remove(ovp, "..");
3300 3307                                  if (HAVE_RDDIR_CACHE(rp))
3301 3308                                          nfs_purge_rddir_cache(ovp);
3302 3309                          }
3303 3310                  }
3304 3311  
3305 3312                  /*
3306 3313                   * If we are renaming the unlinked file, update the
3307 3314                   * r_unldvp and r_unlname as needed.
3308 3315                   */
3309 3316                  mutex_enter(&rp->r_statelock);
3310 3317                  if (rp->r_unldvp != NULL) {
3311 3318                          if (strcmp(rp->r_unlname, onm) == 0) {
3312 3319                                  (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
3313 3320                                  rp->r_unlname[MAXNAMELEN - 1] = '\0';
3314 3321  
3315 3322                                  if (ndvp != rp->r_unldvp) {
3316 3323                                          VN_RELE(rp->r_unldvp);
3317 3324                                          rp->r_unldvp = ndvp;
3318 3325                                          VN_HOLD(ndvp);
3319 3326                                  }
3320 3327                          }
3321 3328                  }
3322 3329                  mutex_exit(&rp->r_statelock);
3323 3330          } else {
3324 3331                  nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
3325 3332                  if (ndvp != odvp) {
3326 3333                          nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
3327 3334                              cr);
3328 3335                  }
3329 3336                  /*
3330 3337                   * System V defines rename to return EEXIST, not
3331 3338                   * ENOTEMPTY if the target directory is not empty.
3332 3339                   * Over the wire, the error is NFSERR_ENOTEMPTY
3333 3340                   * which geterrno maps to ENOTEMPTY.
3334 3341                   */
3335 3342                  if (error == ENOTEMPTY)
3336 3343                          error = EEXIST;
3337 3344          }
3338 3345  
3339 3346          if (error == 0) {
3340 3347                  if (nvp)
3341 3348                          vnevent_rename_dest(nvp, ndvp, nnm, ct);
3342 3349  
3343 3350                  if (odvp != ndvp)
3344 3351                          vnevent_rename_dest_dir(ndvp, ct);
3345 3352                  ASSERT(ovp != NULL);
3346 3353                  vnevent_rename_src(ovp, odvp, onm, ct);
3347 3354          }
3348 3355  
3349 3356          if (nvp) {
3350 3357                  VN_RELE(nvp);
3351 3358          }
3352 3359          VN_RELE(ovp);
3353 3360  
3354 3361          nfs_rw_exit(&odrp->r_rwlock);
3355 3362          nfs_rw_exit(&ndrp->r_rwlock);
3356 3363  
3357 3364          return (error);
3358 3365  }
3359 3366  
3360 3367  /* ARGSUSED */
           /*
            * Create the directory "nm" in the directory "dvp" by issuing an
            * NFSPROC3_MKDIR call to the server.
            *
            * On success, 0 is returned and the vnode for the new directory is
            * stored in *vpp.  That vnode comes from nfs3lookup() or
            * makenfs3node(); the caller presumably owns the reference and must
            * release it.
            *
            * Errors:
            *      EPERM   the caller's zone is not the zone of the mount
            *      EINTR   acquiring the directory's r_rwlock was interrupted
            *      other   whatever setdirgid(), setdirmode(), vattr_to_sattr3(),
            *              rfs3call(), nfs3lookup() or nfs3getattr() returned, or
            *              the errno that geterrno3() maps the reply status to
            *
            * Note that "va" is modified: its gid and mode are handed by pointer
            * to setdirgid()/setdirmode(), AT_MODE|AT_GID are or'ed into va_mask,
            * and va_mask is overwritten with AT_GID if a follow-up setattr is
            * needed.  The "ct", "flags" and "vsecp" arguments are not used.
            */
3361 3368  static int
3362 3369  nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3363 3370          caller_context_t *ct, int flags, vsecattr_t *vsecp)
3364 3371  {
3365 3372          int error;
3366 3373          MKDIR3args args;
3367 3374          MKDIR3res res;
3368 3375          int douprintf;
3369 3376          struct vattr vattr;
3370 3377          vnode_t *vp;
3371 3378          rnode_t *drp;
3372 3379          hrtime_t t;
3373 3380  
                   /*
                    * Refuse the request unless it is made from the zone that
                    * the mount belongs to.
                    */
3374 3381          if (nfs_zone() != VTOMI(dvp)->mi_zone)
3375 3382                  return (EPERM);
3376 3383          setdiropargs3(&args.where, nm, dvp);
3377 3384  
3378 3385          /*
3379 3386           * Decide what the group-id and set-gid bit of the created directory
3380 3387           * should be.  May have to do a setattr to get the gid right.
3381 3388           */
3382 3389          error = setdirgid(dvp, &va->va_gid, cr);
3383 3390          if (error)
3384 3391                  return (error);
3385 3392          error = setdirmode(dvp, &va->va_mode, cr);
3386 3393          if (error)
3387 3394                  return (error);
3388 3395          va->va_mask |= AT_MODE|AT_GID;
3389 3396  
3390 3397          error = vattr_to_sattr3(va, &args.attributes);
3391 3398          if (error) {
3392 3399                  /* req time field(s) overflow - return immediately */
3393 3400                  return (error);
3394 3401          }
3395 3402  
                   /*
                    * Take the directory's rnode rwlock as a writer for the
                    * duration of the operation.  A non-zero return from
                    * nfs_rw_enter_sig() is reported to the caller as EINTR.
                    */
3396 3403          drp = VTOR(dvp);
3397 3404          if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3398 3405                  return (EINTR);
3399 3406  
                   /*
                    * Drop any existing name cache entry for "nm" in this
                    * directory before going over the wire.
                    */
3400 3407          dnlc_remove(dvp, nm);
3401 3408  
3402 3409          douprintf = 1;
3403 3410  
                   /*
                    * Timestamp taken before the call is issued; it is passed to
                    * nfs3_cache_wcc_data() and makenfs3node() below.
                    */
3404 3411          t = gethrtime();
3405 3412  
3406 3413          error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3407 3414              xdr_MKDIR3args, (caddr_t)&args,
3408 3415              xdr_MKDIR3res, (caddr_t)&res, cr,
3409 3416              &douprintf, &res.status, 0, NULL);
3410 3417  
                   /*
                    * The call itself failed (as opposed to the server returning
                    * an NFS error status): invalidate the cached attributes of
                    * the directory and give up.
                    */
3411 3418          if (error) {
3412 3419                  PURGE_ATTRCACHE(dvp);
3413 3420                  nfs_rw_exit(&drp->r_rwlock);
3414 3421                  return (error);
3415 3422          }
3416 3423  
                   /* Translate the NFS status in the reply into an errno. */
3417 3424          error = geterrno3(res.status);
3418 3425          if (!error) {
3419 3426                  nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3420 3427                  if (HAVE_RDDIR_CACHE(drp))
3421 3428                          nfs_purge_rddir_cache(dvp);
3422 3429  
                           /*
                            * The reply is not required to carry a file handle
                            * for the new directory.  If it did not, find the
                            * directory by name instead.
                            */
3423 3430                  if (!res.resok.obj.handle_follows) {
3424 3431                          error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3425 3432                          if (error) {
3426 3433                                  nfs_rw_exit(&drp->r_rwlock);
3427 3434                                  return (error);
3428 3435                          }
3429 3436                  } else {
3430 3437                          if (res.resok.obj_attributes.attributes) {
3431 3438                                  vp = makenfs3node(&res.resok.obj.handle,
3432 3439                                      &res.resok.obj_attributes.attr,
3433 3440                                      dvp->v_vfsp, t, cr, NULL, NULL);
3434 3441                          } else {
3435 3442                                  vp = makenfs3node(&res.resok.obj.handle, NULL,
3436 3443                                      dvp->v_vfsp, t, cr, NULL, NULL);
                                           /*
                                            * No attributes came back, so the
                                            * vnode type may still be unset.
                                            * Ask the server for it.
                                            */
3437 3444                                  if (vp->v_type == VNON) {
3438 3445                                          vattr.va_mask = AT_TYPE;
3439 3446                                          error = nfs3getattr(vp, &vattr, cr);
3440 3447                                          if (error) {
3441 3448                                                  VN_RELE(vp);
3442 3449                                                  nfs_rw_exit(&drp->r_rwlock);
3443 3450                                                  return (error);
3444 3451                                          }
3445 3452                                          vp->v_type = vattr.va_type;
3446 3453                                  }
3447 3454                          }
3448 3455                          dnlc_update(dvp, nm, vp);
3449 3456                  }
                           /*
                            * If the gid recorded in the new node's cached
                            * attributes is not the one that was requested, try
                            * to correct it with a setattr of the gid alone.
                            * The result of that setattr is deliberately ignored.
                            */
3450 3457                  if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3451 3458                          va->va_mask = AT_GID;
3452 3459                          (void) nfs3setattr(vp, va, 0, cr);
3453 3460                  }
3454 3461                  *vpp = vp;
3455 3462          } else {
                           /*
                            * The server rejected the request.  Cache the
                            * directory wcc data from the failure reply;
                            * PURGE_STALE_FH presumably deals with a stale
                            * directory file handle (verify against the macro).
                            */
3456 3463                  nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3457 3464                  PURGE_STALE_FH(error, dvp, cr);
3458 3465          }
3459 3466  
3460 3467          nfs_rw_exit(&drp->r_rwlock);
3461 3468  
3462 3469          return (error);
3463 3470  }
3464 3471  
3465 3472  /* ARGSUSED */
           /*
            * Remove the directory "nm" from the directory "dvp" by issuing an
            * NFSPROC3_RMDIR call to the server.
            *
            * "cdir" is compared against the vnode that "nm" resolves to; if they
            * are the same vnode the request is refused with EINVAL.
            *
            * Errors:
            *      EPERM   the caller's zone is not the zone of the mount
            *      EINTR   acquiring the directory's r_rwlock was interrupted
            *      EINVAL  "nm" resolves to "cdir"
            *      EEXIST  the server reported ENOTEMPTY (System V semantics)
            *      other   whatever nfs3lookup() or rfs3call() returned, or the
            *              errno that geterrno3() maps the reply status to
            *
            * On success a vnevent_rmdir() is posted for the removed directory.
            * The "flags" argument is not used.
            */
3466 3473  static int
3467 3474  nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3468 3475          caller_context_t *ct, int flags)
3469 3476  {
3470 3477          int error;
3471 3478          RMDIR3args args;
3472 3479          RMDIR3res res;
3473 3480          vnode_t *vp;
3474 3481          int douprintf;
3475 3482          rnode_t *drp;
3476 3483          hrtime_t t;
3477 3484  
                   /*
                    * Refuse the request unless it is made from the zone that
                    * the mount belongs to.
                    */
3478 3485          if (nfs_zone() != VTOMI(dvp)->mi_zone)
3479 3486                  return (EPERM);
                   /*
                    * Take the parent directory's rnode rwlock as a writer for
                    * the duration of the operation.
                    */
3480 3487          drp = VTOR(dvp);
3481 3488          if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3482 3489                  return (EINTR);
3483 3490  
3484 3491          /*
3485 3492           * Attempt to prevent a rmdir(".") from succeeding.
3486 3493           */
3487 3494          error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3488 3495          if (error) {
3489 3496                  nfs_rw_exit(&drp->r_rwlock);
3490 3497                  return (error);
3491 3498          }
3492 3499  
                   /*
                    * From here on "vp" carries the reference obtained by the
                    * lookup; every exit path below releases it.
                    */
3493 3500          if (vp == cdir) {
3494 3501                  VN_RELE(vp);
3495 3502                  nfs_rw_exit(&drp->r_rwlock);
3496 3503                  return (EINVAL);
3497 3504          }
3498 3505  
3499 3506          setdiropargs3(&args.object, nm, dvp);
3500 3507  
3501 3508          /*
3502 3509           * First just remove the entry from the name cache, as it
3503 3510           * is most likely an entry for this vp.
3504 3511           */
3505 3512          dnlc_remove(dvp, nm);
3506 3513  
3507 3514          /*
3508 3515           * If the vnode reference count is greater than one, then
3509 3516           * there may be additional references in the DNLC which will
3510 3517           * need to be purged.  First, try removing the entry for
3511 3518           * the parent directory and see if that removes the additional
3512 3519           * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3513 3520           * to completely remove any references to the directory which
3514 3521           * might still exist in the DNLC.
3515 3522           */
3516 3523          if (vp->v_count > 1) {
3517 3524                  dnlc_remove(vp, "..");
3518 3525                  if (vp->v_count > 1)
3519 3526                          dnlc_purge_vp(vp);
3520 3527          }
3521 3528  
3522 3529          douprintf = 1;
3523 3530  
                   /*
                    * Timestamp taken before the call is issued; it is passed to
                    * nfs3_cache_wcc_data() below.
                    */
3524 3531          t = gethrtime();
3525 3532  
3526 3533          error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3527 3534              xdr_diropargs3, (caddr_t)&args,
3528 3535              xdr_RMDIR3res, (caddr_t)&res, cr,
3529 3536              &douprintf, &res.status, 0, NULL);
3530 3537  
                   /*
                    * The cached attributes of the target directory are
                    * invalidated no matter how the call turned out.
                    */
3531 3538          PURGE_ATTRCACHE(vp);
3532 3539  
                   /*
                    * The call itself failed (as opposed to the server returning
                    * an NFS error status): also invalidate the cached attributes
                    * of the parent and give up.
                    */
3533 3540          if (error) {
3534 3541                  PURGE_ATTRCACHE(dvp);
3535 3542                  VN_RELE(vp);
3536 3543                  nfs_rw_exit(&drp->r_rwlock);
3537 3544                  return (error);
3538 3545          }
3539 3546  
                   /* Translate the NFS status in the reply into an errno. */
3540 3547          error = geterrno3(res.status);
3541 3548          if (!error) {
                           /*
                            * Success: cache the parent's wcc data and discard
                            * any cached readdir results for both the parent and
                            * the removed directory.
                            */
3542 3549                  nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3543 3550                  if (HAVE_RDDIR_CACHE(drp))
3544 3551                          nfs_purge_rddir_cache(dvp);
3545 3552                  if (HAVE_RDDIR_CACHE(VTOR(vp)))
3546 3553                          nfs_purge_rddir_cache(vp);
3547 3554          } else {
3548 3555                  nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3549 3556                  PURGE_STALE_FH(error, dvp, cr);
3550 3557                  /*
3551 3558                   * System V defines rmdir to return EEXIST, not
3552 3559                   * ENOTEMPTY if the directory is not empty.  Over
3553 3560                   * the wire, the error is NFSERR_ENOTEMPTY which
3554 3561                   * geterrno maps to ENOTEMPTY.
3555 3562                   */
3556 3563                  if (error == ENOTEMPTY)
3557 3564                          error = EEXIST;
3558 3565          }
3559 3566  
                   /* Post the rmdir event only once the server has confirmed. */
3560 3567          if (error == 0) {
3561 3568                  vnevent_rmdir(vp, dvp, nm, ct);
3562 3569          }
3563 3570          VN_RELE(vp);
3564 3571  
3565 3572          nfs_rw_exit(&drp->r_rwlock);
3566 3573  
3567 3574          return (error);
3568 3575  }
3569 3576  
3570 3577  /* ARGSUSED */
           /*
            * Create the symbolic link "lnm" in the directory "dvp", with "tnm"
            * as the link contents and "tva" as the requested attributes, by
            * issuing an NFSPROC3_SYMLINK call to the server.
            *
            * If the reply carries a file handle, an rnode is instantiated for
            * the new link, entered into the DNLC and, when symlink caching is
            * enabled, primed with the link contents on a best-effort basis.  No
            * vnode is returned to the caller.
            *
            * Errors:
            *      EPERM           the caller's zone is not the zone of the mount
            *      EOPNOTSUPP      MI_SYMLINK is not set for this mount
            *      EINTR           acquiring the directory's r_rwlock was
            *                      interrupted
            *      other           whatever vattr_to_sattr3() or rfs3call()
            *                      returned, or the errno that geterrno3() maps the
            *                      reply status to
            *
            * If the server answers with EOPNOTSUPP, MI_SYMLINK is cleared on the
            * mount so that later calls fail locally.  The "ct" and "flags"
            * arguments are not used.
            */
3571 3578  static int
3572 3579  nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3573 3580          caller_context_t *ct, int flags)
3574 3581  {
3575 3582          int error;
3576 3583          SYMLINK3args args;
3577 3584          SYMLINK3res res;
3578 3585          int douprintf;
3579 3586          mntinfo_t *mi;
3580 3587          vnode_t *vp;
3581 3588          rnode_t *rp;
3582 3589          char *contents;
3583 3590          rnode_t *drp;
3584 3591          hrtime_t t;
3585 3592  
3586 3593          mi = VTOMI(dvp);
3587 3594  
                   /*
                    * Refuse the request unless it is made from the zone that
                    * the mount belongs to, and fail early if symlink creation
                    * has been marked as unsupported on this mount.
                    */
3588 3595          if (nfs_zone() != mi->mi_zone)
3589 3596                  return (EPERM);
3590 3597          if (!(mi->mi_flags & MI_SYMLINK))
3591 3598                  return (EOPNOTSUPP);
3592 3599  
3593 3600          setdiropargs3(&args.where, lnm, dvp);
3594 3601          error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3595 3602          if (error) {
3596 3603                  /* req time field(s) overflow - return immediately */
3597 3604                  return (error);
3598 3605          }
3599 3606          args.symlink.symlink_data = tnm;
3600 3607  
                   /*
                    * Take the directory's rnode rwlock as a writer for the
                    * duration of the operation.
                    */
3601 3608          drp = VTOR(dvp);
3602 3609          if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3603 3610                  return (EINTR);
3604 3611  
                   /*
                    * Drop any existing name cache entry for "lnm" in this
                    * directory before going over the wire.
                    */
3605 3612          dnlc_remove(dvp, lnm);
3606 3613  
3607 3614          douprintf = 1;
3608 3615  
                   /*
                    * Timestamp taken before the call is issued; it is passed to
                    * nfs3_cache_wcc_data() and makenfs3node() below.
                    */
3609 3616          t = gethrtime();
3610 3617  
3611 3618          error = rfs3call(mi, NFSPROC3_SYMLINK,
3612 3619              xdr_SYMLINK3args, (caddr_t)&args,
3613 3620              xdr_SYMLINK3res, (caddr_t)&res, cr,
3614 3621              &douprintf, &res.status, 0, NULL);
3615 3622  
                   /*
                    * The call itself failed (as opposed to the server returning
                    * an NFS error status): invalidate the cached attributes of
                    * the directory and give up.
                    */
3616 3623          if (error) {
3617 3624                  PURGE_ATTRCACHE(dvp);
3618 3625                  nfs_rw_exit(&drp->r_rwlock);
3619 3626                  return (error);
3620 3627          }
3621 3628  
                   /* Translate the NFS status in the reply into an errno. */
3622 3629          error = geterrno3(res.status);
3623 3630          if (!error) {
3624 3631                  nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3625 3632                  if (HAVE_RDDIR_CACHE(drp))
3626 3633                          nfs_purge_rddir_cache(dvp);
3627 3634  
                           /*
                            * Everything below is optional cache priming and is
                            * only possible when the reply carried a file handle
                            * for the new link.
                            */
3628 3635                  if (res.resok.obj.handle_follows) {
3629 3636                          if (res.resok.obj_attributes.attributes) {
3630 3637                                  vp = makenfs3node(&res.resok.obj.handle,
3631 3638                                      &res.resok.obj_attributes.attr,
3632 3639                                      dvp->v_vfsp, t, cr, NULL, NULL);
3633 3640                          } else {
                                           /*
                                            * No attributes came back, but the
                                            * object was just created as a
                                            * symlink, so the type is set here.
                                            */
3634 3641                                  vp = makenfs3node(&res.resok.obj.handle, NULL,
3635 3642                                      dvp->v_vfsp, t, cr, NULL, NULL);
3636 3643                                  vp->v_type = VLNK;
3637 3644                                  vp->v_rdev = 0;
3638 3645                          }
3639 3646                          dnlc_update(dvp, lnm, vp);
3640 3647                          rp = VTOR(vp);
                                   /*
                                    * Prime the rnode's symlink cache with the
                                    * contents we just sent.  The buffer is
                                    * allocated with KM_NOSLEEP before taking
                                    * r_statelock; if the allocation fails the
                                    * cache is simply left empty.  The emptiness
                                    * check is repeated under the lock, and the
                                    * buffer is freed if another thread filled
                                    * the cache in the meantime.
                                    *
                                    * Only strlen(tnm) bytes are copied (no
                                    * terminating NUL); the length is kept in
                                    * r_symlink.len.
                                    *
                                    * NOTE(review): this assumes strlen(tnm) is
                                    * at most MAXPATHLEN; nothing here checks it.
                                    * Confirm that callers bound the length.
                                    */
3641 3648                          if (nfs3_do_symlink_cache &&
3642 3649                              rp->r_symlink.contents == NULL) {
3643 3650  
3644 3651                                  contents = kmem_alloc(MAXPATHLEN,
3645 3652                                      KM_NOSLEEP);
3646 3653  
3647 3654                                  if (contents != NULL) {
3648 3655                                          mutex_enter(&rp->r_statelock);
3649 3656                                          if (rp->r_symlink.contents == NULL) {
3650 3657                                                  rp->r_symlink.len = strlen(tnm);
3651 3658                                                  bcopy(tnm, contents,
3652 3659                                                      rp->r_symlink.len);
3653 3660                                                  rp->r_symlink.contents =
3654 3661                                                      contents;
3655 3662                                                  rp->r_symlink.size = MAXPATHLEN;
3656 3663                                                  mutex_exit(&rp->r_statelock);
3657 3664                                          } else {
3658 3665                                                  mutex_exit(&rp->r_statelock);
3659 3666                                                  kmem_free((void *)contents,
3660 3667                                                      MAXPATHLEN);
3661 3668                                          }
3662 3669                                  }
3663 3670                          }
                                   /*
                                    * The vnode is not handed back to the
                                    * caller, so drop the reference on it.
                                    */
3664 3671                          VN_RELE(vp);
3665 3672                  }
3666 3673          } else {
3667 3674                  nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3668 3675                  PURGE_STALE_FH(error, dvp, cr);
                           /*
                            * The server does not support symlinks: remember
                            * that on the mount so the check at the top of this
                            * function rejects further attempts locally.
                            */
3669 3676                  if (error == EOPNOTSUPP) {
3670 3677                          mutex_enter(&mi->mi_lock);
3671 3678                          mi->mi_flags &= ~MI_SYMLINK;
3672 3679                          mutex_exit(&mi->mi_lock);
3673 3680                  }
3674 3681          }
3675 3682  
3676 3683          nfs_rw_exit(&drp->r_rwlock);
3677 3684  
3678 3685          return (error);
3679 3686  }
3680 3687  
3681 3688  #ifdef DEBUG
           /*
            * Debug-only statistics maintained by nfs3_readdir():
            *
            *      nfs3_readdir_cache_hits    a complete cache entry was found
            *                                 without a miss earlier in the call
            *      nfs3_readdir_cache_shorts  the request was at the cached end
            *                                 of directory cookie and was
            *                                 answered without a lookup
            *      nfs3_readdir_cache_waits   the caller had to wait for an entry
            *                                 that another thread was filling
            *      nfs3_readdir_cache_misses  the entry had to be filled
            *      nfs3_readdir_readahead     an asynchronous readahead was
            *                                 started
            *
            * They are plain ints incremented without atomic operations.
            */
3682 3689  static int nfs3_readdir_cache_hits = 0;
3683 3690  static int nfs3_readdir_cache_shorts = 0;
3684 3691  static int nfs3_readdir_cache_waits = 0;
3685 3692  static int nfs3_readdir_cache_misses = 0;
3686 3693  static int nfs3_readdir_readahead = 0;
3687 3694  #endif
3688 3695  
           /*
            * When non-zero, nfs3_readdir() limits the size of each request to
            * 1024 bytes instead of MAXBSIZE, as a workaround for servers that
            * mishandle large READDIR/READDIRPLUS requests.
            */
3689 3696  static int nfs3_shrinkreaddir = 0;
3690 3697  
3691 3698  /*
3692 3699   * Read directory entries.
3693 3700   * There are some weird things to look out for here.  The uio_loffset
3694 3701   * field is either 0 or it is the offset returned from a previous
3695 3702   * readdir.  It is an opaque value used by the server to find the
3696 3703   * correct directory block to read. The count field is the number
3697 3704   * of blocks to read on the server.  This is advisory only, the server
3698 3705   * may return only one block's worth of entries.  Entries may be compressed
3699 3706   * on the server.
3700 3707   */
3701 3708  /* ARGSUSED */
3702 3709  static int
3703 3710  nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3704 3711          caller_context_t *ct, int flags)
3705 3712  {
3706 3713          int error;
3707 3714          size_t count;
3708 3715          rnode_t *rp;
3709 3716          rddir_cache *rdc;
3710 3717          rddir_cache *nrdc;
3711 3718          rddir_cache *rrdc;
3712 3719  #ifdef DEBUG
3713 3720          int missed;
3714 3721  #endif
3715 3722          int doreadahead;
3716 3723          rddir_cache srdc;
3717 3724          avl_index_t where;
3718 3725  
3719 3726          if (nfs_zone() != VTOMI(vp)->mi_zone)
3720 3727                  return (EIO);
3721 3728          rp = VTOR(vp);
3722 3729  
3723 3730          ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3724 3731  
3725 3732          /*
3726 3733           * Make sure that the directory cache is valid.
3727 3734           */
3728 3735          if (HAVE_RDDIR_CACHE(rp)) {
3729 3736                  if (nfs_disable_rddir_cache) {
3730 3737                          /*
3731 3738                           * Setting nfs_disable_rddir_cache in /etc/system
3732 3739                           * allows interoperability with servers that do not
3733 3740                           * properly update the attributes of directories.
3734 3741                           * Any cached information gets purged before an
3735 3742                           * access is made to it.
3736 3743                           */
3737 3744                          nfs_purge_rddir_cache(vp);
3738 3745                  } else {
3739 3746                          error = nfs3_validate_caches(vp, cr);
3740 3747                          if (error)
3741 3748                                  return (error);
3742 3749                  }
3743 3750          }
3744 3751  
3745 3752          /*
3746 3753           * It is possible that some servers may not be able to correctly
3747 3754           * handle a large READDIR or READDIRPLUS request due to bugs in
3748 3755           * their implementation.  In order to continue to interoperate
3749 3756           * with them, this workaround is provided to limit the maximum
3750 3757           * size of a READDIRPLUS request to 1024.  In any case, the request
3751 3758           * size is limited to MAXBSIZE.
3752 3759           */
3753 3760          count = MIN(uiop->uio_iov->iov_len,
3754 3761              nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3755 3762  
3756 3763          nrdc = NULL;
3757 3764  #ifdef DEBUG
3758 3765          missed = 0;
3759 3766  #endif
3760 3767  top:
3761 3768          /*
3762 3769           * Short circuit last readdir which always returns 0 bytes.
3763 3770           * This can be done after the directory has been read through
3764 3771           * completely at least once.  This will set r_direof which
3765 3772           * can be used to find the value of the last cookie.
3766 3773           */
3767 3774          mutex_enter(&rp->r_statelock);
3768 3775          if (rp->r_direof != NULL &&
3769 3776              uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3770 3777                  mutex_exit(&rp->r_statelock);
3771 3778  #ifdef DEBUG
3772 3779                  nfs3_readdir_cache_shorts++;
3773 3780  #endif
3774 3781                  if (eofp)
3775 3782                          *eofp = 1;
3776 3783                  if (nrdc != NULL)
3777 3784                          rddir_cache_rele(nrdc);
3778 3785                  return (0);
3779 3786          }
3780 3787          /*
3781 3788           * Look for a cache entry.  Cache entries are identified
3782 3789           * by the NFS cookie value and the byte count requested.
3783 3790           */
3784 3791          srdc.nfs3_cookie = uiop->uio_loffset;
3785 3792          srdc.buflen = count;
3786 3793          rdc = avl_find(&rp->r_dir, &srdc, &where);
3787 3794          if (rdc != NULL) {
3788 3795                  rddir_cache_hold(rdc);
3789 3796                  /*
3790 3797                   * If the cache entry is in the process of being
3791 3798                   * filled in, wait until this completes.  The
3792 3799                   * RDDIRWAIT bit is set to indicate that someone
3793 3800                   * is waiting and when the thread currently
3794 3801                   * filling the entry is done, it should do a
3795 3802                   * cv_broadcast to wakeup all of the threads
3796 3803                   * waiting for it to finish.
3797 3804                   */
3798 3805                  if (rdc->flags & RDDIR) {
3799 3806                          nfs_rw_exit(&rp->r_rwlock);
3800 3807                          rdc->flags |= RDDIRWAIT;
3801 3808  #ifdef DEBUG
3802 3809                          nfs3_readdir_cache_waits++;
3803 3810  #endif
3804 3811                          if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3805 3812                                  /*
3806 3813                                   * We got interrupted, probably
3807 3814                                   * the user typed ^C or an alarm
3808 3815                                   * fired.  We free the new entry
3809 3816                                   * if we allocated one.
3810 3817                                   */
3811 3818                                  mutex_exit(&rp->r_statelock);
3812 3819                                  (void) nfs_rw_enter_sig(&rp->r_rwlock,
3813 3820                                      RW_READER, FALSE);
3814 3821                                  rddir_cache_rele(rdc);
3815 3822                                  if (nrdc != NULL)
3816 3823                                          rddir_cache_rele(nrdc);
3817 3824                                  return (EINTR);
3818 3825                          }
3819 3826                          mutex_exit(&rp->r_statelock);
3820 3827                          (void) nfs_rw_enter_sig(&rp->r_rwlock,
3821 3828                              RW_READER, FALSE);
3822 3829                          rddir_cache_rele(rdc);
3823 3830                          goto top;
3824 3831                  }
3825 3832                  /*
3826 3833                   * Check to see if a readdir is required to
3827 3834                   * fill the entry.  If so, mark this entry
3828 3835                   * as being filled, remove our reference,
3829 3836                   * and branch to the code to fill the entry.
3830 3837                   */
3831 3838                  if (rdc->flags & RDDIRREQ) {
3832 3839                          rdc->flags &= ~RDDIRREQ;
3833 3840                          rdc->flags |= RDDIR;
3834 3841                          if (nrdc != NULL)
3835 3842                                  rddir_cache_rele(nrdc);
3836 3843                          nrdc = rdc;
3837 3844                          mutex_exit(&rp->r_statelock);
3838 3845                          goto bottom;
3839 3846                  }
3840 3847  #ifdef DEBUG
3841 3848                  if (!missed)
3842 3849                          nfs3_readdir_cache_hits++;
3843 3850  #endif
3844 3851                  /*
3845 3852                   * If an error occurred while attempting
3846 3853                   * to fill the cache entry, just return it.
3847 3854                   */
3848 3855                  if (rdc->error) {
3849 3856                          error = rdc->error;
3850 3857                          mutex_exit(&rp->r_statelock);
3851 3858                          rddir_cache_rele(rdc);
3852 3859                          if (nrdc != NULL)
3853 3860                                  rddir_cache_rele(nrdc);
3854 3861                          return (error);
3855 3862                  }
3856 3863  
3857 3864                  /*
3858 3865                   * The cache entry is complete and good,
3859 3866                   * copyout the dirent structs to the calling
3860 3867                   * thread.
3861 3868                   */
3862 3869                  error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3863 3870  
3864 3871                  /*
3865 3872                   * If no error occurred during the copyout,
3866 3873                   * update the offset in the uio struct to
3867 3874                   * contain the value of the next cookie
3868 3875                   * and set the eof value appropriately.
3869 3876                   */
3870 3877                  if (!error) {
3871 3878                          uiop->uio_loffset = rdc->nfs3_ncookie;
3872 3879                          if (eofp)
3873 3880                                  *eofp = rdc->eof;
3874 3881                  }
3875 3882  
3876 3883                  /*
3877 3884                   * Decide whether to do readahead.
3878 3885                   *
3879 3886                   * Don't if have already read to the end of
3880 3887                   * directory.  There is nothing more to read.
3881 3888                   *
3882 3889                   * Don't if the application is not doing
3883 3890                   * lookups in the directory.  The readahead
3884 3891                   * is only effective if the application can
3885 3892                   * be doing work while an async thread is
3886 3893                   * handling the over the wire request.
3887 3894                   */
3888 3895                  if (rdc->eof) {
3889 3896                          rp->r_direof = rdc;
3890 3897                          doreadahead = FALSE;
3891 3898                  } else if (!(rp->r_flags & RLOOKUP))
3892 3899                          doreadahead = FALSE;
3893 3900                  else
3894 3901                          doreadahead = TRUE;
3895 3902  
3896 3903                  if (!doreadahead) {
3897 3904                          mutex_exit(&rp->r_statelock);
3898 3905                          rddir_cache_rele(rdc);
3899 3906                          if (nrdc != NULL)
3900 3907                                  rddir_cache_rele(nrdc);
3901 3908                          return (error);
3902 3909                  }
3903 3910  
3904 3911                  /*
3905 3912                   * Check to see whether we found an entry
3906 3913                   * for the readahead.  If so, we don't need
3907 3914                   * to do anything further, so free the new
3908 3915                   * entry if one was allocated.  Otherwise,
3909 3916                   * allocate a new entry, add it to the cache,
3910 3917                   * and then initiate an asynchronous readdir
3911 3918                   * operation to fill it.
3912 3919                   */
3913 3920                  srdc.nfs3_cookie = rdc->nfs3_ncookie;
3914 3921                  srdc.buflen = count;
3915 3922                  rrdc = avl_find(&rp->r_dir, &srdc, &where);
3916 3923                  if (rrdc != NULL) {
3917 3924                          if (nrdc != NULL)
3918 3925                                  rddir_cache_rele(nrdc);
3919 3926                  } else {
3920 3927                          if (nrdc != NULL)
3921 3928                                  rrdc = nrdc;
3922 3929                          else {
3923 3930                                  rrdc = rddir_cache_alloc(KM_NOSLEEP);
3924 3931                          }
3925 3932                          if (rrdc != NULL) {
3926 3933                                  rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3927 3934                                  rrdc->buflen = count;
3928 3935                                  avl_insert(&rp->r_dir, rrdc, where);
3929 3936                                  rddir_cache_hold(rrdc);
3930 3937                                  mutex_exit(&rp->r_statelock);
3931 3938                                  rddir_cache_rele(rdc);
3932 3939  #ifdef DEBUG
3933 3940                                  nfs3_readdir_readahead++;
3934 3941  #endif
3935 3942                                  nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3936 3943                                  return (error);
3937 3944                          }
3938 3945                  }
3939 3946  
3940 3947                  mutex_exit(&rp->r_statelock);
3941 3948                  rddir_cache_rele(rdc);
3942 3949                  return (error);
3943 3950          }
3944 3951  
3945 3952          /*
3946 3953           * Didn't find an entry in the cache.  Construct a new empty
3947 3954           * entry and link it into the cache.  Other processes attempting
3948 3955           * to access this entry will need to wait until it is filled in.
3949 3956           *
3950 3957           * Since kmem_alloc may block, another pass through the cache
3951 3958           * will need to be taken to make sure that another process
3952 3959           * hasn't already added an entry to the cache for this request.
3953 3960           */
3954 3961          if (nrdc == NULL) {
3955 3962                  mutex_exit(&rp->r_statelock);
3956 3963                  nrdc = rddir_cache_alloc(KM_SLEEP);
3957 3964                  nrdc->nfs3_cookie = uiop->uio_loffset;
3958 3965                  nrdc->buflen = count;
3959 3966                  goto top;
3960 3967          }
3961 3968  
3962 3969          /*
3963 3970           * Add this entry to the cache.
3964 3971           */
3965 3972          avl_insert(&rp->r_dir, nrdc, where);
3966 3973          rddir_cache_hold(nrdc);
3967 3974          mutex_exit(&rp->r_statelock);
3968 3975  
3969 3976  bottom:
3970 3977  #ifdef DEBUG
3971 3978          missed = 1;
3972 3979          nfs3_readdir_cache_misses++;
3973 3980  #endif
3974 3981          /*
3975 3982           * Do the readdir.  This routine decides whether to use
3976 3983           * READDIR or READDIRPLUS.
3977 3984           */
3978 3985          error = do_nfs3readdir(vp, nrdc, cr);
3979 3986  
3980 3987          /*
3981 3988           * If this operation failed, just return the error which occurred.
3982 3989           */
3983 3990          if (error != 0)
3984 3991                  return (error);
3985 3992  
3986 3993          /*
3987 3994           * Since the RPC operation will have taken some time and blocked
3988 3995           * this process, another pass through the cache will need to be
3989 3996           * taken to find the correct cache entry.  It is possible that
3990 3997           * the correct cache entry will not be there (although one was
3991 3998           * added) because the directory changed during the RPC operation
3992 3999           * and the readdir cache was flushed.  In this case, just start
3993 4000           * over.  It is hoped that this will not happen too often... :-)
3994 4001           */
3995 4002          nrdc = NULL;
3996 4003          goto top;
3997 4004          /* NOTREACHED */
3998 4005  }
3999 4006  
           /*
            * Fill the readdir cache entry `rdc' for directory `vp' by issuing
            * either a READDIRPLUS or a READDIR request, then wake any threads
            * waiting on the entry.
            *
            * The result of the over-the-wire operation is reported through
            * rdc->error, which is also the return value of this function.  On
            * failure the entry is flagged RDDIRREQ.  rddir_cache_rele() is
            * called on `rdc' before returning, on both the success and the
            * failure path.
            */
4000 4007  static int
4001 4008  do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4002 4009  {
4003 4010          int error;
4004 4011          rnode_t *rp;
4005 4012          mntinfo_t *mi;
4006 4013  
4007 4014          rp = VTOR(vp);
4008 4015          mi = VTOMI(vp);
4009 4016          ASSERT(nfs_zone() == mi->mi_zone);
4010 4017          /*
4011 4018           * Issue the proper request.
4012 4019           *
4013 4020           * If the server does not support READDIRPLUS, then use READDIR.
4014 4021           *
4015 4022           * Otherwise --
4016 4023           * Issue a READDIRPLUS if reading to fill an empty cache or if
4017 4024           * an application has performed a lookup in the directory which
4018 4025           * required an over the wire lookup.  The use of READDIRPLUS
4019 4026           * will help to (re)populate the DNLC.
4020 4027           */
4021 4028          if (!(mi->mi_flags & MI_READDIRONLY) &&
4022 4029              (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) {
                           /* RREADDIRPLUS is a one-shot request; consume it. */
4023 4030                  if (rp->r_flags & RREADDIRPLUS) {
4024 4031                          mutex_enter(&rp->r_statelock);
4025 4032                          rp->r_flags &= ~RREADDIRPLUS;
4026 4033                          mutex_exit(&rp->r_statelock);
4027 4034                  }
4028 4035                  nfs3readdirplus(vp, rdc, cr);
                           /* READDIRPLUS was refused; retry with plain READDIR. */
4029 4036                  if (rdc->error == EOPNOTSUPP)
4030 4037                          nfs3readdir(vp, rdc, cr);
4031 4038          } else
4032 4039                  nfs3readdir(vp, rdc, cr);
4033 4040  
                   /*
                    * The request is finished: clear RDDIR, wake up anyone who set
                    * RDDIRWAIT, and mark a failed entry RDDIRREQ.
                    */
4034 4041          mutex_enter(&rp->r_statelock);
4035 4042          rdc->flags &= ~RDDIR;
4036 4043          if (rdc->flags & RDDIRWAIT) {
4037 4044                  rdc->flags &= ~RDDIRWAIT;
4038 4045                  cv_broadcast(&rdc->cv);
4039 4046          }
4040 4047          error = rdc->error;
4041 4048          if (error)
4042 4049                  rdc->flags |= RDDIRREQ;
4043 4050          mutex_exit(&rp->r_statelock);
4044 4051  
4045 4052          rddir_cache_rele(rdc);
4046 4053  
4047 4054          return (error);
4048 4055  }
4049 4056  
           /*
            * Issue an NFSv3 READDIR for directory `vp', starting at
            * rdc->nfs3_cookie and asking for up to rdc->buflen bytes.
            *
            * A buffer of rdc->buflen bytes is allocated for rdc->entries.  On
            * success the entry is filled in (nfs3_ncookie, eof, entlen) and
            * rdc->error is set to 0.  On failure the buffer is freed,
            * rdc->entries is set to NULL and rdc->error holds the error.
            * Nothing is returned directly; callers inspect rdc->error.
            */
4050 4057  static void
4051 4058  nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4052 4059  {
4053 4060          int error;
4054 4061          READDIR3args args;
4055 4062          READDIR3vres res;
4056 4063          vattr_t dva;
4057 4064          rnode_t *rp;
4058 4065          int douprintf;
4059 4066          failinfo_t fi, *fip = NULL;
4060 4067          mntinfo_t *mi;
4061 4068          hrtime_t t;
4062 4069  
4063 4070          rp = VTOR(vp);
4064 4071          mi = VTOMI(vp);
4065 4072          ASSERT(nfs_zone() == mi->mi_zone);
4066 4073  
4067 4074          args.dir = *RTOFH3(rp);
4068 4075          args.cookie = (cookie3)rdc->nfs3_cookie;
4069 4076          args.cookieverf = rp->r_cookieverf;
4070 4077          args.count = rdc->buflen;
4071 4078  
4072 4079          /*
4073 4080           * NFS client failover support
4074 4081           * suppress failover unless we have a zero cookie
4075 4082           */
4076 4083          if (args.cookie == (cookie3) 0) {
4077 4084                  fi.vp = vp;
4078 4085                  fi.fhp = (caddr_t)&args.dir;
4079 4086                  fi.copyproc = nfs3copyfh;
4080 4087                  fi.lookupproc = nfs3lookup;
4081 4088                  fi.xattrdirproc = acl_getxattrdir3;
4082 4089                  fip = &fi;
4083 4090          }
4084 4091  
4085 4092  #ifdef DEBUG
4086 4093          rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4087 4094  #else
4088 4095          rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4089 4096  #endif
4090 4097  
                   /* The reply is decoded directly into the cache entry's buffer. */
4091 4098          res.entries = (dirent64_t *)rdc->entries;
4092 4099          res.entries_size = rdc->buflen;
4093 4100          res.dir_attributes.fres.vap = &dva;
4094 4101          res.dir_attributes.fres.vp = vp;
4095 4102          res.loff = rdc->nfs3_cookie;
4096 4103  
4097 4104          douprintf = 1;
4098 4105  
4099 4106          if (mi->mi_io_kstats) {
4100 4107                  mutex_enter(&mi->mi_lock);
4101 4108                  kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4102 4109                  mutex_exit(&mi->mi_lock);
4103 4110          }
4104 4111  
                   /* Timestamp taken before the call; passed to the attr cache below. */
4105 4112          t = gethrtime();
4106 4113  
4107 4114          error = rfs3call(VTOMI(vp), NFSPROC3_READDIR,
4108 4115              xdr_READDIR3args, (caddr_t)&args,
4109 4116              xdr_READDIR3vres, (caddr_t)&res, cr,
4110 4117              &douprintf, &res.status, 0, fip);
4111 4118  
4112 4119          if (mi->mi_io_kstats) {
4113 4120                  mutex_enter(&mi->mi_lock);
4114 4121                  kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4115 4122                  mutex_exit(&mi->mi_lock);
4116 4123          }
4117 4124  
                   /* RPC-level failure: no reply to look at. */
4118 4125          if (error)
4119 4126                  goto err;
4120 4127  
4121 4128          nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr);
4122 4129  
                   /* NFS-level failure reported in the reply status. */
4123 4130          error = geterrno3(res.status);
4124 4131          if (error) {
4125 4132                  PURGE_STALE_FH(error, vp, cr);
4126 4133                  goto err;
4127 4134          }
4128 4135  
4129 4136          if (mi->mi_io_kstats) {
4130 4137                  mutex_enter(&mi->mi_lock);
4131 4138                  KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4132 4139                  KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4133 4140                  mutex_exit(&mi->mi_lock);
4134 4141          }
4135 4142  
                   /* Success: record where the next read starts and what we got. */
4136 4143          rdc->nfs3_ncookie = res.loff;
4137 4144          rp->r_cookieverf = res.cookieverf;
4138 4145          rdc->eof = res.eof ? 1 : 0;
4139 4146          rdc->entlen = res.size;
4140 4147          ASSERT(rdc->entlen <= rdc->buflen);
4141 4148          rdc->error = 0;
4142 4149          return;
4143 4150  
4144 4151  err:
                   /*
                    * NOTE(review): DEBUG builds allocate the buffer above with
                    * rddir_cache_buf_alloc() but it is released here with
                    * kmem_free() -- confirm that this pairing is intended.
                    */
4145 4152          kmem_free(rdc->entries, rdc->buflen);
4146 4153          rdc->entries = NULL;
4147 4154          rdc->error = error;
4148 4155  }
4149 4156  
4150 4157  /*
4151 4158   * Read directory entries.
4152 4159   * There are some weird things to look out for here.  The uio_loffset
4153 4160   * field is either 0 or it is the offset returned from a previous
4154 4161   * readdir.  It is an opaque value used by the server to find the
4155 4162   * correct directory block to read. The count field is the number
4156 4163   * of blocks to read on the server.  This is advisory only, the server
4157 4164   * may return only one block's worth of entries.  Entries may be compressed
4158 4165   * on the server.
            *
            * This variant issues an NFSv3 READDIRPLUS starting at
            * rdc->nfs3_cookie, with dircount set to rdc->buflen and maxcount
            * set to the mount's mi_tsize.  Results are reported through `rdc':
            * on success the entry is filled in and rdc->error is 0; on failure
            * the buffer is freed, rdc->entries is NULL and rdc->error holds the
            * error.  If the server answers EOPNOTSUPP the mount is flagged
            * MI_READDIRONLY.
4159 4166   */
4160 4167  static void
4161 4168  nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4162 4169  {
4163 4170          int error;
4164 4171          READDIRPLUS3args args;
4165 4172          READDIRPLUS3vres res;
4166 4173          vattr_t dva;
4167 4174          rnode_t *rp;
4168 4175          mntinfo_t *mi;
4169 4176          int douprintf;
4170 4177          failinfo_t fi, *fip = NULL;
4171 4178  
4172 4179          rp = VTOR(vp);
4173 4180          mi = VTOMI(vp);
4174 4181          ASSERT(nfs_zone() == mi->mi_zone);
4175 4182  
4176 4183          args.dir = *RTOFH3(rp);
4177 4184          args.cookie = (cookie3)rdc->nfs3_cookie;
4178 4185          args.cookieverf = rp->r_cookieverf;
4179 4186          args.dircount = rdc->buflen;
4180 4187          args.maxcount = mi->mi_tsize;
4181 4188  
4182 4189          /*
4183 4190           * NFS client failover support
4184 4191           * suppress failover unless we have a zero cookie
4185 4192           */
4186 4193          if (args.cookie == (cookie3)0) {
4187 4194                  fi.vp = vp;
4188 4195                  fi.fhp = (caddr_t)&args.dir;
4189 4196                  fi.copyproc = nfs3copyfh;
4190 4197                  fi.lookupproc = nfs3lookup;
4191 4198                  fi.xattrdirproc = acl_getxattrdir3;
4192 4199                  fip = &fi;
4193 4200          }
4194 4201  
4195 4202  #ifdef DEBUG
4196 4203          rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4197 4204  #else
4198 4205          rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4199 4206  #endif
4200 4207  
                   /* The reply is decoded directly into the cache entry's buffer. */
4201 4208          res.entries = (dirent64_t *)rdc->entries;
4202 4209          res.entries_size = rdc->buflen;
4203 4210          res.dir_attributes.fres.vap = &dva;
4204 4211          res.dir_attributes.fres.vp = vp;
4205 4212          res.loff = rdc->nfs3_cookie;
4206 4213          res.credentials = cr;
4207 4214  
4208 4215          douprintf = 1;
4209 4216  
4210 4217          if (mi->mi_io_kstats) {
4211 4218                  mutex_enter(&mi->mi_lock);
4212 4219                  kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4213 4220                  mutex_exit(&mi->mi_lock);
4214 4221          }
4215 4222  
                   /* Timestamp taken before the call; passed to the attr cache below. */
4216 4223          res.time = gethrtime();
4217 4224  
4218 4225          error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4219 4226              xdr_READDIRPLUS3args, (caddr_t)&args,
4220 4227              xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4221 4228              &douprintf, &res.status, 0, fip);
4222 4229  
4223 4230          if (mi->mi_io_kstats) {
4224 4231                  mutex_enter(&mi->mi_lock);
4225 4232                  kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4226 4233                  mutex_exit(&mi->mi_lock);
4227 4234          }
4228 4235  
                   /* RPC-level failure: no reply to look at. */
4229 4236          if (error) {
4230 4237                  goto err;
4231 4238          }
4232 4239  
4233 4240          nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr);
4234 4241  
4235 4242          error = geterrno3(res.status);
4236 4243          if (error) {
4237 4244                  PURGE_STALE_FH(error, vp, cr);
                           /*
                            * Remember that READDIRPLUS is unsupported so that
                            * do_nfs3readdir() goes straight to READDIR from now on.
                            */
4238 4245                  if (error == EOPNOTSUPP) {
4239 4246                          mutex_enter(&mi->mi_lock);
4240 4247                          mi->mi_flags |= MI_READDIRONLY;
4241 4248                          mutex_exit(&mi->mi_lock);
4242 4249                  }
4243 4250                  goto err;
4244 4251          }
4245 4252  
4246 4253          if (mi->mi_io_kstats) {
4247 4254                  mutex_enter(&mi->mi_lock);
4248 4255                  KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4249 4256                  KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4250 4257                  mutex_exit(&mi->mi_lock);
4251 4258          }
4252 4259  
                   /* Success: record where the next read starts and what we got. */
4253 4260          rdc->nfs3_ncookie = res.loff;
4254 4261          rp->r_cookieverf = res.cookieverf;
4255 4262          rdc->eof = res.eof ? 1 : 0;
4256 4263          rdc->entlen = res.size;
4257 4264          ASSERT(rdc->entlen <= rdc->buflen);
4258 4265          rdc->error = 0;
4259 4266  
4260 4267          return;
4261 4268  
4262 4269  err:
                   /*
                    * NOTE(review): DEBUG builds allocate the buffer above with
                    * rddir_cache_buf_alloc() but it is released here with
                    * kmem_free() -- confirm that this pairing is intended.
                    */
4263 4270          kmem_free(rdc->entries, rdc->buflen);
4264 4271          rdc->entries = NULL;
4265 4272          rdc->error = error;
4266 4273  }
4267 4274  
           /*
            * DEBUG-only switch: when non-zero, nfs3_bio() calls debug_enter()
            * after warning about a zero length write.
            */
4268 4275  #ifdef DEBUG
4269 4276  static int nfs3_bio_do_stop = 0;
4270 4277  #endif
4271 4278  
           /*
            * Perform the read or write described by buffer `bp' against the
            * server, synchronously.
            *
            * The credential cached in the rnode (r_cred) is tried first; if
            * none is cached, the caller's `cr' is installed as r_cred.  If the
            * server answers EACCES and the credential used was not `cr', r_cred
            * is replaced by `cr' and the operation is retried once with it.
            *
            * Reads:  a short read is zero-filled up to b_bcount.  If nothing at
            *         all was read and the offset is at or beyond r_size, NFS_EOF
            *         is returned (without setting B_ERROR).
            * Writes: at most r_size - offset bytes are written.  If the rnode
            *         is marked RSTALE no write is attempted and r_error (or
            *         ESTALE if r_error is clear) is returned.
            *
            * `stab_comm' is passed through to nfs3write().  Returns 0, NFS_EOF
            * or an errno value; any error other than NFS_EOF also sets B_ERROR
            * in bp->b_flags.
            */
4272 4279  static int
4273 4280  nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
4274 4281  {
4275 4282          rnode_t *rp = VTOR(bp->b_vp);
4276 4283          int count;
4277 4284          int error;
4278 4285          cred_t *cred;
4279 4286          offset_t offset;
4280 4287  
4281 4288          ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
4282 4289          offset = ldbtob(bp->b_lblkno);
4283 4290  
4284 4291          DTRACE_IO1(start, struct buf *, bp);
4285 4292  
4286 4293          if (bp->b_flags & B_READ) {
                           /*
                            * Pick the credential.  `cred' always carries its own
                            * hold; when cr is installed as r_cred a second hold is
                            * taken on behalf of the rnode.
                            */
4287 4294                  mutex_enter(&rp->r_statelock);
4288 4295                  if (rp->r_cred != NULL) {
4289 4296                          cred = rp->r_cred;
4290 4297                          crhold(cred);
4291 4298                  } else {
4292 4299                          rp->r_cred = cr;
4293 4300                          crhold(cr);
4294 4301                          cred = cr;
4295 4302                          crhold(cred);
4296 4303                  }
4297 4304                  mutex_exit(&rp->r_statelock);
4298 4305          read_again:
4299 4306                  error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr,
4300 4307                      offset, bp->b_bcount, &bp->b_resid, cred);
                           /* Drop our hold; `cred' is only compared, not used, below. */
4301 4308                  crfree(cred);
4302 4309                  if (!error) {
4303 4310                          if (bp->b_resid) {
4304 4311                                  /*
4305 4312                                   * Didn't get it all because we hit EOF,
4306 4313                                   * zero all the memory beyond the EOF.
4307 4314                                   */
4308 4315                                  /* bzero(rdaddr + */
4309 4316                                  bzero(bp->b_un.b_addr +
4310 4317                                      bp->b_bcount - bp->b_resid, bp->b_resid);
4311 4318                          }
4312 4319                          mutex_enter(&rp->r_statelock);
4313 4320                          if (bp->b_resid == bp->b_bcount &&
4314 4321                              offset >= rp->r_size) {
4315 4322                                  /*
4316 4323                                   * We didn't read anything at all as we are
4317 4324                                   * past EOF.  Return an error indicator back
4318 4325                                   * but don't destroy the pages (yet).
4319 4326                                   */
4320 4327                                  error = NFS_EOF;
4321 4328                          }
4322 4329                          mutex_exit(&rp->r_statelock);
4323 4330                  } else if (error == EACCES) {
                                   /*
                                    * The cached credential was refused.  If it was
                                    * not the caller's, switch r_cred to the caller's
                                    * credential and try the read again.
                                    */
4324 4331                          mutex_enter(&rp->r_statelock);
4325 4332                          if (cred != cr) {
4326 4333                                  if (rp->r_cred != NULL)
4327 4334                                          crfree(rp->r_cred);
4328 4335                                  rp->r_cred = cr;
4329 4336                                  crhold(cr);
4330 4337                                  cred = cr;
4331 4338                                  crhold(cred);
4332 4339                                  mutex_exit(&rp->r_statelock);
4333 4340                                  goto read_again;
4334 4341                          }
4335 4342                          mutex_exit(&rp->r_statelock);
4336 4343                  }
4337 4344          } else {
4338 4345                  if (!(rp->r_flags & RSTALE)) {
                                   /* Same credential selection as on the read side. */
4339 4346                          mutex_enter(&rp->r_statelock);
4340 4347                          if (rp->r_cred != NULL) {
4341 4348                                  cred = rp->r_cred;
4342 4349                                  crhold(cred);
4343 4350                          } else {
4344 4351                                  rp->r_cred = cr;
4345 4352                                  crhold(cr);
4346 4353                                  cred = cr;
4347 4354                                  crhold(cred);
4348 4355                          }
4349 4356                          mutex_exit(&rp->r_statelock);
4350 4357                  write_again:
                                   /* Never write beyond the current file size. */
4351 4358                          mutex_enter(&rp->r_statelock);
4352 4359                          count = MIN(bp->b_bcount, rp->r_size - offset);
4353 4360                          mutex_exit(&rp->r_statelock);
4354 4361                          if (count < 0)
4355 4362                                  cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
4356 4363  #ifdef DEBUG
4357 4364                          if (count == 0) {
4358 4365                                  zcmn_err(getzoneid(), CE_WARN,
4359 4366                                      "nfs3_bio: zero length write at %lld",
4360 4367                                      offset);
4361 4368                                  nfs_printfhandle(&rp->r_fh);
4362 4369                                  if (nfs3_bio_do_stop)
4363 4370                                          debug_enter("nfs3_bio");
4364 4371                          }
4365 4372  #endif
4366 4373                          error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset,
4367 4374                              count, cred, stab_comm);
4368 4375                          if (error == EACCES) {
                                           /*
                                            * Retry with the caller's credential if
                                            * that is not what was just used.  Unlike
                                            * the read side, our hold on `cred' is
                                            * still outstanding here, so drop it
                                            * before switching.
                                            */
4369 4376                                  mutex_enter(&rp->r_statelock);
4370 4377                                  if (cred != cr) {
4371 4378                                          if (rp->r_cred != NULL)
4372 4379                                                  crfree(rp->r_cred);
4373 4380                                          rp->r_cred = cr;
4374 4381                                          crhold(cr);
4375 4382                                          crfree(cred);
4376 4383                                          cred = cr;
4377 4384                                          crhold(cred);
4378 4385                                          mutex_exit(&rp->r_statelock);
4379 4386                                          goto write_again;
4380 4387                                  }
4381 4388                                  mutex_exit(&rp->r_statelock);
4382 4389                          }
4383 4390                          bp->b_error = error;
4384 4391                          if (error && error != EINTR) {
4385 4392                                  /*
4386 4393                                   * Don't print EDQUOT errors on the console.
4387 4394                                   * Don't print asynchronous EACCES errors.
4388 4395                                   * Don't print EFBIG errors.
4389 4396                                   * Print all other write errors.
4390 4397                                   */
4391 4398                                  if (error != EDQUOT && error != EFBIG &&
4392 4399                                      (error != EACCES ||
4393 4400                                      !(bp->b_flags & B_ASYNC)))
4394 4401                                          nfs_write_error(bp->b_vp, error, cred);
4395 4402                                  /*
4396 4403                                   * Update r_error and r_flags as appropriate.
4397 4404                                   * If the error was ESTALE, then mark the
4398 4405                                   * rnode as not being writeable and save
4399 4406                                   * the error status.  Otherwise, save any
4400 4407                                   * errors which occur from asynchronous
4401 4408                                   * page invalidations.  Any errors occurring
4402 4409                                   * from other operations should be saved
4403 4410                                   * by the caller.
4404 4411                                   */
4405 4412                                  mutex_enter(&rp->r_statelock);
4406 4413                                  if (error == ESTALE) {
4407 4414                                          rp->r_flags |= RSTALE;
4408 4415                                          if (!rp->r_error)
4409 4416                                                  rp->r_error = error;
4410 4417                                  } else if (!rp->r_error &&
4411 4418                                      (bp->b_flags &
4412 4419                                      (B_INVAL|B_FORCE|B_ASYNC)) ==
4413 4420                                      (B_INVAL|B_FORCE|B_ASYNC)) {
4414 4421                                          rp->r_error = error;
4415 4422                                  }
4416 4423                                  mutex_exit(&rp->r_statelock);
4417 4424                          }
4418 4425                          crfree(cred);
4419 4426                  } else {
4420 4427                          error = rp->r_error;
4421 4428                          /*
4422 4429                           * A close may have cleared r_error, if so,
4423 4430                           * propagate ESTALE error return properly
4424 4431                           */
4425 4432                          if (error == 0)
4426 4433                                  error = ESTALE;
4427 4434                  }
4428 4435          }
4429 4436  
                   /* NFS_EOF is an indicator for the caller, not a buffer error. */
4430 4437          if (error != 0 && error != NFS_EOF)
4431 4438                  bp->b_flags |= B_ERROR;
4432 4439  
4433 4440          DTRACE_IO1(done, struct buf *, bp);
4434 4441  
4435 4442          return (error);
4436 4443  }
4437 4444  
           /*
            * Copy the NFSv3 file handle of `vp' into `fidp'.
            *
            * Returns EIO if called from a zone other than the mount's zone.
            * Returns ENOSPC if fidp->fid_len is smaller than the file handle;
            * in that case fid_len is updated to the required length.  Otherwise
            * fid_len is set to the handle length, the handle bytes are copied
            * into fid_data, and 0 is returned.
            */
4438 4445  /* ARGSUSED */
4439 4446  static int
4440 4447  nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4441 4448  {
4442 4449          rnode_t *rp;
4443 4450  
4444 4451          if (nfs_zone() != VTOMI(vp)->mi_zone)
4445 4452                  return (EIO);
4446 4453          rp = VTOR(vp);
4447 4454  
                   /* Too small: report how much room the caller must provide. */
4448 4455          if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) {
4449 4456                  fidp->fid_len = rp->r_fh.fh_len;
4450 4457                  return (ENOSPC);
4451 4458          }
4452 4459          fidp->fid_len = rp->r_fh.fh_len;
4453 4460          bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4454 4461          return (0);
4455 4462  }
4456 4463  
           /*
            * Acquire the rnode's r_rwlock for `vp'.
            *
            * A read request (write_lock == 0) takes the lock as RW_READER and
            * returns V_WRITELOCK_FALSE.  A write request normally takes it as
            * RW_WRITER and returns V_WRITELOCK_TRUE.  The exception is direct
            * I/O (RDIRECTIO on the rnode or MI_DIRECTIO on the mount): the lock
            * is then taken as RW_READER and kept that way, returning
            * V_WRITELOCK_FALSE, provided the file has no mappings
            * (r_mapcnt == 0) and no cached data.
            *
            * The return value of nfs_rw_enter_sig() is deliberately discarded.
            */
4457 4464  /* ARGSUSED2 */
4458 4465  static int
4459 4466  nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4460 4467  {
4461 4468          rnode_t *rp = VTOR(vp);
4462 4469  
4463 4470          if (!write_lock) {
4464 4471                  (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4465 4472                  return (V_WRITELOCK_FALSE);
4466 4473          }
4467 4474  
4468 4475          if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
4469 4476                  (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4470 4477                  if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
4471 4478                          return (V_WRITELOCK_FALSE);
                           /* Mapped or cached: fall back to the exclusive lock. */
4472 4479                  nfs_rw_exit(&rp->r_rwlock);
4473 4480          }
4474 4481  
4475 4482          (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
4476 4483          return (V_WRITELOCK_TRUE);
4477 4484  }
4478 4485  
           /*
            * Release the rnode's r_rwlock for `vp'.  The same call is made
            * whether the lock was held as reader or writer; `write_lock' is
            * not examined.
            */
4479 4486  /* ARGSUSED */
4480 4487  static void
4481 4488  nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4482 4489  {
4483 4490          rnode_t *rp = VTOR(vp);
4484 4491  
4485 4492          nfs_rw_exit(&rp->r_rwlock);
4486 4493  }
4487 4494  
           /*
            * Validate a proposed new file offset, *noffp, for `vp'.
            *
            * Any offset is accepted for a directory.  For every other vnode
            * type a negative offset yields EINVAL; otherwise 0 is returned.
            * Neither `ooff' nor *noffp is modified.
            */
4488 4495  /* ARGSUSED */
4489 4496  static int
4490 4497  nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4491 4498  {
4492 4499  
4493 4500          /*
4494 4501           * Because we stuff the readdir cookie into the offset field
4495 4502           * someone may attempt to do an lseek with the cookie which
4496 4503           * we want to succeed.
4497 4504           */
4498 4505          if (vp->v_type == VDIR)
4499 4506                  return (0);
4500 4507          if (*noffp < 0)
4501 4508                  return (EINVAL);
4502 4509          return (0);
4503 4510  }
4504 4511  
4505 4512  /*
4506 4513   * number of nfs3_bsize blocks to read ahead.
            * This is the maximum number of readaheads queued by one call to
            * nfs3_getapage().
4507 4514   */
4508 4515  static int nfs3_nra = 4;
4509 4516  
           /* DEBUG-only statistics counter. */
4510 4517  #ifdef DEBUG
4511 4518  static int nfs3_lostpage = 0;   /* number of times we lost original page */
4512 4519  #endif
4513 4520  
4514 4521  /*
4515 4522   * Return all the pages from [off..off+len) in file
            *
            * Returns ENOSYS if the vnode is marked VNOMAP, EIO if called from a
            * zone other than the mount's zone, and EFAULT if the range extends
            * more than PAGEOFFSET bytes beyond r_size (unless `seg' is segkmap).
            * Any error from nfs3_validate_caches() is returned as is.  If
            * `protp' is not NULL, *protp is set to PROT_ALL.
            *
            * A request of at most PAGESIZE bytes is handed straight to
            * nfs3_getapage(); a larger one goes through pvn_getpages().  If
            * that yields NFS_EOF the caches are purged and the whole request is
            * retried; if it yields ESTALE, PURGE_STALE_FH() is applied before
            * the error is returned.
4516 4523   */
4517 4524  /* ARGSUSED */
4518 4525  static int
4519 4526  nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4520 4527          page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4521 4528          enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4522 4529  {
4523 4530          rnode_t *rp;
4524 4531          int error;
4525 4532          mntinfo_t *mi;
4526 4533  
4527 4534          if (vp->v_flag & VNOMAP)
4528 4535                  return (ENOSYS);
4529 4536  
4530 4537          if (nfs_zone() != VTOMI(vp)->mi_zone)
4531 4538                  return (EIO);
4532 4539          if (protp != NULL)
4533 4540                  *protp = PROT_ALL;
4534 4541  
4535 4542          /*
4536 4543           * Now validate that the caches are up to date.
4537 4544           */
4538 4545          error = nfs3_validate_caches(vp, cr);
4539 4546          if (error)
4540 4547                  return (error);
4541 4548  
4542 4549          rp = VTOR(vp);
4543 4550          mi = VTOMI(vp);
4544 4551  retry:
4545 4552          mutex_enter(&rp->r_statelock);
4546 4553  
4547 4554          /*
4548 4555           * Don't create dirty pages faster than they
4549 4556           * can be cleaned so that the system doesn't
4550 4557           * get imbalanced.  If the async queue is
4551 4558           * maxed out, then wait for it to drain before
4552 4559           * creating more dirty pages.  Also, wait for
4553 4560           * any threads doing pagewalks in the vop_getattr
4554 4561           * entry points so that they don't block for
4555 4562           * long periods.
4556 4563           */
4557 4564          if (rw == S_CREATE) {
4558 4565                  while ((mi->mi_max_threads != 0 &&
4559 4566                      rp->r_awcount > 2 * mi->mi_max_threads) ||
4560 4567                      rp->r_gcount > 0)
4561 4568                          cv_wait(&rp->r_cv, &rp->r_statelock);
4562 4569          }
4563 4570  
4564 4571          /*
4565 4572           * If we are getting called as a side effect of an nfs_write()
4566 4573           * operation the local file size might not be extended yet.
4567 4574           * In this case we want to be able to return pages of zeroes.
4568 4575           */
4569 4576          if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
4570 4577                  mutex_exit(&rp->r_statelock);
4571 4578                  return (EFAULT);                /* beyond EOF */
4572 4579          }
4573 4580  
4574 4581          mutex_exit(&rp->r_statelock);
4575 4582  
                   /* Single page: skip the pvn_getpages() iteration. */
4576 4583          if (len <= PAGESIZE) {
4577 4584                  error = nfs3_getapage(vp, off, len, protp, pl, plsz,
4578 4585                      seg, addr, rw, cr);
4579 4586          } else {
4580 4587                  error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4581 4588                      pl, plsz, seg, addr, rw, cr);
4582 4589          }
4583 4590  
4584 4591          switch (error) {
4585 4592          case NFS_EOF:
                           /* Purge the caches (DNLC excepted) and start over. */
4586 4593                  nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4587 4594                  goto retry;
4588 4595          case ESTALE:
4589 4596                  PURGE_STALE_FH(error, vp, cr);
4590 4597          }
4591 4598  
4592 4599          return (error);
4593 4600  }
4594 4601  
4595 4602  /*
4596 4603   * Called from pvn_getpages or nfs3_getpage to get a particular page.
4597 4604   */
4598 4605  /* ARGSUSED */
4599 4606  static int
4600 4607  nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
4601 4608          page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4602 4609          enum seg_rw rw, cred_t *cr)
4603 4610  {
                   /*
                    * Obtain the page of vp at offset off and hand it back through
                    * pl[].  If page_exists() finds the page it is simply locked and
                    * returned; if rw is S_CREATE a new page is created without
                    * going to the server; otherwise the page (or its whole block)
                    * is read with pvn_read_kluster() and nfs3_bio().  When pl is
                    * NULL no page is returned and a missing page only causes an
                    * asynchronous readahead of its block to be queued.
                    *
                    * Returns 0 on success, EIO when the caller is not in the
                    * mount's zone, EFAULT when the server reports EOF for a segment
                    * other than segkmap, or the error from nfs3_bio().
                    */
4604 4611          rnode_t *rp;
4605 4612          uint_t bsize;
4606 4613          struct buf *bp;
4607 4614          page_t *pp;
4608 4615          u_offset_t lbn;
4609 4616          u_offset_t io_off;
4610 4617          u_offset_t blkoff;
4611 4618          u_offset_t rablkoff;
4612 4619          size_t io_len;
4613 4620          uint_t blksize;
4614 4621          int error;
4615 4622          int readahead;
4616 4623          int readahead_issued = 0;
4617 4624          int ra_window; /* readahead window */
4618 4625          page_t *pagefound;
4619 4626          page_t *savepp;
4620 4627  
4621 4628          if (nfs_zone() != VTOMI(vp)->mi_zone)
4622 4629                  return (EIO);
4623 4630          rp = VTOR(vp);
                   /* I/O unit: the larger of the filesystem block size and a page. */
4624 4631          bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4625 4632  
                   /*
                    * Restart point used when a page seen by page_exists() is gone
                    * by the time page_lookup() tries to lock it (see below).
                    */
4626 4633  reread:
4627 4634          bp = NULL;
4628 4635          pp = NULL;
4629 4636          pagefound = NULL;
4630 4637  
4631 4638          if (pl != NULL)
4632 4639                  pl[0] = NULL;
4633 4640  
4634 4641          error = 0;
                   /* Block number containing off, and that block's starting offset. */
4635 4642          lbn = off / bsize;
4636 4643          blkoff = lbn * bsize;
4637 4644  
4638 4645          /*
4639 4646           * Queueing up the readahead before doing the synchronous read
4640 4647           * results in a significant increase in read throughput because
4641 4648           * of the increased parallelism between the async threads and
4642 4649           * the process context.
4643 4650           */
4644 4651          if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
4645 4652              rw != S_CREATE &&
4646 4653              !(vp->v_flag & VNOCACHE)) {
4647 4654                  mutex_enter(&rp->r_statelock);
4648 4655  
4649 4656                  /*
4650 4657                   * Calculate the number of readaheads to do.
4651 4658                   * a) No readaheads at offset = 0.
4652 4659                   * b) Do maximum(nfs3_nra) readaheads when the readahead
4653 4660                   *    window is closed.
4654 4661                   * c) Do readaheads between 1 to (nfs3_nra - 1) depending
4655 4662                   *    upon how far the readahead window is open or closed.
4656 4663                   * d) No readaheads if rp->r_nextr is not within the scope
4657 4664                   *    of the readahead window (random i/o).
4658 4665                   */
4659 4666  
4660 4667                  if (off == 0)
4661 4668                          readahead = 0;
4662 4669                  else if (blkoff == rp->r_nextr)
4663 4670                          readahead = nfs3_nra;
4664 4671                  else if (rp->r_nextr > blkoff &&
4665 4672                      ((ra_window = (rp->r_nextr - blkoff) / bsize)
4666 4673                      <= (nfs3_nra - 1)))
4667 4674                          readahead = nfs3_nra - ra_window;
4668 4675                  else
4669 4676                          readahead = 0;
4670 4677  
                           /*
                            * Queue up to `readahead' blocks starting one block past
                            * r_nextr, stopping before the cached file size.  The
                            * r_statelock is dropped around each
                            * nfs_async_readahead() call and retaken before r_nextr
                            * is updated, so the loop always exits with it held.
                            */
4671 4678                  rablkoff = rp->r_nextr;
4672 4679                  while (readahead > 0 && rablkoff + bsize < rp->r_size) {
4673 4680                          mutex_exit(&rp->r_statelock);
4674 4681                          if (nfs_async_readahead(vp, rablkoff + bsize,
4675 4682                              addr + (rablkoff + bsize - off), seg, cr,
4676 4683                              nfs3_readahead) < 0) {
4677 4684                                  mutex_enter(&rp->r_statelock);
4678 4685                                  break;
4679 4686                          }
4680 4687                          readahead--;
4681 4688                          rablkoff += bsize;
4682 4689                          /*
4683 4690                           * Indicate that we did a readahead so
4684 4691                           * readahead offset is not updated
4685 4692                           * by the synchronous read below.
4686 4693                           */
4687 4694                          readahead_issued = 1;
4688 4695                          mutex_enter(&rp->r_statelock);
4689 4696                          /*
4690 4697                           * set readahead offset to
4691 4698                           * offset of last async readahead
4692 4699                           * request.
4693 4700                           */
4694 4701                          rp->r_nextr = rablkoff;
4695 4702                  }
4696 4703                  mutex_exit(&rp->r_statelock);
4697 4704          }
4698 4705  
                   /*
                    * Retry point used when pvn_read_kluster() returns NULL because
                    * another thread has entered the page in the meantime.
                    */
4699 4706  again:
4700 4707          if ((pagefound = page_exists(vp, off)) == NULL) {
4701 4708                  if (pl == NULL) {
                                   /*
                                    * The caller wants no page back: just queue an
                                    * asynchronous read of this block; its result
                                    * is deliberately ignored.
                                    */
4702 4709                          (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
4703 4710                              nfs3_readahead);
4704 4711                  } else if (rw == S_CREATE) {
4705 4712                          /*
4706 4713                           * Block for this page is not allocated, or the offset
4707 4714                           * is beyond the current allocation size, or we're
4708 4715                           * allocating a swap slot and the page was not found,
4709 4716                           * so allocate it and return a zero page.
4710 4717                           */
4711 4718                          if ((pp = page_create_va(vp, off,
4712 4719                              PAGESIZE, PG_WAIT, seg, addr)) == NULL)
4713 4720                                  cmn_err(CE_PANIC, "nfs3_getapage: page_create");
4714 4721                          io_len = PAGESIZE;
4715 4722                          mutex_enter(&rp->r_statelock);
4716 4723                          rp->r_nextr = off + PAGESIZE;
4717 4724                          mutex_exit(&rp->r_statelock);
4718 4725                  } else {
4719 4726                          /*
4720 4727                           * Need to go to server to get a BLOCK, exception to
4721 4728                           * that being while reading at offset = 0 or doing
4722 4729                           * random i/o, in that case read only a PAGE.
4723 4730                           */
4724 4731                          mutex_enter(&rp->r_statelock);
4725 4732                          if (blkoff < rp->r_size &&
4726 4733                              blkoff + bsize >= rp->r_size) {
4727 4734                                  /*
4728 4735                                   * If only a block or less is left in
4729 4736                                   * the file, read all that is remaining.
4730 4737                                   */
4731 4738                                  if (rp->r_size <= off) {
4732 4739                                          /*
4733 4740                                           * Trying to access beyond EOF,
4734 4741                                           * set up to get at least one page.
4735 4742                                           */
4736 4743                                          blksize = off + PAGESIZE - blkoff;
4737 4744                                  } else
4738 4745                                          blksize = rp->r_size - blkoff;
4739 4746                          } else if ((off == 0) ||
4740 4747                              (off != rp->r_nextr && !readahead_issued)) {
4741 4748                                  blksize = PAGESIZE;
4742 4749                                  blkoff = off; /* block = page here */
4743 4750                          } else
4744 4751                                  blksize = bsize;
4745 4752                          mutex_exit(&rp->r_statelock);
4746 4753  
4747 4754                          pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4748 4755                              &io_len, blkoff, blksize, 0);
4749 4756  
4750 4757                          /*
4751 4758                           * Some other thread has entered the page,
4752 4759                           * so just use it.
4753 4760                           */
4754 4761                          if (pp == NULL)
4755 4762                                  goto again;
4756 4763  
4757 4764                          /*
4758 4765                           * Now round the request size up to page boundaries.
4759 4766                           * This ensures that the entire page will be
4760 4767                           * initialized to zeroes if EOF is encountered.
4761 4768                           */
4762 4769                          io_len = ptob(btopr(io_len));
4763 4770  
4764 4771                          bp = pageio_setup(pp, io_len, vp, B_READ);
4765 4772                          ASSERT(bp != NULL);
4766 4773  
4767 4774                          /*
4768 4775                           * pageio_setup should have set b_addr to 0.  This
4769 4776                           * is correct since we want to do I/O on a page
4770 4777                           * boundary.  bp_mapin will use this addr to calculate
4771 4778                           * an offset, and then set b_addr to the kernel virtual
4772 4779                           * address it allocated for us.
4773 4780                           */
4774 4781                          ASSERT(bp->b_un.b_addr == 0);
4775 4782  
4776 4783                          bp->b_edev = 0;
4777 4784                          bp->b_dev = 0;
4778 4785                          bp->b_lblkno = lbtodb(io_off);
4779 4786                          bp->b_file = vp;
4780 4787                          bp->b_offset = (offset_t)off;
4781 4788                          bp_mapin(bp);
4782 4789  
4783 4790                          /*
4784 4791                           * If doing a write beyond what we believe is EOF,
4785 4792                           * don't bother trying to read the pages from the
4786 4793                           * server, we'll just zero the pages here.  We
4787 4794                           * don't check that the rw flag is S_WRITE here
4788 4795                           * because some implementations may attempt a
4789 4796                           * read access to the buffer before copying data.
4790 4797                           */
4791 4798                          mutex_enter(&rp->r_statelock);
4792 4799                          if (io_off >= rp->r_size && seg == segkmap) {
4793 4800                                  mutex_exit(&rp->r_statelock);
4794 4801                                  bzero(bp->b_un.b_addr, io_len);
4795 4802                          } else {
4796 4803                                  mutex_exit(&rp->r_statelock);
4797 4804                                  error = nfs3_bio(bp, NULL, cr);
4798 4805                          }
4799 4806  
4800 4807                          /*
4801 4808                           * Unmap the buffer before freeing it.
4802 4809                           */
4803 4810                          bp_mapout(bp);
4804 4811                          pageio_done(bp);
4805 4812  
                                   /*
                                    * Mark every page on the circular list with
                                    * C_NOCOMMIT; the loop ends with pp back at the
                                    * head of the list (savepp).
                                    */
4806 4813                          savepp = pp;
4807 4814                          do {
4808 4815                                  pp->p_fsdata = C_NOCOMMIT;
4809 4816                          } while ((pp = pp->p_next) != savepp);
4810 4817  
4811 4818                          if (error == NFS_EOF) {
4812 4819                                  /*
4813 4820                                   * If doing a write system call just return
4814 4821                                   * zeroed pages, else user tried to get pages
4815 4822                                   * beyond EOF, return error.  We don't check
4816 4823                                   * that the rw flag is S_WRITE here because
4817 4824                                   * some implementations may attempt a read
4818 4825                                   * access to the buffer before copying data.
4819 4826                                   */
4820 4827                                  if (seg == segkmap)
4821 4828                                          error = 0;
4822 4829                                  else
4823 4830                                          error = EFAULT;
4824 4831                          }
4825 4832  
                                   /*
                                    * Only advance the readahead offset here when
                                    * the readahead loop above did not already set
                                    * it.
                                    */
4826 4833                          if (!readahead_issued && !error) {
4827 4834                                  mutex_enter(&rp->r_statelock);
4828 4835                                  rp->r_nextr = io_off + io_len;
4829 4836                                  mutex_exit(&rp->r_statelock);
4830 4837                          }
4831 4838                  }
4832 4839          }
4833 4840  
                   /*
                    * NOTE(review): no "goto out" is visible in this function; the
                    * label is reached only by falling through from above.
                    */
4834 4841  out:
4835 4842          if (pl == NULL)
4836 4843                  return (error);
4837 4844  
4838 4845          if (error) {
                           /* Give up the pages that were set up for the failed read. */
4839 4846                  if (pp != NULL)
4840 4847                          pvn_read_done(pp, B_ERROR);
4841 4848                  return (error);
4842 4849          }
4843 4850  
4844 4851          if (pagefound) {
                           /* S_CREATE needs the page exclusively; other accesses share it. */
4845 4852                  se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
4846 4853  
4847 4854                  /*
4848 4855                   * Page exists in the cache, acquire the appropriate lock.
4849 4856                   * If this fails, start all over again.
4850 4857                   */
4851 4858                  if ((pp = page_lookup(vp, off, se)) == NULL) {
4852 4859  #ifdef DEBUG
4853 4860                          nfs3_lostpage++;
4854 4861  #endif
4855 4862                          goto reread;
4856 4863                  }
4857 4864                  pl[0] = pp;
4858 4865                  pl[1] = NULL;
4859 4866                  return (0);
4860 4867          }
4861 4868  
                   /*
                    * Pages were created or read above: pass them to
                    * pvn_plist_init() together with pl[], plsz, off and io_len
                    * (presumably to populate the caller's page list -- confirm
                    * against pvn_plist_init()).
                    */
4862 4869          if (pp != NULL)
4863 4870                  pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4864 4871  
4865 4872          return (error);
4866 4873  }
4867 4874  
4868 4875  static void
4869 4876  nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
4870 4877          cred_t *cr)
4871 4878  {
                   /*
                    * Readahead worker: nfs3_getapage() passes this function to
                    * nfs_async_readahead().  It reads the block of vp starting at
                    * blkoff into pages and releases them with pvn_read_done().
                    * Nothing is returned; on failure the only visible effect is
                    * that r_nextr is pulled back to the start of the failed I/O.
                    */
4872 4879          int error;
4873 4880          page_t *pp;
4874 4881          u_offset_t io_off;
4875 4882          size_t io_len;
4876 4883          struct buf *bp;
4877 4884          uint_t bsize, blksize;
4878 4885          rnode_t *rp = VTOR(vp);
4879 4886          page_t *savepp;
4880 4887  
4881 4888          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4882 4889          bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4883 4890  
4884 4891          mutex_enter(&rp->r_statelock);
4885 4892          if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4886 4893                  /*
4887 4894                   * If less than a block left in file read less
4888 4895                   * than a block.
4889 4896                   */
4890 4897                  blksize = rp->r_size - blkoff;
4891 4898          } else
4892 4899                  blksize = bsize;
4893 4900          mutex_exit(&rp->r_statelock);
4894 4901  
                   /*
                    * NOTE(review): segkmap is passed here rather than the seg
                    * argument -- confirm this is intentional.
                    */
4895 4902          pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4896 4903              &io_off, &io_len, blkoff, blksize, 1);
4897 4904          /*
4898 4905           * The isra flag passed to the kluster function is 1, we may have
4899 4906           * gotten a return value of NULL for a variety of reasons (# of free
4900 4907           * pages < minfree, someone entered the page on the vnode etc). In all
4901 4908           * cases, we want to punt on the readahead.
4902 4909           */
4903 4910          if (pp == NULL)
4904 4911                  return;
4905 4912  
4906 4913          /*
4907 4914           * Now round the request size up to page boundaries.
4908 4915           * This ensures that the entire page will be
4909 4916           * initialized to zeroes if EOF is encountered.
4910 4917           */
4911 4918          io_len = ptob(btopr(io_len));
4912 4919  
4913 4920          bp = pageio_setup(pp, io_len, vp, B_READ);
4914 4921          ASSERT(bp != NULL);
4915 4922  
4916 4923          /*
4917 4924           * pageio_setup should have set b_addr to 0.  This is correct since
4918 4925           * we want to do I/O on a page boundary. bp_mapin() will use this addr
4919 4926           * to calculate an offset, and then set b_addr to the kernel virtual
4920 4927           * address it allocated for us.
4921 4928           */
4922 4929          ASSERT(bp->b_un.b_addr == 0);
4923 4930  
4924 4931          bp->b_edev = 0;
4925 4932          bp->b_dev = 0;
4926 4933          bp->b_lblkno = lbtodb(io_off);
4927 4934          bp->b_file = vp;
4928 4935          bp->b_offset = (offset_t)blkoff;
4929 4936          bp_mapin(bp);
4930 4937  
4931 4938          /*
4932 4939           * If doing a write beyond what we believe is EOF, don't bother trying
4933 4940           * to read the pages from the server, we'll just zero the pages here.
4934 4941           * We don't check that the rw flag is S_WRITE here because some
4935 4942           * implementations may attempt a read access to the buffer before
4936 4943           * copying data.
4937 4944           *
4938 4944           * NOTE(review): this function has no rw argument; the wording above
4939 4944           * matches the synchronous path in nfs3_getapage().
4940 4944           */
4938 4945          mutex_enter(&rp->r_statelock);
4939 4946          if (io_off >= rp->r_size && seg == segkmap) {
4940 4947                  mutex_exit(&rp->r_statelock);
4941 4948                  bzero(bp->b_un.b_addr, io_len);
4942 4949                  error = 0;
4943 4950          } else {
4944 4951                  mutex_exit(&rp->r_statelock);
4945 4952                  error = nfs3_bio(bp, NULL, cr);
                           /* Hitting EOF during a readahead is not treated as a failure. */
4946 4953                  if (error == NFS_EOF)
4947 4954                          error = 0;
4948 4955          }
4949 4956  
4950 4957          /*
4951 4958           * Unmap the buffer before freeing it.
4952 4959           */
4953 4960          bp_mapout(bp);
4954 4961          pageio_done(bp);
4955 4962  
                   /*
                    * Mark every page on the circular list with C_NOCOMMIT; the loop
                    * ends with pp back at the head of the list (savepp).
                    */
4956 4963          savepp = pp;
4957 4964          do {
4958 4965                  pp->p_fsdata = C_NOCOMMIT;
4959 4966          } while ((pp = pp->p_next) != savepp);
4960 4967  
4961 4968          pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4962 4969  
4963 4970          /*
4964 4971           * In case of error set readahead offset
4965 4972           * to the lowest offset.
4966 4973           * pvn_read_done() calls VN_DISPOSE to destroy the pages
4967 4974           */
                   /*
                    * r_nextr is first tested without r_statelock and then tested
                    * again once the lock is held, before it is lowered.
                    */
4968 4975          if (error && rp->r_nextr > io_off) {
4969 4976                  mutex_enter(&rp->r_statelock);
4970 4977                  if (rp->r_nextr > io_off)
4971 4978                          rp->r_nextr = io_off;
4972 4979                  mutex_exit(&rp->r_statelock);
4973 4980          }
4974 4981  }
4975 4982  
4976 4983  /*
4977 4984   * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4978 4985   * If len == 0, do from off to EOF.
4979 4986   *
4980 4987   * The normal cases should be len == 0 && off == 0 (entire vp list),
4981 4988   * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4982 4989   * (from pageout).
            *
            * Returns ENOSYS if the vnode is marked VNOMAP, 0 without doing any
            * work for a len == 0 request without B_INVAL on a read-only vnode,
            * EIO for a request without B_ASYNC made from outside the mount's
            * zone, and otherwise the result of nfs_putpages().
4983 4990   */
4984 4991  /* ARGSUSED */
4985 4992  static int
4986 4993  nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4987 4994          caller_context_t *ct)
4988 4995  {
4989 4996          int error;
4990 4997          rnode_t *rp;
4991 4998  
4992 4999          ASSERT(cr != NULL);
4993 5000  
4994 5001          /*
4995 5002           * XXX - Why should this check be made here?
4996 5003           */
4997 5004          if (vp->v_flag & VNOMAP)
4998 5005                  return (ENOSYS);
4999 5006          if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
5000 5007                  return (0);
5001 5008          if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5002 5009                  return (EIO);
5003 5010  
                   /*
                    * Hold r_count above zero for the duration of nfs_putpages()
                    * (nfs3_putapage() asserts r_count > 0), then drop it and wake
                    * any waiters on r_cv.
                    */
5004 5011          rp = VTOR(vp);
5005 5012          mutex_enter(&rp->r_statelock);
5006 5013          rp->r_count++;
5007 5014          mutex_exit(&rp->r_statelock);
5008 5015          error = nfs_putpages(vp, off, len, flags, cr);
5009 5016          mutex_enter(&rp->r_statelock);
5010 5017          rp->r_count--;
5011 5018          cv_broadcast(&rp->r_cv);
5012 5019          mutex_exit(&rp->r_statelock);
5013 5020  
5014 5021          return (error);
5015 5022  }
5016 5023  
5017 5024  /*
5018 5025   * Write out a single page, possibly klustering adjacent dirty pages.
            *
            * pp is the starting page; pvn_write_kluster() may extend it to a list
            * of pages within the same block.  If offp and lenp are non-NULL they
            * receive the offset and length of the range that was handled.  With
            * B_ASYNC set in flags the write is handed to nfs_async_putapage();
            * otherwise nfs3_sync_putapage() is called directly.  Returns 0
            * without writing when an overlapping writerp() modification is in
            * progress, else the error from the chosen put routine.
5019 5026   */
5020 5027  int
5021 5028  nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
5022 5029          int flags, cred_t *cr)
5023 5030  {
5024 5031          u_offset_t io_off;
5025 5032          u_offset_t lbn_off;
5026 5033          u_offset_t lbn;
5027 5034          size_t io_len;
5028 5035          uint_t bsize;
5029 5036          int error;
5030 5037          rnode_t *rp;
5031 5038  
5032 5039          ASSERT(!vn_is_readonly(vp));
5033 5040          ASSERT(pp != NULL);
5034 5041          ASSERT(cr != NULL);
5035 5042          ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
5036 5043  
5037 5044          rp = VTOR(vp);
5038 5045          ASSERT(rp->r_count > 0);
5039 5046  
                   /* Block number holding the page, and that block's starting offset. */
5040 5047          bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
5041 5048          lbn = pp->p_offset / bsize;
5042 5049          lbn_off = lbn * bsize;
5043 5050  
5044 5051          /*
5045 5052           * Find a kluster that fits in one block, or in
5046 5053           * one page if pages are bigger than blocks.  If
5047 5054           * there is less file space allocated than a whole
5048 5055           * page, we'll shorten the i/o request below.
5049 5056           */
5050 5057          pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
5051 5058              roundup(bsize, PAGESIZE), flags);
5052 5059  
5053 5060          /*
5054 5061           * pvn_write_kluster shouldn't have returned a page with offset
5055 5062           * behind the original page we were given.  Verify that.
5056 5063           */
5057 5064          ASSERT((pp->p_offset / bsize) >= lbn);
5058 5065  
5059 5066          /*
5060 5067           * Now pp will have the list of kept dirty pages marked for
5061 5068           * write back.  It will also handle invalidation and freeing
5062 5069           * of pages that are not dirty.  Check for page length rounding
5063 5070           * problems.
5064 5071           */
5065 5072          if (io_off + io_len > lbn_off + bsize) {
5066 5073                  ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5067 5074                  io_len = lbn_off + bsize - io_off;
5068 5075          }
5069 5076          /*
5070 5077           * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5071 5078           * consistent value of r_size. RMODINPROGRESS is set in writerp().
5072 5079           * When RMODINPROGRESS is set it indicates that a uiomove() is in
5073 5080           * progress and the r_size has not been made consistent with the
5074 5081           * new size of the file. When the uiomove() completes the r_size is
5075 5082           * updated and the RMODINPROGRESS flag is cleared.
5076 5083           *
5077 5084           * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5078 5085           * consistent value of r_size. Without this handshaking, it is
5079 5086           * possible that nfs(3)_bio() picks  up the old value of r_size
5080 5087           * before the uiomove() in writerp() completes. This will result
5081 5088           * in the write through nfs(3)_bio() being dropped.
5082 5089           *
5083 5090           * More precisely, there is a window between the time the uiomove()
5084 5091           * completes and the time the r_size is updated. If a VOP_PUTPAGE()
5085 5092           * operation intervenes in this window, the page will be picked up,
5086 5093           * because it is dirty (it will be unlocked, unless it was
5087 5094           * pagecreate'd). When the page is picked up as dirty, the dirty
5088 5095           * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5089 5096           * checked. This will still be the old size. Therefore the page will
5090 5097           * not be written out. When segmap_release() calls VOP_PUTPAGE(),
5091 5098           * the page will be found to be clean and the write will be dropped.
5092 5099           */
                   /*
                    * The flag is tested once without r_statelock as a cheap filter
                    * and again under the lock, together with the overlap test
                    * against the MAXBSIZE window starting at r_modaddr.
                    */
5093 5100          if (rp->r_flags & RMODINPROGRESS) {
5094 5101                  mutex_enter(&rp->r_statelock);
5095 5102                  if ((rp->r_flags & RMODINPROGRESS) &&
5096 5103                      rp->r_modaddr + MAXBSIZE > io_off &&
5097 5104                      rp->r_modaddr < io_off + io_len) {
5098 5105                          page_t *plist;
5099 5106                          /*
5100 5107                           * A write is in progress for this region of the file.
5101 5108                           * If we did not detect RMODINPROGRESS here then this
5102 5109                           * path through nfs_putapage() would eventually go to
5103 5110                           * nfs(3)_bio() and may not write out all of the data
5104 5111                           * in the pages. We end up losing data. So we decide
5105 5112                           * to set the modified bit on each page in the page
5106 5113                           * list and mark the rnode with RDIRTY. This write
5107 5114                           * will be restarted at some later time.
5108 5115                           */
                                   /*
                                    * Take each page off the list, mark it modified
                                    * again and release its I/O lock and page lock.
                                    */
5109 5116                          plist = pp;
5110 5117                          while (plist != NULL) {
5111 5118                                  pp = plist;
5112 5119                                  page_sub(&plist, pp);
5113 5120                                  hat_setmod(pp);
5114 5121                                  page_io_unlock(pp);
5115 5122                                  page_unlock(pp);
5116 5123                          }
5117 5124                          rp->r_flags |= RDIRTY;
5118 5125                          mutex_exit(&rp->r_statelock);
5119 5126                          if (offp)
5120 5127                                  *offp = io_off;
5121 5128                          if (lenp)
5122 5129                                  *lenp = io_len;
5123 5130                          return (0);
5124 5131                  }
5125 5132                  mutex_exit(&rp->r_statelock);
5126 5133          }
5127 5134  
                   /* Asynchronous requests are queued; synchronous ones are written now. */
5128 5135          if (flags & B_ASYNC) {
5129 5136                  error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5130 5137                      nfs3_sync_putapage);
5131 5138          } else
5132 5139                  error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5133 5140  
5134 5141          if (offp)
5135 5142                  *offp = io_off;
5136 5143          if (lenp)
5137 5144                  *lenp = io_len;
5138 5145          return (error);
5139 5146  }
5140 5147  
5141 5148  static int
5142 5149  nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5143 5150          int flags, cred_t *cr)
5144 5151  {
                   /*
                    * Write the page list pp, covering io_len bytes at io_off, with
                    * nfs3_rdwrlbn() and finish the pages with pvn_write_done().
                    * Called directly by nfs3_putapage() and passed by it to
                    * nfs_async_putapage().  ENOSPC, EDQUOT, EFBIG and EACCES set
                    * ROUTOFSPACE on the rnode and, when B_ASYNC is not set, trigger
                    * one forced invalidating retry through nfs3_putpage().
                    * Returns the error from the write, or from that retry.
                    */
5145 5152          int error;
5146 5153          rnode_t *rp;
5147 5154  
5148 5155          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5149 5156  
5150 5157          flags |= B_WRITE;
5151 5158  
5152 5159          error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5153 5160  
5154 5161          rp = VTOR(vp);
5155 5162  
                   /*
                    * The (B_INVAL|B_FORCE) test keeps the forced retry issued below
                    * from taking this branch a second time.
                    */
5156 5163          if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5157 5164              error == EACCES) &&
5158 5165              (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
5159 5166                  if (!(rp->r_flags & ROUTOFSPACE)) {
5160 5167                          mutex_enter(&rp->r_statelock);
5161 5168                          rp->r_flags |= ROUTOFSPACE;
5162 5169                          mutex_exit(&rp->r_statelock);
5163 5170                  }
5164 5171                  flags |= B_ERROR;
5165 5172                  pvn_write_done(pp, flags);
5166 5173                  /*
5167 5174                   * If this was not an async thread, then try again to
5168 5175                   * write out the pages, but this time, also destroy
5169 5176                   * them whether or not the write is successful.  This
5170 5177                   * will prevent memory from filling up with these
5171 5178                   * pages and destroying them is the only alternative
5172 5179                   * if they can't be written out.
5173 5180                   *
5174 5181                   * Don't do this if this is an async thread because
5175 5182                   * when the pages are unlocked in pvn_write_done,
5176 5183                   * some other thread could have come along, locked
5177 5184                   * them, and queued for an async thread.  It would be
5178 5185                   * possible for all of the async threads to be tied
5179 5186                   * up waiting to lock the pages again and they would
5180 5187                   * all already be locked and waiting for an async
5181 5188                   * thread to handle them.  Deadlock.
5182 5189                   */
5183 5190                  if (!(flags & B_ASYNC)) {
5184 5191                          error = nfs3_putpage(vp, io_off, io_len,
5185 5192                              B_INVAL | B_FORCE, cr, NULL);
5186 5193                  }
5187 5194          } else {
                           /*
                            * Any other outcome: flag the pages on error, or clear a
                            * previously recorded ROUTOFSPACE after a successful
                            * write.
                            */
5188 5195                  if (error)
5189 5196                          flags |= B_ERROR;
5190 5197                  else if (rp->r_flags & ROUTOFSPACE) {
5191 5198                          mutex_enter(&rp->r_statelock);
5192 5199                          rp->r_flags &= ~ROUTOFSPACE;
5193 5200                          mutex_exit(&rp->r_statelock);
5194 5201                  }
5195 5202                  pvn_write_done(pp, flags);
                           /*
                            * When free memory is below desfree, also commit the
                            * whole file; the commit's result is ignored.
                            */
5196 5203                  if (freemem < desfree)
5197 5204                          (void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr);
5198 5205          }
5199 5206  
5200 5207          return (error);
5201 5208  }
5202 5209  
5203 5210  /* ARGSUSED */
5204 5211  static int
5205 5212  nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5206 5213          size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5207 5214          cred_t *cr, caller_context_t *ct)
5208 5215  {
                   /*
                    * Map len bytes of the regular file vp, starting at off, into
                    * the address space as by creating a segvn segment at the
                    * address chosen by choose_addr() (returned through *addrp).
                    *
                    * Returns 0 on success; EIO when called from outside the mount's
                    * zone; ENOSYS for a VNOMAP vnode; ENXIO for a negative or
                    * wrapping range; ENODEV for a non-regular file; EINTR if
                    * interrupted while waiting for r_rwlock or r_lkserlock; EAGAIN
                    * for an uncached (VNOCACHE) vnode or when locks are held on a
                    * mandatory-locking file; or the error from the attribute fetch,
                    * choose_addr() or as_map().
                    */
5209 5216          struct segvn_crargs vn_a;
5210 5217          int error;
5211 5218          rnode_t *rp;
5212 5219          struct vattr va;
5213 5220  
5214 5221          if (nfs_zone() != VTOMI(vp)->mi_zone)
5215 5222                  return (EIO);
5216 5223  
5217 5224          if (vp->v_flag & VNOMAP)
5218 5225                  return (ENOSYS);
5219 5226  
                   /*
                    * off is signed but len is unsigned, so a plain "off + len < 0"
                    * is evaluated as an unsigned comparison where the two types
                    * have the same width and can never be true.  Add in the
                    * unsigned domain, where wrap-around is well defined, and test
                    * the sign of the result explicitly.
                    */
5220 5227          if (off < 0 || (offset_t)((u_offset_t)off + len) < 0)
5221 5228                  return (ENXIO);
5222 5229  
5223 5230          if (vp->v_type != VREG)
5224 5231                  return (ENODEV);
5225 5232  
5226 5233          /*
5227 5234           * If there is cached data and if close-to-open consistency
5228 5235           * checking is not turned off and if the file system is not
5229 5236           * mounted readonly, then force an over the wire getattr.
5230 5237           * Otherwise, just invoke nfs3getattr to get a copy of the
5231 5238           * attributes.  The attribute cache will be used unless it
5232 5239           * is timed out and if it is, then an over the wire getattr
5233 5240           * will be issued.
5234 5241           */
5235 5242          va.va_mask = AT_ALL;
5236 5243          if (vn_has_cached_data(vp) &&
5237 5244              !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5238 5245                  error = nfs3_getattr_otw(vp, &va, cr);
5239 5246          else
5240 5247                  error = nfs3getattr(vp, &va, cr);
5241 5248          if (error)
5242 5249                  return (error);
5243 5250  
5244 5251          /*
5245 5252           * Check to see if the vnode is currently marked as not cachable.
5246 5253           * This means portions of the file are locked (through VOP_FRLOCK).
5247 5254           * In this case the map request must be refused.  We use
5248 5255           * rp->r_lkserlock to avoid a race with concurrent lock requests.
5249 5256           */
5250 5257          rp = VTOR(vp);
5251 5258  
5252 5259          /*
5253 5260           * Atomically increment r_inmap after acquiring r_rwlock. The
5254 5261           * idea here is to acquire r_rwlock to block read/write and
5255 5262           * not to protect r_inmap. r_inmap will inform nfs3_read/write()
5256 5263           * that we are in nfs3_map(). Now, r_rwlock is acquired in order
5257 5264           * and we can prevent the deadlock that would have occurred
5258 5265           * when nfs3_addmap() would have acquired it out of order.
5259 5266           *
5260 5267           * Since we are not protecting r_inmap by any lock, we do not
5261 5268           * hold any lock when we decrement it. We atomically decrement
5262 5269           * r_inmap after we release r_lkserlock.
5263 5270           */
5264 5271  
5265 5272          if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5266 5273                  return (EINTR);
5267 5274          atomic_add_int(&rp->r_inmap, 1);
5268 5275          nfs_rw_exit(&rp->r_rwlock);
5269 5276  
5270 5277          if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
                           /* Undo the r_inmap increment before bailing out. */
5271 5278                  atomic_add_int(&rp->r_inmap, -1);
5272 5279                  return (EINTR);
5273 5280          }
5274 5281  
5275 5282          if (vp->v_flag & VNOCACHE) {
5276 5283                  error = EAGAIN;
5277 5284                  goto done;
5278 5285          }
5279 5286  
5280 5287          /*
5281 5288           * Don't allow concurrent locks and mapping if mandatory locking is
5282 5289           * enabled.
5283 5290           */
5284 5291          if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5285 5292              MANDLOCK(vp, va.va_mode)) {
5286 5293                  error = EAGAIN;
5287 5294                  goto done;
5288 5295          }
5289 5296  
                   /*
                    * The address space range lock is held from choosing the
                    * address until as_map() has returned.
                    */
5290 5297          as_rangelock(as);
5291 5298          error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5292 5299          if (error != 0) {
5293 5300                  as_rangeunlock(as);
5294 5301                  goto done;
5295 5302          }
5296 5303  
5297 5304          vn_a.vp = vp;
5298 5305          vn_a.offset = off;
5299 5306          vn_a.type = (flags & MAP_TYPE);
5300 5307          vn_a.prot = (uchar_t)prot;
5301 5308          vn_a.maxprot = (uchar_t)maxprot;
5302 5309          vn_a.flags = (flags & ~MAP_TYPE);
5303 5310          vn_a.cred = cr;
5304 5311          vn_a.amp = NULL;
5305 5312          vn_a.szc = 0;
5306 5313          vn_a.lgrp_mem_policy_flags = 0;
5307 5314  
5308 5315          error = as_map(as, *addrp, len, segvn_create, &vn_a);
5309 5316          as_rangeunlock(as);
5310 5317  
                   /* Common exit once r_lkserlock is held and r_inmap is raised. */
5311 5318  done:
5312 5319          nfs_rw_exit(&rp->r_lkserlock);
5313 5320          atomic_add_int(&rp->r_inmap, -1);
5314 5321          return (error);
5315 5322  }
5316 5323  
5317 5324  /* ARGSUSED */
           /*
            * Account for a new mapping of this file.  The number of pages covered
            * by len (btopr(len)) is atomically added to the rnode's r_mapcnt; no
            * other state is touched here.
            *
            * Returns ENOSYS if the vnode has VNOMAP set, EIO if the caller is not
            * in the zone that owns the mount (mi_zone), and 0 otherwise.
            */
5318 5325  static int
5319 5326  nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5320 5327          size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5321 5328          cred_t *cr, caller_context_t *ct)
5322 5329  {
5323 5330          rnode_t *rp;
5324 5331  
5325 5332          if (vp->v_flag & VNOMAP)
5326 5333                  return (ENOSYS);
5327 5334          if (nfs_zone() != VTOMI(vp)->mi_zone)
5328 5335                  return (EIO);
5329 5336  
5330 5337          rp = VTOR(vp);
                   /* r_mapcnt is updated atomically; no rnode lock is taken here. */
5331 5338          atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5332 5339  
5333 5340          return (0);
5334 5341  }
5335 5342  
5336 5343  /* ARGSUSED */
           /*
            * Handle a record-lock request (F_GETLK, F_SETLK or F_SETLKW) on an
            * NFS version 3 file.
            *
            * After validating cmd, bfp->l_type and the lock range, the request is
            * either handed to fs_frlock() (mount has MI_LLOCK set) or, with
            * r_lkserlock held as writer, passed to lm4_frlock().  For anything
            * other than F_GETLK the function first waits for r_count to drop to
            * zero and flushes/invalidates the file's pages.
            *
            * Returns:
            *      EIO     caller is not in the zone that owns the mount
            *      EINVAL  cmd or bfp->l_type is not one of the accepted values
            *      EBADF   a read/write lock is being set but flag lacks
            *              FREAD/FWRITE respectively
            *      EAGAIN  lm_safelock() refused the request
            *      EINTR   interrupted while acquiring r_lkserlock or while waiting
            *              for r_count to reach zero
            *      ENOLCK  the page flush failed and the request is not an unlock
            *      otherwise the value returned by flk_convert_lock_data(),
            *      flk_check_lock_data(), fs_frlock() or lm4_frlock().
            */
5337 5344  static int
5338 5345  nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5339 5346          offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5340 5347          caller_context_t *ct)
5341 5348  {
5342 5349          netobj lm_fh3;
5343 5350          int rc;
5344 5351          u_offset_t start, end;
5345 5352          rnode_t *rp;
5346 5353          int error = 0, intr = INTR(vp);
5347 5354  
5348 5355          if (nfs_zone() != VTOMI(vp)->mi_zone)
5349 5356                  return (EIO);
5350 5357          /* check for valid cmd parameter */
5351 5358          if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5352 5359                  return (EINVAL);
5353 5360  
5354 5361          /* Verify l_type. */
5355 5362          switch (bfp->l_type) {
5356 5363          case F_RDLCK:
5357 5364                  if (cmd != F_GETLK && !(flag & FREAD))
5358 5365                          return (EBADF);
5359 5366                  break;
5360 5367          case F_WRLCK:
5361 5368                  if (cmd != F_GETLK && !(flag & FWRITE))
5362 5369                          return (EBADF);
5363 5370                  break;
5364 5371          case F_UNLCK:
                           /* Unlock requests are processed non-interruptibly. */
5365 5372                  intr = 0;
5366 5373                  break;
5367 5374  
5368 5375          default:
5369 5376                  return (EINVAL);
5370 5377          }
5371 5378  
5372 5379          /* check the validity of the lock range */
5373 5380          if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5374 5381                  return (rc);
5375 5382          if (rc = flk_check_lock_data(start, end, MAXEND))
5376 5383                  return (rc);
                   /*
                    * rc is zero from here on.  The "rc != 0" test after the r_count
                    * wait loop below relies on that.
                    */
5377 5384  
5378 5385          /*
5379 5386           * If the filesystem is mounted using local locking, pass the
5380 5387           * request off to the local locking code.
5381 5388           */
5382 5389          if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5383 5390                  if (cmd == F_SETLK || cmd == F_SETLKW) {
5384 5391                          /*
5385 5392                           * For complete safety, we should be holding
5386 5393                           * r_lkserlock.  However, we can't call
5387 5394                           * lm_safelock and then fs_frlock while
5388 5395                           * holding r_lkserlock, so just invoke
5389 5396                           * lm_safelock and expect that this will
5390 5397                           * catch enough of the cases.
5391 5398                           */
5392 5399                          if (!lm_safelock(vp, bfp, cr))
5393 5400                                  return (EAGAIN);
5394 5401                  }
5395 5402                  return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5396 5403          }
5397 5404  
5398 5405          rp = VTOR(vp);
5399 5406  
5400 5407          /*
5401 5408           * Check whether the given lock request can proceed, given the
5402 5409           * current file mappings.
5403 5410           */
5404 5411          if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5405 5412                  return (EINTR);
5406 5413          if (cmd == F_SETLK || cmd == F_SETLKW) {
5407 5414                  if (!lm_safelock(vp, bfp, cr)) {
5408 5415                          rc = EAGAIN;
5409 5416                          goto done;
5410 5417                  }
5411 5418          }
5412 5419  
5413 5420          /*
5414 5421           * Flush the cache after waiting for async I/O to finish.  For new
5415 5422           * locks, this is so that the process gets the latest bits from the
5416 5423           * server.  For unlocks, this is so that other clients see the
5417 5424           * latest bits once the file has been unlocked.  If currently dirty
5418 5425           * pages can't be flushed, then don't allow a lock to be set.  But
5419 5426           * allow unlocks to succeed, to avoid having orphan locks on the
5420 5427           * server.
5421 5428           */
5422 5429          if (cmd != F_GETLK) {
5423 5430                  mutex_enter(&rp->r_statelock);
5424 5431                  while (rp->r_count > 0) {
5425 5432                          if (intr) {
5426 5433                                  klwp_t *lwp = ttolwp(curthread);
5427 5434  
                                           /*
                                            * lwp_nostop is raised for the duration
                                            * of the interruptible wait and lowered
                                            * again on every exit from it.
                                            */
5428 5435                                  if (lwp != NULL)
5429 5436                                          lwp->lwp_nostop++;
5430 5437                                  if (cv_wait_sig(&rp->r_cv,
5431 5438                                      &rp->r_statelock) == 0) {
5432 5439                                          if (lwp != NULL)
5433 5440                                                  lwp->lwp_nostop--;
5434 5441                                          rc = EINTR;
5435 5442                                          break;
5436 5443                                  }
5437 5444                                  if (lwp != NULL)
5438 5445                                          lwp->lwp_nostop--;
5439 5446                          } else
5440 5447                                  cv_wait(&rp->r_cv, &rp->r_statelock);
5441 5448                  }
5442 5449                  mutex_exit(&rp->r_statelock);
5443 5450                  if (rc != 0)
5444 5451                          goto done;
5445 5452                  error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
5446 5453                  if (error) {
                                   /*
                                    * Out-of-space/quota failures are remembered in
                                    * r_error unless an error is already recorded.
                                    */
5447 5454                          if (error == ENOSPC || error == EDQUOT) {
5448 5455                                  mutex_enter(&rp->r_statelock);
5449 5456                                  if (!rp->r_error)
5450 5457                                          rp->r_error = error;
5451 5458                                  mutex_exit(&rp->r_statelock);
5452 5459                          }
5453 5460                          if (bfp->l_type != F_UNLCK) {
5454 5461                                  rc = ENOLCK;
5455 5462                                  goto done;
5456 5463                          }
5457 5464                  }
5458 5465          }
5459 5466  
5460 5467          lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5461 5468          lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5462 5469  
5463 5470          /*
5464 5471           * Call the lock manager to do the real work of contacting
5465 5472           * the server and obtaining the lock.
5466 5473           */
5467 5474          rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5468 5475  
5469 5476          if (rc == 0)
5470 5477                  nfs_lockcompletion(vp, cmd);
5471 5478  
5472 5479  done:
                   /* Every path that reaches "done" holds r_lkserlock as writer. */
5473 5480          nfs_rw_exit(&rp->r_lkserlock);
5474 5481          return (rc);
5475 5482  }
5476 5483  
5477 5484  /*
5478 5485   * Free storage space associated with the specified vnode.  The portion
5479 5486   * to be freed is specified by bfp->l_start and bfp->l_len (already
5480 5487   * normalized to a "whence" of 0).
5481 5488   *
5482 5489   * This is an experimental facility whose continued existence is not
5483 5490   * guaranteed.  Currently, we only support the special case
5484 5491   * of l_len == 0, meaning free to end of file.
            *
            * Returns EINVAL if cmd is not F_FREESP or if bfp->l_len is non-zero,
            * EIO if the caller is not in the zone that owns the mount, and
            * otherwise the result of convoff(), nfs3getattr() or nfs3setattr().
            * When the file size already equals bfp->l_start no setattr is issued
            * and 0 is returned.
5485 5492   */
5486 5493  /* ARGSUSED */
5487 5494  static int
5488 5495  nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5489 5496          offset_t offset, cred_t *cr, caller_context_t *ct)
5490 5497  {
5491 5498          int error;
5492 5499  
5493 5500          ASSERT(vp->v_type == VREG);
5494 5501          if (cmd != F_FREESP)
5495 5502                  return (EINVAL);
5496 5503          if (nfs_zone() != VTOMI(vp)->mi_zone)
5497 5504                  return (EIO);
5498 5505  
5499 5506          error = convoff(vp, bfp, 0, offset);
5500 5507          if (!error) {
5501 5508                  ASSERT(bfp->l_start >= 0);
5502 5509                  if (bfp->l_len == 0) {
5503 5510                          struct vattr va;
5504 5511  
5505 5512                          /*
5506 5513                           * ftruncate should not change the ctime and
5507 5514                           * mtime if we truncate the file to its
5508 5515                           * previous size.
5509 5516                           */
5510 5517                          va.va_mask = AT_SIZE;
5511 5518                          error = nfs3getattr(vp, &va, cr);
5512 5519                          if (error || va.va_size == bfp->l_start)
5513 5520                                  return (error);
                                   /* Size differs: set the new size (AT_SIZE only). */
5514 5521                          va.va_mask = AT_SIZE;
5515 5522                          va.va_size = bfp->l_start;
5516 5523                          error = nfs3setattr(vp, &va, 0, cr);
5517 5524                  } else
5518 5525                          error = EINVAL;
5519 5526          }
5520 5527  
5521 5528          return (error);
5522 5529  }
5523 5530  
5524 5531  /* ARGSUSED */
           /*
            * Real-vnode lookup entry point.  This implementation unconditionally
            * returns EINVAL and never stores anything through vpp.
            */
5525 5532  static int
5526 5533  nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5527 5534  {
5528 5535  
5529 5536          return (EINVAL);
5530 5537  }
5531 5538  
5532 5539  /*
5533 5540   * Setup and add an address space callback to do the work of the delmap call.
5534 5541   * The callback will (and must be) deleted in the actual callback function.
5535 5542   *
5536 5543   * This is done in order to take care of the problem that we have with holding
5537 5544   * the address space's a_lock for a long period of time (e.g. if the NFS server
5538 5545   * is down).  Callbacks will be executed in the address space code while the
5539 5546   * a_lock is not held.  Holding the address space's a_lock causes things such
5540 5547   * as ps and fork to hang because they are trying to acquire this lock as well.
            *
            * Returns ENOSYS if the vnode has VNOMAP set.  If this caller is already
            * on the rnode's delmap caller list, returns the error recorded by the
            * callback (with EAGAIN mapped to 0).  Otherwise a callback is queued and
            * the function returns EAGAIN, or the error from as_add_callback() if
            * that call fails.
5541 5548   */
5542 5549  /* ARGSUSED */
5543 5550  static int
5544 5551  nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5545 5552          size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5546 5553          cred_t *cr, caller_context_t *ct)
5547 5554  {
5548 5555          int                     caller_found;
5549 5556          int                     error;
5550 5557          rnode_t                 *rp;
5551 5558          nfs_delmap_args_t       *dmapp;
5552 5559          nfs_delmapcall_t        *delmap_call;
5553 5560  
5554 5561          if (vp->v_flag & VNOMAP)
5555 5562                  return (ENOSYS);
5556 5563          /*
5557 5564           * A process may not change zones if it has NFS pages mmap'ed
5558 5565           * in, so we can't legitimately get here from the wrong zone.
5559 5566           */
5560 5567          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5561 5568  
5562 5569          rp = VTOR(vp);
5563 5570  
5564 5571          /*
5565 5572           * The way that the address space of this process deletes its mapping
5566 5573           * of this file is via the following call chains:
5567 5574           * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5568 5575           * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5569 5576           *
5570 5577           * With the use of address space callbacks we are allowed to drop the
5571 5578           * address space lock, a_lock, while executing the NFS operations that
5572 5579           * need to go over the wire.  Returning EAGAIN to the caller of this
5573 5580           * function is what drives the execution of the callback that we add
5574 5581           * below.  The callback will be executed by the address space code
5575 5582           * after dropping the a_lock.  When the callback is finished, since
5576 5583           * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5577 5584           * is called again on the same segment to finish the rest of the work
5578 5585           * that needs to happen during unmapping.
5579 5586           *
5580 5587           * This action of calling back into the segment driver causes
5581 5588           * nfs3_delmap() to get called again, but since the callback was
5582 5589           * already executed at this point, it already did the work and there
5583 5590           * is nothing left for us to do.
5584 5591           *
5585 5592           * To Summarize:
5586 5593           * - The first time nfs3_delmap is called by the current thread is when
5587 5594           * we add the caller associated with this delmap to the delmap caller
5588 5595           * list, add the callback, and return EAGAIN.
5589 5596           * - The second time in this call chain when nfs3_delmap is called we
5590 5597           * will find this caller in the delmap caller list and realize there
5591 5598           * is no more work to do thus removing this caller from the list and
5592 5599           * returning the error that was set in the callback execution.
5593 5600           */
5594 5601          caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5595 5602          if (caller_found) {
5596 5603                  /*
5597 5604                   * 'error' is from the actual delmap operations.  To avoid
5598 5605                   * hangs, we need to handle the return of EAGAIN differently
5599 5606                   * since this is what drives the callback execution.
5600 5607                   * In this case, we don't want to return EAGAIN and do the
5601 5608                   * callback execution because there are none to execute.
5602 5609                   */
5603 5610                  if (error == EAGAIN)
5604 5611                          return (0);
5605 5612                  else
5606 5613                          return (error);
5607 5614          }
5608 5615  
5609 5616          /* current caller was not in the list */
5610 5617          delmap_call = nfs_init_delmapcall();
5611 5618  
5612 5619          mutex_enter(&rp->r_statelock);
5613 5620          list_insert_tail(&rp->r_indelmap, delmap_call);
5614 5621          mutex_exit(&rp->r_statelock);
5615 5622  
                   /*
                    * Snapshot the delmap arguments for the callback, which frees
                    * this structure when it runs.
                    */
5616 5623          dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5617 5624  
5618 5625          dmapp->vp = vp;
5619 5626          dmapp->off = off;
5620 5627          dmapp->addr = addr;
5621 5628          dmapp->len = len;
5622 5629          dmapp->prot = prot;
5623 5630          dmapp->maxprot = maxprot;
5624 5631          dmapp->flags = flags;
5625 5632          dmapp->cr = cr;
5626 5633          dmapp->caller = delmap_call;
5627 5634  
                   /*
                    * NOTE(review): if as_add_callback() returns an error, dmapp and
                    * the delmap_call entry queued above are not released in this
                    * function -- confirm whether that path can actually occur with
                    * KM_SLEEP and who is expected to clean up.
                    */
5628 5635          error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5629 5636              AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5630 5637  
5631 5638          return (error ? error : EAGAIN);
5632 5639  }
5633 5640  
5634 5641  /*
5635 5642   * Remove some pages from an mmap'd vnode.  Just update the
5636 5643   * count of pages.  If doing close-to-open, then flush and
5637 5644   * commit all of the pages associated with this file.
5638 5645   * Otherwise, start an asynchronous page flush to write out
5639 5646   * any dirty pages.  This will also associate a credential
5640 5647   * with the rnode which can be used to write the pages.
            *
            * 'arg' is the nfs_delmap_args_t built by nfs3_delmap().  Before
            * returning, this function stores the resulting error in
            * dmapp->caller->error, deletes the address space callback and frees
            * the argument structure.
5641 5648   */
5642 5649  /* ARGSUSED */
5643 5650  static void
5644 5651  nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5645 5652  {
5646 5653          int                     error;
5647 5654          rnode_t                 *rp;
5648 5655          mntinfo_t               *mi;
5649 5656          nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
5650 5657  
5651 5658          rp = VTOR(dmapp->vp);
5652 5659          mi = VTOMI(dmapp->vp);
5653 5660  
                   /* Undo the page accounting that nfs3_addmap() performed. */
5654 5661          atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5655 5662          ASSERT(rp->r_mapcnt >= 0);
5656 5663  
5657 5664          /*
5658 5665           * Initiate a page flush and potential commit if there are
5659 5666           * pages, the file system was not mounted readonly, the segment
5660 5667           * was mapped shared, and the pages themselves were writeable.
5661 5668           */
5662 5669          if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5663 5670              dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5664 5671                  mutex_enter(&rp->r_statelock);
5665 5672                  rp->r_flags |= RDIRTY;
5666 5673                  mutex_exit(&rp->r_statelock);
5667 5674                  /*
5668 5675                   * If this is a cross-zone access a sync putpage won't work, so
5669 5676                   * the best we can do is try an async putpage.  That seems
5670 5677                   * better than something more draconian such as discarding the
5671 5678                   * dirty pages.
5672 5679                   */
5673 5680                  if ((mi->mi_flags & MI_NOCTO) ||
5674 5681                      nfs_zone() != mi->mi_zone)
5675 5682                          error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5676 5683                              B_ASYNC, dmapp->cr, NULL);
5677 5684                  else
5678 5685                          error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5679 5686                              dmapp->len, dmapp->cr);
                           /*
                            * If the flush itself reported success, pick up (and
                            * clear) any error previously recorded on the rnode.
                            */
5680 5687                  if (!error) {
5681 5688                          mutex_enter(&rp->r_statelock);
5682 5689                          error = rp->r_error;
5683 5690                          rp->r_error = 0;
5684 5691                          mutex_exit(&rp->r_statelock);
5685 5692                  }
5686 5693          } else
5687 5694                  error = 0;
5688 5695  
                   /*
                    * With direct I/O enabled on the rnode or the mount, also
                    * invalidate the pages in the unmapped range.  The result of
                    * this putpage is deliberately ignored.
                    */
5689 5696          if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5690 5697                  (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5691 5698                      B_INVAL, dmapp->cr, NULL);
5692 5699  
                   /* Publish the result for the second nfs3_delmap() pass. */
5693 5700          dmapp->caller->error = error;
5694 5701          (void) as_delete_callback(as, arg);
5695 5702          kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5696 5703  }
5697 5704  
5698 5705  static int nfs3_pathconf_disable_cache = 0;
           /*
            * When nfs3_pathconf_disable_cache is non-zero, nfs3_pathconf() frees
            * any cached PATHCONF reply on the rnode and does not cache new ones.
            */
5699 5706  
5700 5707  #ifdef DEBUG
5701 5708  static int nfs3_pathconf_cache_hits = 0;
5702 5709  static int nfs3_pathconf_cache_misses = 0;
5703 5710  #endif
5704 5711  
5705 5712  /* ARGSUSED */
           /*
            * Answer a pathconf query for an NFS version 3 file.
            *
            * - _PC_FILESIZEBITS is computed locally from the mount's
            *   mi_maxfilesize: 0 if that value is 0, otherwise one more than the
            *   position of its highest set bit.
            * - _PC_ACL_ENABLED always yields _ACL_ACLENT_ENABLED.
            * - _PC_XATTR_EXISTS looks up the extended attribute directory and asks
            *   do_xattr_exists_check() whether it has entries.  ENOENT from the
            *   lookup is reported as success with *valp == 0; any other failure is
            *   returned to the caller.
            * - _PC_LINK_MAX, _PC_NAME_MAX, _PC_PATH_MAX, _PC_SYMLINK_MAX,
            *   _PC_CHOWN_RESTRICTED and _PC_NO_TRUNC are answered from the reply
            *   cached in rp->r_pathconf when present, otherwise from an
            *   over-the-wire NFSPROC3_PATHCONF call whose reply is then cached
            *   (unless caching is disabled or the allocation fails).
            *
            * Returns 0 on success with *valp set, EIO if the caller is not in the
            * zone that owns the mount, EINTR if interrupted while acquiring
            * r_rwlock, EINVAL for an unsupported cmd, or the error from the
            * underlying lookup / RPC.
            */
5706 5713  static int
5707 5714  nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5708 5715          caller_context_t *ct)
5709 5716  {
5710 5717          int error;
5711 5718          PATHCONF3args args;
5712 5719          PATHCONF3res res;
5713 5720          int douprintf;
5714 5721          failinfo_t fi;
5715 5722          rnode_t *rp;
5716 5723          hrtime_t t;
5717 5724  
5718 5725          if (nfs_zone() != VTOMI(vp)->mi_zone)
5719 5726                  return (EIO);
5720 5727          /*
5721 5728           * Large file spec - need to base answer on info stored
5722 5729           * on original FSINFO response.
5723 5730           */
5724 5731          if (cmd == _PC_FILESIZEBITS) {
5725 5732                  unsigned long long ll;
5726 5733                  long l = 1;
5727 5734  
5728 5735                  ll = VTOMI(vp)->mi_maxfilesize;
5729 5736  
5730 5737                  if (ll == 0) {
5731 5738                          *valp = 0;
5732 5739                          return (0);
5733 5740                  }
5734 5741  
                           /*
                            * Binary search for the highest set bit: each step adds
                            * the width of the upper half to l when that half is
                            * non-empty and then narrows ll to it.
                            */
5735 5742                  if (ll & 0xffffffff00000000) {
5736 5743                          l += 32; ll >>= 32;
5737 5744                  }
5738 5745                  if (ll & 0xffff0000) {
5739 5746                          l += 16; ll >>= 16;
5740 5747                  }
5741 5748                  if (ll & 0xff00) {
5742 5749                          l += 8; ll >>= 8;
5743 5750                  }
5744 5751                  if (ll & 0xf0) {
5745 5752                          l += 4; ll >>= 4;
5746 5753                  }
5747 5754                  if (ll & 0xc) {
5748 5755                          l += 2; ll >>= 2;
5749 5756                  }
5750 5757                  if (ll & 0x2)
5751 5758                          l += 2;
5752 5759                  else if (ll & 0x1)
5753 5760                          l += 1;
5754 5761                  *valp = l;
5755 5762                  return (0);
5756 5763          }
5757 5764  
5758 5765          if (cmd == _PC_ACL_ENABLED) {
5759 5766                  *valp = _ACL_ACLENT_ENABLED;
5760 5767                  return (0);
5761 5768          }
5762 5769  
5763 5770          if (cmd == _PC_XATTR_EXISTS) {
5764 5771                  error = 0;
5765 5772                  *valp = 0;
5766 5773                  if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5767 5774                          vnode_t *avp;
5768 5775                          rnode_t *rp;
                                   /*
                                    * Use the function-level 'error' here.  A
                                    * block-local 'error' would shadow it and the
                                    * return at the end of this branch would then
                                    * always report success, hiding lookup failures.
                                    */
5770 5777                          mntinfo_t *mi = VTOMI(vp);
5771 5778  
5772 5779                          if (!(mi->mi_flags & MI_EXTATTR))
5773 5780                                  return (0);
5774 5781  
5775 5782                          rp = VTOR(vp);
5776 5783                          if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5777 5784                              INTR(vp)))
5778 5785                                  return (EINTR);
5779 5786  
                                   /* Try the name cache first, then go to the server. */
5780 5787                          error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5781 5788                          if (error || avp == NULL)
5782 5789                                  error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5783 5790  
5784 5791                          nfs_rw_exit(&rp->r_rwlock);
5785 5792  
5786 5793                          if (error == 0 && avp != NULL) {
5787 5794                                  error = do_xattr_exists_check(avp, valp, cr);
5788 5795                                  VN_RELE(avp);
5789 5796                          } else if (error == ENOENT) {
                                           /* No attribute directory: not an error. */
5790 5797                                  error = 0;
5791 5798                                  *valp = 0;
5792 5799                          }
5793 5800                  }
5794 5801                  return (error);
5795 5802          }
5796 5803  
5797 5804          rp = VTOR(vp);
                   /*
                    * Unlocked peek to skip the mutex when nothing is cached; the
                    * pointer is re-checked under r_statelock before it is used.
                    */
5798 5805          if (rp->r_pathconf != NULL) {
5799 5806                  mutex_enter(&rp->r_statelock);
5800 5807                  if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5801 5808                          kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5802 5809                          rp->r_pathconf = NULL;
5803 5810                  }
5804 5811                  if (rp->r_pathconf != NULL) {
5805 5812                          error = 0;
5806 5813                          switch (cmd) {
5807 5814                          case _PC_LINK_MAX:
5808 5815                                  *valp = rp->r_pathconf->link_max;
5809 5816                                  break;
5810 5817                          case _PC_NAME_MAX:
5811 5818                                  *valp = rp->r_pathconf->name_max;
5812 5819                                  break;
5813 5820                          case _PC_PATH_MAX:
5814 5821                          case _PC_SYMLINK_MAX:
5815 5822                                  *valp = MAXPATHLEN;
5816 5823                                  break;
5817 5824                          case _PC_CHOWN_RESTRICTED:
5818 5825                                  *valp = rp->r_pathconf->chown_restricted;
5819 5826                                  break;
5820 5827                          case _PC_NO_TRUNC:
5821 5828                                  *valp = rp->r_pathconf->no_trunc;
5822 5829                                  break;
5823 5830                          default:
5824 5831                                  error = EINVAL;
5825 5832                                  break;
5826 5833                          }
5827 5834                          mutex_exit(&rp->r_statelock);
5828 5835  #ifdef DEBUG
5829 5836                          nfs3_pathconf_cache_hits++;
5830 5837  #endif
5831 5838                          return (error);
5832 5839                  }
5833 5840                  mutex_exit(&rp->r_statelock);
5834 5841          }
5835 5842  #ifdef DEBUG
5836 5843          nfs3_pathconf_cache_misses++;
5837 5844  #endif
5838 5845  
                   /* Cache miss: issue NFSPROC3_PATHCONF for this file handle. */
5839 5846          args.object = *VTOFH3(vp);
5840 5847          fi.vp = vp;
5841 5848          fi.fhp = (caddr_t)&args.object;
5842 5849          fi.copyproc = nfs3copyfh;
5843 5850          fi.lookupproc = nfs3lookup;
5844 5851          fi.xattrdirproc = acl_getxattrdir3;
5845 5852  
5846 5853          douprintf = 1;
5847 5854  
                   /* Timestamp taken before the call; passed to the attribute cache. */
5848 5855          t = gethrtime();
5849 5856  
5850 5857          error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5851 5858              xdr_nfs_fh3, (caddr_t)&args,
5852 5859              xdr_PATHCONF3res, (caddr_t)&res, cr,
5853 5860              &douprintf, &res.status, 0, &fi);
5854 5861  
5855 5862          if (error)
5856 5863                  return (error);
5857 5864  
5858 5865          error = geterrno3(res.status);
5859 5866  
5860 5867          if (!error) {
5861 5868                  nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5862 5869                  if (!nfs3_pathconf_disable_cache) {
5863 5870                          mutex_enter(&rp->r_statelock);
                                   /*
                                    * Caching is best-effort: the allocation is
                                    * KM_NOSLEEP and a failure simply leaves the
                                    * cache empty.
                                    */
5864 5871                          if (rp->r_pathconf == NULL) {
5865 5872                                  rp->r_pathconf = kmem_alloc(
5866 5873                                      sizeof (*rp->r_pathconf), KM_NOSLEEP);
5867 5874                                  if (rp->r_pathconf != NULL)
5868 5875                                          *rp->r_pathconf = res.resok.info;
5869 5876                          }
5870 5877                          mutex_exit(&rp->r_statelock);
5871 5878                  }
5872 5879                  switch (cmd) {
5873 5880                  case _PC_LINK_MAX:
5874 5881                          *valp = res.resok.info.link_max;
5875 5882                          break;
5876 5883                  case _PC_NAME_MAX:
5877 5884                          *valp = res.resok.info.name_max;
5878 5885                          break;
5879 5886                  case _PC_PATH_MAX:
5880 5887                  case _PC_SYMLINK_MAX:
5881 5888                          *valp = MAXPATHLEN;
5882 5889                          break;
5883 5890                  case _PC_CHOWN_RESTRICTED:
5884 5891                          *valp = res.resok.info.chown_restricted;
5885 5892                          break;
5886 5893                  case _PC_NO_TRUNC:
5887 5894                          *valp = res.resok.info.no_trunc;
5888 5895                          break;
5889 5896                  default:
5890 5897                          return (EINVAL);
5891 5898                  }
5892 5899          } else {
5893 5900                  nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5894 5901                  PURGE_STALE_FH(error, vp, cr);
5895 5902          }
5896 5903  
5897 5904          return (error);
5898 5905  }
5899 5906  
5900 5907  /*
5901 5908   * Called by async thread to do synchronous pageio. Do the i/o, wait
5902 5909   * for it to complete, and cleanup the page list when done.
            *
            * The transfer is done by nfs3_rdwrlbn().  Afterwards the page list is
            * handed to pvn_read_done() (B_READ set in flags) or pvn_write_done(),
            * with B_ERROR added to flags when the transfer failed.  Returns the
            * error from nfs3_rdwrlbn().
5903 5910   */
5904 5911  static int
5905 5912  nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5906 5913          int flags, cred_t *cr)
5907 5914  {
5908 5915          int error;
5909 5916  
5910 5917          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5911 5918          error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5912 5919          if (flags & B_READ)
5913 5920                  pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5914 5921          else
5915 5922                  pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5916 5923          return (error);
5917 5924  }
5918 5925  
5919 5926  /* ARGSUSED */
           /*
            * Perform page I/O on the given page list.
            *
            * Requests with B_ASYNC set are handed to nfs_async_pageio() with
            * nfs3_sync_pageio() as the worker routine; all others call
            * nfs3_rdwrlbn() directly.  The rnode's r_count is raised for the
            * duration of the call and r_cv is broadcast when it is lowered again.
            *
            * Returns EINVAL if pp is NULL, EIO for a non-async request issued from
            * a zone other than the one that owns the mount, and otherwise the
            * result of the I/O routine that was invoked.
            */
5920 5927  static int
5921 5928  nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5922 5929          int flags, cred_t *cr, caller_context_t *ct)
5923 5930  {
5924 5931          int error;
5925 5932          rnode_t *rp;
5926 5933  
5927 5934          if (pp == NULL)
5928 5935                  return (EINVAL);
                   /* The zone check applies only to synchronous requests. */
5929 5936          if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5930 5937                  return (EIO);
5931 5938  
5932 5939          rp = VTOR(vp);
                   /* Mark I/O in progress; nfs3_frlock() waits for r_count == 0. */
5933 5940          mutex_enter(&rp->r_statelock);
5934 5941          rp->r_count++;
5935 5942          mutex_exit(&rp->r_statelock);
5936 5943  
5937 5944          if (flags & B_ASYNC) {
5938 5945                  error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5939 5946                      nfs3_sync_pageio);
5940 5947          } else
5941 5948                  error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
                   /* Drop the in-progress count and wake any waiters on r_cv. */
5942 5949          mutex_enter(&rp->r_statelock);
5943 5950          rp->r_count--;
5944 5951          cv_broadcast(&rp->r_cv);
5945 5952          mutex_exit(&rp->r_statelock);
5946 5953          return (error);
5947 5954  }
5948 5955  
5949 5956  /* ARGSUSED */
5950 5957  static void
5951 5958  nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5952 5959          caller_context_t *ct)
5953 5960  {
5954 5961          int error;
5955 5962          rnode_t *rp;
5956 5963          page_t *plist;
5957 5964          page_t *pptr;
5958 5965          offset3 offset;
5959 5966          count3 len;
5960 5967          k_sigset_t smask;
5961 5968  
5962 5969          /*
5963 5970           * We should get called with fl equal to either B_FREE or
5964 5971           * B_INVAL.  Any other value is illegal.
5965 5972           *
5966 5973           * The page that we are either supposed to free or destroy
5967 5974           * should be exclusive locked and its io lock should not
5968 5975           * be held.
5969 5976           */
5970 5977          ASSERT(fl == B_FREE || fl == B_INVAL);
5971 5978          ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5972 5979          rp = VTOR(vp);
5973 5980  
5974 5981          /*
5975 5982           * If the page doesn't need to be committed or we shouldn't
5976 5983           * even bother attempting to commit it, then just make sure
5977 5984           * that the p_fsdata byte is clear and then either free or
5978 5985           * destroy the page as appropriate.
5979 5986           */
5980 5987          if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5981 5988                  pp->p_fsdata = C_NOCOMMIT;
5982 5989                  if (fl == B_FREE)
5983 5990                          page_free(pp, dn);
5984 5991                  else
5985 5992                          page_destroy(pp, dn);
5986 5993                  return;
5987 5994          }
5988 5995  
5989 5996          /*
5990 5997           * If there is a page invalidation operation going on, then
5991 5998           * if this is one of the pages being destroyed, then just
5992 5999           * clear the p_fsdata byte and then either free or destroy
5993 6000           * the page as appropriate.
5994 6001           */
5995 6002          mutex_enter(&rp->r_statelock);
5996 6003          if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
5997 6004                  mutex_exit(&rp->r_statelock);
5998 6005                  pp->p_fsdata = C_NOCOMMIT;
5999 6006                  if (fl == B_FREE)
6000 6007                          page_free(pp, dn);
6001 6008                  else
6002 6009                          page_destroy(pp, dn);
6003 6010                  return;
6004 6011          }
6005 6012  
6006 6013          /*
6007 6014           * If we are freeing this page and someone else is already
6008 6015           * waiting to do a commit, then just unlock the page and
6009 6016           * return.  That other thread will take care of commiting
6010 6017           * this page.  The page can be freed sometime after the
6011 6018           * commit has finished.  Otherwise, if the page is marked
6012 6019           * as delay commit, then we may be getting called from
6013 6020           * pvn_write_done, one page at a time.   This could result
6014 6021           * in one commit per page, so we end up doing lots of small
6015 6022           * commits instead of fewer larger commits.  This is bad,
6016 6023           * we want do as few commits as possible.
6017 6024           */
6018 6025          if (fl == B_FREE) {
6019 6026                  if (rp->r_flags & RCOMMITWAIT) {
6020 6027                          page_unlock(pp);
6021 6028                          mutex_exit(&rp->r_statelock);
6022 6029                          return;
6023 6030                  }
6024 6031                  if (pp->p_fsdata == C_DELAYCOMMIT) {
6025 6032                          pp->p_fsdata = C_COMMIT;
6026 6033                          page_unlock(pp);
6027 6034                          mutex_exit(&rp->r_statelock);
6028 6035                          return;
6029 6036                  }
6030 6037          }
6031 6038  
6032 6039          /*
6033 6040           * Check to see if there is a signal which would prevent an
6034 6041           * attempt to commit the pages from being successful.  If so,
6035 6042           * then don't bother with all of the work to gather pages and
6036 6043           * generate the unsuccessful RPC.  Just return from here and
6037 6044           * let the page be committed at some later time.
6038 6045           */
6039 6046          sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
6040 6047          if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
6041 6048                  sigunintr(&smask);
6042 6049                  page_unlock(pp);
6043 6050                  mutex_exit(&rp->r_statelock);
6044 6051                  return;
6045 6052          }
6046 6053          sigunintr(&smask);
6047 6054  
6048 6055          /*
6049 6056           * We are starting to need to commit pages, so let's try
6050 6057           * to commit as many as possible at once to reduce the
6051 6058           * overhead.
6052 6059           *
6053 6060           * Set the `commit inprogress' state bit.  We must
6054 6061           * first wait until any current one finishes.  Then
6055 6062           * we initialize the c_pages list with this page.
6056 6063           */
6057 6064          while (rp->r_flags & RCOMMIT) {
6058 6065                  rp->r_flags |= RCOMMITWAIT;
6059 6066                  cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6060 6067                  rp->r_flags &= ~RCOMMITWAIT;
6061 6068          }
6062 6069          rp->r_flags |= RCOMMIT;
6063 6070          mutex_exit(&rp->r_statelock);
6064 6071          ASSERT(rp->r_commit.c_pages == NULL);
6065 6072          rp->r_commit.c_pages = pp;
6066 6073          rp->r_commit.c_commbase = (offset3)pp->p_offset;
6067 6074          rp->r_commit.c_commlen = PAGESIZE;
6068 6075  
6069 6076          /*
6070 6077           * Gather together all other pages which can be committed.
6071 6078           * They will all be chained off r_commit.c_pages.
6072 6079           */
6073 6080          nfs3_get_commit(vp);
6074 6081  
6075 6082          /*
6076 6083           * Clear the `commit inprogress' status and disconnect
6077 6084           * the list of pages to be committed from the rnode.
6078 6085           * At this same time, we also save the starting offset
6079 6086           * and length of data to be committed on the server.
6080 6087           */
6081 6088          plist = rp->r_commit.c_pages;
6082 6089          rp->r_commit.c_pages = NULL;
6083 6090          offset = rp->r_commit.c_commbase;
6084 6091          len = rp->r_commit.c_commlen;
6085 6092          mutex_enter(&rp->r_statelock);
6086 6093          rp->r_flags &= ~RCOMMIT;
6087 6094          cv_broadcast(&rp->r_commit.c_cv);
6088 6095          mutex_exit(&rp->r_statelock);
6089 6096  
6090 6097          if (curproc == proc_pageout || curproc == proc_fsflush ||
6091 6098              nfs_zone() != VTOMI(vp)->mi_zone) {
6092 6099                  nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6093 6100                  return;
6094 6101          }
6095 6102  
6096 6103          /*
6097 6104           * Actually generate the COMMIT3 over the wire operation.
6098 6105           */
6099 6106          error = nfs3_commit(vp, offset, len, cr);
6100 6107  
6101 6108          /*
6102 6109           * If we got an error during the commit, just unlock all
6103 6110           * of the pages.  The pages will get retransmitted to the
6104 6111           * server during a putpage operation.
6105 6112           */
6106 6113          if (error) {
6107 6114                  while (plist != NULL) {
6108 6115                          pptr = plist;
6109 6116                          page_sub(&plist, pptr);
6110 6117                          page_unlock(pptr);
6111 6118                  }
6112 6119                  return;
6113 6120          }
6114 6121  
6115 6122          /*
6116 6123           * We've tried as hard as we can to commit the data to stable
6117 6124           * storage on the server.  We release the rest of the pages
6118 6125           * and clear the commit required state.  They will be put
6119 6126           * onto the tail of the cachelist if they are nolonger
6120 6127           * mapped.
6121 6128           */
6122 6129          while (plist != pp) {
6123 6130                  pptr = plist;
6124 6131                  page_sub(&plist, pptr);
6125 6132                  pptr->p_fsdata = C_NOCOMMIT;
6126 6133                  (void) page_release(pptr, 1);
6127 6134          }
6128 6135  
6129 6136          /*
6130 6137           * It is possible that nfs3_commit didn't return error but
6131 6138           * some other thread has modified the page we are going
6132 6139           * to free/destroy.
6133 6140           *    In this case we need to rewrite the page. Do an explicit check
6134 6141           * before attempting to free/destroy the page. If modified, needs to
6135 6142           * be rewritten so unlock the page and return.
6136 6143           */
6137 6144          if (hat_ismod(pp)) {
6138 6145                  pp->p_fsdata = C_NOCOMMIT;
6139 6146                  page_unlock(pp);
6140 6147                  return;
6141 6148          }
6142 6149  
6143 6150          /*
6144 6151           * Now, as appropriate, either free or destroy the page
6145 6152           * that we were called with.
6146 6153           */
6147 6154          pp->p_fsdata = C_NOCOMMIT;
6148 6155          if (fl == B_FREE)
6149 6156                  page_free(pp, dn);
6150 6157          else
6151 6158                  page_destroy(pp, dn);
6152 6159  }
6153 6160  
6154 6161  static int
6155 6162  nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
6156 6163  {
6157 6164          int error;
6158 6165          rnode_t *rp;
6159 6166          COMMIT3args args;
6160 6167          COMMIT3res res;
6161 6168          int douprintf;
6162 6169          cred_t *cred;
6163 6170  
6164 6171          rp = VTOR(vp);
6165 6172          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6166 6173  
6167 6174          mutex_enter(&rp->r_statelock);
6168 6175          if (rp->r_cred != NULL) {
6169 6176                  cred = rp->r_cred;
6170 6177                  crhold(cred);
6171 6178          } else {
6172 6179                  rp->r_cred = cr;
6173 6180                  crhold(cr);
6174 6181                  cred = cr;
6175 6182                  crhold(cred);
6176 6183          }
6177 6184          mutex_exit(&rp->r_statelock);
6178 6185  
6179 6186          args.file = *VTOFH3(vp);
6180 6187          args.offset = offset;
6181 6188          args.count = count;
6182 6189  
6183 6190  doitagain:
6184 6191          douprintf = 1;
6185 6192          error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
6186 6193              xdr_COMMIT3args, (caddr_t)&args,
6187 6194              xdr_COMMIT3res, (caddr_t)&res, cred,
6188 6195              &douprintf, &res.status, 0, NULL);
6189 6196  
6190 6197          crfree(cred);
6191 6198  
6192 6199          if (error)
6193 6200                  return (error);
6194 6201  
6195 6202          error = geterrno3(res.status);
6196 6203          if (!error) {
6197 6204                  ASSERT(rp->r_flags & RHAVEVERF);
6198 6205                  mutex_enter(&rp->r_statelock);
6199 6206                  if (rp->r_verf == res.resok.verf) {
6200 6207                          mutex_exit(&rp->r_statelock);
6201 6208                          return (0);
6202 6209                  }
6203 6210                  nfs3_set_mod(vp);
6204 6211                  rp->r_verf = res.resok.verf;
6205 6212                  mutex_exit(&rp->r_statelock);
6206 6213                  error = NFS_VERF_MISMATCH;
6207 6214          } else {
6208 6215                  if (error == EACCES) {
6209 6216                          mutex_enter(&rp->r_statelock);
6210 6217                          if (cred != cr) {
6211 6218                                  if (rp->r_cred != NULL)
6212 6219                                          crfree(rp->r_cred);
6213 6220                                  rp->r_cred = cr;
6214 6221                                  crhold(cr);
6215 6222                                  cred = cr;
6216 6223                                  crhold(cred);
6217 6224                                  mutex_exit(&rp->r_statelock);
6218 6225                                  goto doitagain;
6219 6226                          }
6220 6227                          mutex_exit(&rp->r_statelock);
6221 6228                  }
6222 6229                  /*
6223 6230                   * Can't do a PURGE_STALE_FH here because this
6224 6231                   * can cause a deadlock.  nfs3_commit can
6225 6232                   * be called from nfs3_dispose which can be called
6226 6233                   * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
6227 6234                   * can call back to pvn_vplist_dirty.
6228 6235                   */
6229 6236                  if (error == ESTALE) {
6230 6237                          mutex_enter(&rp->r_statelock);
6231 6238                          rp->r_flags |= RSTALE;
6232 6239                          if (!rp->r_error)
6233 6240                                  rp->r_error = error;
6234 6241                          mutex_exit(&rp->r_statelock);
6235 6242                          PURGE_ATTRCACHE(vp);
6236 6243                  } else {
6237 6244                          mutex_enter(&rp->r_statelock);
6238 6245                          if (!rp->r_error)
6239 6246                                  rp->r_error = error;
6240 6247                          mutex_exit(&rp->r_statelock);
6241 6248                  }
6242 6249          }
6243 6250  
6244 6251          return (error);
6245 6252  }
6246 6253  
6247 6254  static void
6248 6255  nfs3_set_mod(vnode_t *vp)
6249 6256  {
6250 6257          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6251 6258  
6252 6259          pvn_vplist_setdirty(vp, nfs_setmod_check);
6253 6260  }
6254 6261  
6255 6262  /*
6256 6263   * This routine is used to gather together a page list of the pages
6257 6264   * which are to be committed on the server.  This routine must not
6258 6265   * be called if the calling thread holds any locked pages.
6259 6266   *
6260 6267   * The calling thread must have set RCOMMIT.  This bit is used to
6261 6268   * serialize access to the commit structure in the rnode.  As long
6262 6269   * as the thread has set RCOMMIT, then it can manipulate the commit
6263 6270   * structure without requiring any other locks.
6264 6271   */
6265 6272  static void
6266 6273  nfs3_get_commit(vnode_t *vp)
6267 6274  {
6268 6275          rnode_t *rp;
6269 6276          page_t *pp;
6270 6277          kmutex_t *vphm;
6271 6278  
6272 6279          rp = VTOR(vp);
6273 6280  
6274 6281          ASSERT(rp->r_flags & RCOMMIT);
6275 6282  
6276 6283          vphm = page_vnode_mutex(vp);
6277 6284          mutex_enter(vphm);
6278 6285  
6279 6286          /*
6280 6287           * If there are no pages associated with this vnode, then
6281 6288           * just return.
6282 6289           */
6283 6290          if ((pp = vp->v_pages) == NULL) {
6284 6291                  mutex_exit(vphm);
6285 6292                  return;
6286 6293          }
6287 6294  
6288 6295          /*
6289 6296           * Step through all of the pages associated with this vnode
6290 6297           * looking for pages which need to be committed.
6291 6298           */
6292 6299          do {
6293 6300                  /* Skip marker pages. */
6294 6301                  if (pp->p_hash == PVN_VPLIST_HASH_TAG)
6295 6302                          continue;
6296 6303  
6297 6304                  /*
6298 6305                   * If this page does not need to be committed or is
6299 6306                   * modified, then just skip it.
6300 6307                   */
6301 6308                  if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6302 6309                          continue;
6303 6310  
6304 6311                  /*
6305 6312                   * Attempt to lock the page.  If we can't, then
6306 6313                   * someone else is messing with it and we will
6307 6314                   * just skip it.
6308 6315                   */
6309 6316                  if (!page_trylock(pp, SE_EXCL))
6310 6317                          continue;
6311 6318  
6312 6319                  /*
6313 6320                   * If this page does not need to be committed or is
6314 6321                   * modified, then just skip it.  Recheck now that
6315 6322                   * the page is locked.
6316 6323                   */
6317 6324                  if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6318 6325                          page_unlock(pp);
6319 6326                          continue;
6320 6327                  }
6321 6328  
6322 6329                  if (PP_ISFREE(pp)) {
6323 6330                          cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6324 6331                              (void *)pp);
6325 6332                  }
6326 6333  
6327 6334                  /*
6328 6335                   * The page needs to be committed and we locked it.
6329 6336                   * Update the base and length parameters and add it
6330 6337                   * to r_pages.
6331 6338                   */
6332 6339                  if (rp->r_commit.c_pages == NULL) {
6333 6340                          rp->r_commit.c_commbase = (offset3)pp->p_offset;
6334 6341                          rp->r_commit.c_commlen = PAGESIZE;
6335 6342                  } else if (pp->p_offset < rp->r_commit.c_commbase) {
6336 6343                          rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6337 6344                              (offset3)pp->p_offset + rp->r_commit.c_commlen;
6338 6345                          rp->r_commit.c_commbase = (offset3)pp->p_offset;
6339 6346                  } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6340 6347                      <= pp->p_offset) {
6341 6348                          rp->r_commit.c_commlen = (offset3)pp->p_offset -
6342 6349                              rp->r_commit.c_commbase + PAGESIZE;
6343 6350                  }
6344 6351                  page_add(&rp->r_commit.c_pages, pp);
6345 6352          } while ((pp = pp->p_vpnext) != vp->v_pages);
6346 6353  
6347 6354          mutex_exit(vphm);
6348 6355  }
6349 6356  
6350 6357  /*
6351 6358   * This routine is used to gather together a page list of the pages
6352 6359   * which are to be committed on the server.  This routine must not
6353 6360   * be called if the calling thread holds any locked pages.
6354 6361   *
6355 6362   * The calling thread must have set RCOMMIT.  This bit is used to
6356 6363   * serialize access to the commit structure in the rnode.  As long
6357 6364   * as the thread has set RCOMMIT, then it can manipulate the commit
6358 6365   * structure without requiring any other locks.
6359 6366   */
6360 6367  static void
6361 6368  nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
6362 6369  {
6363 6370  
6364 6371          rnode_t *rp;
6365 6372          page_t *pp;
6366 6373          u_offset_t end;
6367 6374          u_offset_t off;
6368 6375  
6369 6376          ASSERT(len != 0);
6370 6377  
6371 6378          rp = VTOR(vp);
6372 6379  
6373 6380          ASSERT(rp->r_flags & RCOMMIT);
6374 6381          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6375 6382  
6376 6383          /*
6377 6384           * If there are no pages associated with this vnode, then
6378 6385           * just return.
6379 6386           */
6380 6387          if ((pp = vp->v_pages) == NULL)
6381 6388                  return;
6382 6389  
6383 6390          /*
6384 6391           * Calculate the ending offset.
6385 6392           */
6386 6393          end = soff + len;
6387 6394  
6388 6395          for (off = soff; off < end; off += PAGESIZE) {
6389 6396                  /*
6390 6397                   * Lookup each page by vp, offset.
6391 6398                   */
6392 6399                  if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
6393 6400                          continue;
6394 6401  
6395 6402                  /*
6396 6403                   * If this page does not need to be committed or is
6397 6404                   * modified, then just skip it.
6398 6405                   */
6399 6406                  if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6400 6407                          page_unlock(pp);
6401 6408                          continue;
6402 6409                  }
6403 6410  
6404 6411                  ASSERT(PP_ISFREE(pp) == 0);
6405 6412  
6406 6413                  /*
6407 6414                   * The page needs to be committed and we locked it.
6408 6415                   * Update the base and length parameters and add it
6409 6416                   * to r_pages.
6410 6417                   */
6411 6418                  if (rp->r_commit.c_pages == NULL) {
6412 6419                          rp->r_commit.c_commbase = (offset3)pp->p_offset;
6413 6420                          rp->r_commit.c_commlen = PAGESIZE;
6414 6421                  } else {
6415 6422                          rp->r_commit.c_commlen = (offset3)pp->p_offset -
6416 6423                              rp->r_commit.c_commbase + PAGESIZE;
6417 6424                  }
6418 6425                  page_add(&rp->r_commit.c_pages, pp);
6419 6426          }
6420 6427  }
6421 6428  
6422 6429  static int
6423 6430  nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6424 6431  {
6425 6432          int error;
6426 6433          writeverf3 write_verf;
6427 6434          rnode_t *rp = VTOR(vp);
6428 6435  
6429 6436          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6430 6437          /*
6431 6438           * Flush the data portion of the file and then commit any
6432 6439           * portions which need to be committed.  This may need to
6433 6440           * be done twice if the server has changed state since
6434 6441           * data was last written.  The data will need to be
6435 6442           * rewritten to the server and then a new commit done.
6436 6443           *
6437 6444           * In fact, this may need to be done several times if the
6438 6445           * server is having problems and crashing while we are
6439 6446           * attempting to do this.
6440 6447           */
6441 6448  
6442 6449  top:
6443 6450          /*
6444 6451           * Do a flush based on the poff and plen arguments.  This
6445 6452           * will asynchronously write out any modified pages in the
6446 6453           * range specified by (poff, plen).  This starts all of the
6447 6454           * i/o operations which will be waited for in the next
6448 6455           * call to nfs3_putpage
6449 6456           */
6450 6457  
6451 6458          mutex_enter(&rp->r_statelock);
6452 6459          write_verf = rp->r_verf;
6453 6460          mutex_exit(&rp->r_statelock);
6454 6461  
6455 6462          error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6456 6463          if (error == EAGAIN)
6457 6464                  error = 0;
6458 6465  
6459 6466          /*
6460 6467           * Do a flush based on the poff and plen arguments.  This
6461 6468           * will synchronously write out any modified pages in the
6462 6469           * range specified by (poff, plen) and wait until all of
6463 6470           * the asynchronous i/o's in that range are done as well.
6464 6471           */
6465 6472          if (!error)
6466 6473                  error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6467 6474  
6468 6475          if (error)
6469 6476                  return (error);
6470 6477  
6471 6478          mutex_enter(&rp->r_statelock);
6472 6479          if (rp->r_verf != write_verf) {
6473 6480                  mutex_exit(&rp->r_statelock);
6474 6481                  goto top;
6475 6482          }
6476 6483          mutex_exit(&rp->r_statelock);
6477 6484  
6478 6485          /*
6479 6486           * Now commit any pages which might need to be committed.
6480 6487           * If the error, NFS_VERF_MISMATCH, is returned, then
6481 6488           * start over with the flush operation.
6482 6489           */
6483 6490  
6484 6491          error = nfs3_commit_vp(vp, poff, plen, cr);
6485 6492  
6486 6493          if (error == NFS_VERF_MISMATCH)
6487 6494                  goto top;
6488 6495  
6489 6496          return (error);
6490 6497  }
6491 6498  
6492 6499  static int
6493 6500  nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
6494 6501  {
6495 6502          rnode_t *rp;
6496 6503          page_t *plist;
6497 6504          offset3 offset;
6498 6505          count3 len;
6499 6506  
6500 6507  
6501 6508          rp = VTOR(vp);
6502 6509  
6503 6510          if (nfs_zone() != VTOMI(vp)->mi_zone)
6504 6511                  return (EIO);
6505 6512          /*
6506 6513           * Set the `commit inprogress' state bit.  We must
6507 6514           * first wait until any current one finishes.
6508 6515           */
6509 6516          mutex_enter(&rp->r_statelock);
6510 6517          while (rp->r_flags & RCOMMIT) {
6511 6518                  rp->r_flags |= RCOMMITWAIT;
6512 6519                  cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6513 6520                  rp->r_flags &= ~RCOMMITWAIT;
6514 6521          }
6515 6522          rp->r_flags |= RCOMMIT;
6516 6523          mutex_exit(&rp->r_statelock);
6517 6524  
6518 6525          /*
6519 6526           * Gather together all of the pages which need to be
6520 6527           * committed.
6521 6528           */
6522 6529          if (plen == 0)
6523 6530                  nfs3_get_commit(vp);
6524 6531          else
6525 6532                  nfs3_get_commit_range(vp, poff, plen);
6526 6533  
6527 6534          /*
6528 6535           * Clear the `commit inprogress' bit and disconnect the
6529 6536           * page list which was gathered together in nfs3_get_commit.
6530 6537           */
6531 6538          plist = rp->r_commit.c_pages;
6532 6539          rp->r_commit.c_pages = NULL;
6533 6540          offset = rp->r_commit.c_commbase;
6534 6541          len = rp->r_commit.c_commlen;
6535 6542          mutex_enter(&rp->r_statelock);
6536 6543          rp->r_flags &= ~RCOMMIT;
6537 6544          cv_broadcast(&rp->r_commit.c_cv);
6538 6545          mutex_exit(&rp->r_statelock);
6539 6546  
6540 6547          /*
6541 6548           * If any pages need to be committed, commit them and
6542 6549           * then unlock them so that they can be freed some
6543 6550           * time later.
6544 6551           */
6545 6552          if (plist != NULL) {
6546 6553                  /*
6547 6554                   * No error occurred during the flush portion
6548 6555                   * of this operation, so now attempt to commit
6549 6556                   * the data to stable storage on the server.
6550 6557                   *
6551 6558                   * This will unlock all of the pages on the list.
6552 6559                   */
6553 6560                  return (nfs3_sync_commit(vp, plist, offset, len, cr));
6554 6561          }
6555 6562          return (0);
6556 6563  }
6557 6564  
6558 6565  static int
6559 6566  nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6560 6567          cred_t *cr)
6561 6568  {
6562 6569          int error;
6563 6570          page_t *pp;
6564 6571  
6565 6572          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6566 6573          error = nfs3_commit(vp, offset, count, cr);
6567 6574  
6568 6575          /*
6569 6576           * If we got an error, then just unlock all of the pages
6570 6577           * on the list.
6571 6578           */
6572 6579          if (error) {
6573 6580                  while (plist != NULL) {
6574 6581                          pp = plist;
6575 6582                          page_sub(&plist, pp);
6576 6583                          page_unlock(pp);
6577 6584                  }
6578 6585                  return (error);
6579 6586          }
6580 6587          /*
6581 6588           * We've tried as hard as we can to commit the data to stable
6582 6589           * storage on the server.  We just unlock the pages and clear
6583 6590           * the commit required state.  They will get freed later.
6584 6591           */
6585 6592          while (plist != NULL) {
6586 6593                  pp = plist;
6587 6594                  page_sub(&plist, pp);
6588 6595                  pp->p_fsdata = C_NOCOMMIT;
6589 6596                  page_unlock(pp);
6590 6597          }
6591 6598  
6592 6599          return (error);
6593 6600  }
6594 6601  
6595 6602  static void
6596 6603  nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6597 6604          cred_t *cr)
6598 6605  {
6599 6606          ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6600 6607          (void) nfs3_sync_commit(vp, plist, offset, count, cr);
6601 6608  }
6602 6609  
6603 6610  /* ARGSUSED */
6604 6611  static int
6605 6612  nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6606 6613          caller_context_t *ct)
6607 6614  {
6608 6615          int error;
6609 6616          mntinfo_t *mi;
6610 6617  
6611 6618          mi = VTOMI(vp);
6612 6619  
6613 6620          if (nfs_zone() != mi->mi_zone)
6614 6621                  return (EIO);
6615 6622  
6616 6623          if (mi->mi_flags & MI_ACL) {
6617 6624                  error = acl_setacl3(vp, vsecattr, flag, cr);
6618 6625                  if (mi->mi_flags & MI_ACL)
6619 6626                          return (error);
6620 6627          }
6621 6628  
6622 6629          return (ENOSYS);
6623 6630  }
6624 6631  
6625 6632  /* ARGSUSED */
6626 6633  static int
6627 6634  nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6628 6635          caller_context_t *ct)
6629 6636  {
6630 6637          int error;
6631 6638          mntinfo_t *mi;
6632 6639  
6633 6640          mi = VTOMI(vp);
6634 6641  
6635 6642          if (nfs_zone() != mi->mi_zone)
6636 6643                  return (EIO);
6637 6644  
6638 6645          if (mi->mi_flags & MI_ACL) {
6639 6646                  error = acl_getacl3(vp, vsecattr, flag, cr);
6640 6647                  if (mi->mi_flags & MI_ACL)
6641 6648                          return (error);
6642 6649          }
6643 6650  
6644 6651          return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6645 6652  }
6646 6653  
6647 6654  /* ARGSUSED */
6648 6655  static int
6649 6656  nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6650 6657          caller_context_t *ct)
6651 6658  {
6652 6659          int error;
6653 6660          struct shrlock nshr;
6654 6661          struct nfs_owner nfs_owner;
6655 6662          netobj lm_fh3;
6656 6663  
6657 6664          if (nfs_zone() != VTOMI(vp)->mi_zone)
6658 6665                  return (EIO);
6659 6666  
6660 6667          /*
6661 6668           * check for valid cmd parameter
6662 6669           */
6663 6670          if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6664 6671                  return (EINVAL);
6665 6672  
6666 6673          /*
6667 6674           * Check access permissions
6668 6675           */
6669 6676          if (cmd == F_SHARE &&
6670 6677              (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6671 6678              ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6672 6679                  return (EBADF);
6673 6680  
6674 6681          /*
6675 6682           * If the filesystem is mounted using local locking, pass the
6676 6683           * request off to the local share code.
6677 6684           */
6678 6685          if (VTOMI(vp)->mi_flags & MI_LLOCK)
6679 6686                  return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6680 6687  
6681 6688          switch (cmd) {
6682 6689          case F_SHARE:
6683 6690          case F_UNSHARE:
6684 6691                  lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6685 6692                  lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6686 6693  
6687 6694                  /*
6688 6695                   * If passed an owner that is too large to fit in an
6689 6696                   * nfs_owner it is likely a recursive call from the
6690 6697                   * lock manager client and pass it straight through.  If
6691 6698                   * it is not a nfs_owner then simply return an error.
6692 6699                   */
6693 6700                  if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6694 6701                          if (((struct nfs_owner *)shr->s_owner)->magic !=
6695 6702                              NFS_OWNER_MAGIC)
6696 6703                                  return (EINVAL);
6697 6704  
6698 6705                          if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6699 6706                                  error = set_errno(error);
6700 6707                          }
6701 6708                          return (error);
6702 6709                  }
6703 6710                  /*
6704 6711                   * Remote share reservations owner is a combination of
6705 6712                   * a magic number, hostname, and the local owner
6706 6713                   */
6707 6714                  bzero(&nfs_owner, sizeof (nfs_owner));
6708 6715                  nfs_owner.magic = NFS_OWNER_MAGIC;
6709 6716                  (void) strncpy(nfs_owner.hname, uts_nodename(),
6710 6717                      sizeof (nfs_owner.hname));
6711 6718                  bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6712 6719                  nshr.s_access = shr->s_access;
6713 6720                  nshr.s_deny = shr->s_deny;
6714 6721                  nshr.s_sysid = 0;
6715 6722                  nshr.s_pid = ttoproc(curthread)->p_pid;
6716 6723                  nshr.s_own_len = sizeof (nfs_owner);
6717 6724                  nshr.s_owner = (caddr_t)&nfs_owner;
6718 6725  
6719 6726                  if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6720 6727                          error = set_errno(error);
6721 6728                  }
6722 6729  
6723 6730                  break;
6724 6731  
6725 6732          case F_HASREMOTELOCKS:
6726 6733                  /*
6727 6734                   * NFS client can't store remote locks itself
6728 6735                   */
6729 6736                  shr->s_access = 0;
6730 6737                  error = 0;
6731 6738                  break;
6732 6739  
6733 6740          default:
6734 6741                  error = EINVAL;
6735 6742                  break;
6736 6743          }
6737 6744  
6738 6745          return (error);
6739 6746  }

↓ open down ↓

4415 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX