1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  28  *      All rights reserved.
  29  */
  30 
  31 /*
  32  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  33  */
  34 
  35 #include <sys/param.h>
  36 #include <sys/types.h>
  37 #include <sys/systm.h>
  38 #include <sys/cred.h>
  39 #include <sys/time.h>
  40 #include <sys/vnode.h>
  41 #include <sys/vfs.h>
  42 #include <sys/vfs_opreg.h>
  43 #include <sys/file.h>
  44 #include <sys/filio.h>
  45 #include <sys/uio.h>
  46 #include <sys/buf.h>
  47 #include <sys/mman.h>
  48 #include <sys/pathname.h>
  49 #include <sys/dirent.h>
  50 #include <sys/debug.h>
  51 #include <sys/vmsystm.h>
  52 #include <sys/fcntl.h>
  53 #include <sys/flock.h>
  54 #include <sys/swap.h>
  55 #include <sys/errno.h>
  56 #include <sys/strsubr.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/kmem.h>
  59 #include <sys/cmn_err.h>
  60 #include <sys/pathconf.h>
  61 #include <sys/utsname.h>
  62 #include <sys/dnlc.h>
  63 #include <sys/acl.h>
  64 #include <sys/systeminfo.h>
  65 #include <sys/atomic.h>
  66 #include <sys/policy.h>
  67 #include <sys/sdt.h>
  68 #include <sys/zone.h>
  69 
  70 #include <rpc/types.h>
  71 #include <rpc/auth.h>
  72 #include <rpc/clnt.h>
  73 #include <rpc/rpc_rdma.h>
  74 
  75 #include <nfs/nfs.h>
  76 #include <nfs/nfs_clnt.h>
  77 #include <nfs/rnode.h>
  78 #include <nfs/nfs_acl.h>
  79 #include <nfs/lm.h>
  80 
  81 #include <vm/hat.h>
  82 #include <vm/as.h>
  83 #include <vm/page.h>
  84 #include <vm/pvn.h>
  85 #include <vm/seg.h>
  86 #include <vm/seg_map.h>
  87 #include <vm/seg_kpm.h>
  88 #include <vm/seg_vn.h>
  89 
  90 #include <fs/fs_subr.h>
  91 
  92 #include <sys/ddi.h>
  93 
/*
 * Forward declarations for the file-local (static) helper routines used
 * by the vnode operations in this file.  The vnode operation entry points
 * themselves are declared separately, further down.
 */
static int      nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
                        cred_t *);
static int      nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
                        stable_how *);
static int      nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
static int      nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
static int      nfs3_accessx(void *, int, cred_t *);
static int      nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int      nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int      nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
                        int, vnode_t **, cred_t *, int);
static int      nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
static int      nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
                        int, vnode_t **, cred_t *);
static int      nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
                        caller_context_t *);
static int      do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void     nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void     nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
static int      nfs3_bio(struct buf *, stable_how *, cred_t *);
static int      nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
                        page_t *[], size_t, struct seg *, caddr_t,
                        enum seg_rw, cred_t *);
static void     nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
                        cred_t *);
static int      nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
                        int, cred_t *);
static int      nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
                        int, cred_t *);
static int      nfs3_commit(vnode_t *, offset3, count3, cred_t *);
static void     nfs3_set_mod(vnode_t *);
static void     nfs3_get_commit(vnode_t *);
static void     nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
static int      nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int      nfs3_commit_vp(vnode_t *, u_offset_t, size_t,  cred_t *);
static int      nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
                        cred_t *);
static void     nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
                        cred_t *);
static void     nfs3_delmap_callback(struct as *, void *, uint_t);
 134 
/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 *
 * Both values are negative.  NOTE(review): presumably that keeps them
 * distinct from the ordinary errno values returned along the same paths;
 * the users of these flags are outside this part of the file -- confirm.
 */
#define NFS_EOF                 -98     /* presumably: end of file -- confirm */
#define NFS_VERF_MISMATCH       -97     /* presumably: verifier changed -- confirm */
 141 
 142 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
 143 #define ALIGN64(x, ptr, sz)                                             \
 144         x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);           \
 145         if (x) {                                                        \
 146                 x = sizeof (uint64_t) - (x);                            \
 147                 sz -= (x);                                              \
 148                 ptr += (x);                                             \
 149         }
 150 
 151 /*
 152  * These are the vnode ops routines which implement the vnode interface to
 153  * the networked file system.  These routines just take their parameters,
 154  * make them look networkish by putting the right info into interface structs,
 155  * and then calling the appropriate remote routine(s) to do the work.
 156  *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 158  * we purge the directory cache relative to that vnode.  This way, the
 159  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 160  * more details on rnode locking.
 161  */
 162 
/*
 * Forward declarations of the vnode operation entry points.  Each routine
 * declared here is installed in nfs3_vnodeops_template below.
 */
static int      nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
static int      nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
                        caller_context_t *);
static int      nfs3_read(vnode_t *, struct uio *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_write(vnode_t *, struct uio *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
                        caller_context_t *);
static int      nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int      nfs3_readlink(vnode_t *, struct uio *, cred_t *,
                        caller_context_t *);
static int      nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static void     nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
static int      nfs3_lookup(vnode_t *, char *, vnode_t **,
                        struct pathname *, int, vnode_t *, cred_t *,
                        caller_context_t *, int *, pathname_t *);
static int      nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
                        int, vnode_t **, cred_t *, int, caller_context_t *,
                        vsecattr_t *);
static int      nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
                        int);
static int      nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
                        caller_context_t *, int);
static int      nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
                        caller_context_t *, int);
static int      nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
                        cred_t *, caller_context_t *, int, vsecattr_t *);
static int      nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
                        caller_context_t *, int);
static int      nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
                        cred_t *, caller_context_t *, int);
static int      nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
                        caller_context_t *, int);
static int      nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
static int      nfs3_rwlock(vnode_t *, int, caller_context_t *);
static void     nfs3_rwunlock(vnode_t *, int, caller_context_t *);
static int      nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int      nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
                        page_t *[], size_t, struct seg *, caddr_t,
                        enum seg_rw, cred_t *, caller_context_t *);
static int      nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
                        caller_context_t *);
static int      nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
                        uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int      nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
                        uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int      nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
                        struct flk_callback *, cred_t *, caller_context_t *);
static int      nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
                        cred_t *, caller_context_t *);
static int      nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
static int      nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
                        uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int      nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
                        caller_context_t *);
static int      nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
                        cred_t *, caller_context_t *);
static void     nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
                        caller_context_t *);
static int      nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
                        caller_context_t *);
static int      nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
                        caller_context_t *);
 233 
/*
 * The NFS version 3 vnode operations vector, handed out by
 * nfs3_getvnodeops().  It is not assigned in this part of the file.
 * NOTE(review): presumably it is built from nfs3_vnodeops_template when
 * the file system is initialized -- confirm.
 */
struct vnodeops *nfs3_vnodeops;
 235 
/*
 * Table pairing each vnode operation name (VOPNAME_*) with the routine
 * that implements it for NFS version 3.  Two entries use routines that
 * are not nfs3-specific: VOPNAME_DUMP uses nfs_dump and VOPNAME_VNEVENT
 * uses fs_vnevent_support.  The table ends with a NULL, NULL pair.
 */
const fs_operation_def_t nfs3_vnodeops_template[] = {
        VOPNAME_OPEN,           { .vop_open = nfs3_open },
        VOPNAME_CLOSE,          { .vop_close = nfs3_close },
        VOPNAME_READ,           { .vop_read = nfs3_read },
        VOPNAME_WRITE,          { .vop_write = nfs3_write },
        VOPNAME_IOCTL,          { .vop_ioctl = nfs3_ioctl },
        VOPNAME_GETATTR,        { .vop_getattr = nfs3_getattr },
        VOPNAME_SETATTR,        { .vop_setattr = nfs3_setattr },
        VOPNAME_ACCESS,         { .vop_access = nfs3_access },
        VOPNAME_LOOKUP,         { .vop_lookup = nfs3_lookup },
        VOPNAME_CREATE,         { .vop_create = nfs3_create },
        VOPNAME_REMOVE,         { .vop_remove = nfs3_remove },
        VOPNAME_LINK,           { .vop_link = nfs3_link },
        VOPNAME_RENAME,         { .vop_rename = nfs3_rename },
        VOPNAME_MKDIR,          { .vop_mkdir = nfs3_mkdir },
        VOPNAME_RMDIR,          { .vop_rmdir = nfs3_rmdir },
        VOPNAME_READDIR,        { .vop_readdir = nfs3_readdir },
        VOPNAME_SYMLINK,        { .vop_symlink = nfs3_symlink },
        VOPNAME_READLINK,       { .vop_readlink = nfs3_readlink },
        VOPNAME_FSYNC,          { .vop_fsync = nfs3_fsync },
        VOPNAME_INACTIVE,       { .vop_inactive = nfs3_inactive },
        VOPNAME_FID,            { .vop_fid = nfs3_fid },
        VOPNAME_RWLOCK,         { .vop_rwlock = nfs3_rwlock },
        VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs3_rwunlock },
        VOPNAME_SEEK,           { .vop_seek = nfs3_seek },
        VOPNAME_FRLOCK,         { .vop_frlock = nfs3_frlock },
        VOPNAME_SPACE,          { .vop_space = nfs3_space },
        VOPNAME_REALVP,         { .vop_realvp = nfs3_realvp },
        VOPNAME_GETPAGE,        { .vop_getpage = nfs3_getpage },
        VOPNAME_PUTPAGE,        { .vop_putpage = nfs3_putpage },
        VOPNAME_MAP,            { .vop_map = nfs3_map },
        VOPNAME_ADDMAP,         { .vop_addmap = nfs3_addmap },
        VOPNAME_DELMAP,         { .vop_delmap = nfs3_delmap },
        /* no separate nfs3_dump */
        VOPNAME_DUMP,           { .vop_dump = nfs_dump },
        VOPNAME_PATHCONF,       { .vop_pathconf = nfs3_pathconf },
        VOPNAME_PAGEIO,         { .vop_pageio = nfs3_pageio },
        VOPNAME_DISPOSE,        { .vop_dispose = nfs3_dispose },
        VOPNAME_SETSECATTR,     { .vop_setsecattr = nfs3_setsecattr },
        VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs3_getsecattr },
        VOPNAME_SHRLOCK,        { .vop_shrlock = nfs3_shrlock },
        /* vnode events are handled by the generic fs_vnevent_support */
        VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
        /* end-of-table marker */
        NULL,                   NULL
};
 280 
 281 /*
 282  * XXX:  This is referenced in modstubs.s
 283  */
 284 struct vnodeops *
 285 nfs3_getvnodeops(void)
 286 {
 287         return (nfs3_vnodeops);
 288 }
 289 
 290 /* ARGSUSED */
 291 static int
 292 nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 293 {
 294         int error;
 295         struct vattr va;
 296         rnode_t *rp;
 297         vnode_t *vp;
 298 
 299         vp = *vpp;
 300         if (nfs_zone() != VTOMI(vp)->mi_zone)
 301                 return (EIO);
 302         rp = VTOR(vp);
 303         mutex_enter(&rp->r_statelock);
 304         if (rp->r_cred == NULL) {
 305                 crhold(cr);
 306                 rp->r_cred = cr;
 307         }
 308         mutex_exit(&rp->r_statelock);
 309 
 310         /*
 311          * If there is no cached data or if close-to-open
 312          * consistency checking is turned off, we can avoid
 313          * the over the wire getattr.  Otherwise, if the
 314          * file system is mounted readonly, then just verify
 315          * the caches are up to date using the normal mechanism.
 316          * Else, if the file is not mmap'd, then just mark
 317          * the attributes as timed out.  They will be refreshed
 318          * and the caches validated prior to being used.
 319          * Else, the file system is mounted writeable so
 320          * force an over the wire GETATTR in order to ensure
 321          * that all cached data is valid.
 322          */
 323         if (vp->v_count > 1 ||
 324             ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
 325             !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
 326                 if (vn_is_readonly(vp))
 327                         error = nfs3_validate_caches(vp, cr);
 328                 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
 329                         PURGE_ATTRCACHE(vp);
 330                         error = 0;
 331                 } else {
 332                         va.va_mask = AT_ALL;
 333                         error = nfs3_getattr_otw(vp, &va, cr);
 334                 }
 335         } else
 336                 error = 0;
 337 
 338         return (error);
 339 }
 340 
 341 /* ARGSUSED */
 342 static int
 343 nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 344                 caller_context_t *ct)
 345 {
 346         rnode_t *rp;
 347         int error;
 348         struct vattr va;
 349 
 350         /*
 351          * zone_enter(2) prevents processes from changing zones with NFS files
 352          * open; if we happen to get here from the wrong zone we can't do
 353          * anything over the wire.
 354          */
 355         if (VTOMI(vp)->mi_zone != nfs_zone()) {
 356                 /*
 357                  * We could attempt to clean up locks, except we're sure
 358                  * that the current process didn't acquire any locks on
 359                  * the file: any attempt to lock a file belong to another zone
 360                  * will fail, and one can't lock an NFS file and then change
 361                  * zones, as that fails too.
 362                  *
 363                  * Returning an error here is the sane thing to do.  A
 364                  * subsequent call to VN_RELE() which translates to a
 365                  * nfs3_inactive() will clean up state: if the zone of the
 366                  * vnode's origin is still alive and kicking, an async worker
 367                  * thread will handle the request (from the correct zone), and
 368                  * everything (minus the commit and final nfs3_getattr_otw()
 369                  * call) should be OK. If the zone is going away
 370                  * nfs_async_inactive() will throw away cached pages inline.
 371                  */
 372                 return (EIO);
 373         }
 374 
 375         /*
 376          * If we are using local locking for this filesystem, then
 377          * release all of the SYSV style record locks.  Otherwise,
 378          * we are doing network locking and we need to release all
 379          * of the network locks.  All of the locks held by this
 380          * process on this file are released no matter what the
 381          * incoming reference count is.
 382          */
 383         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
 384                 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 385                 cleanshares(vp, ttoproc(curthread)->p_pid);
 386         } else
 387                 nfs_lockrelease(vp, flag, offset, cr);
 388 
 389         if (count > 1)
 390                 return (0);
 391 
 392         /*
 393          * If the file has been `unlinked', then purge the
 394          * DNLC so that this vnode will get reycled quicker
 395          * and the .nfs* file on the server will get removed.
 396          */
 397         rp = VTOR(vp);
 398         if (rp->r_unldvp != NULL)
 399                 dnlc_purge_vp(vp);
 400 
 401         /*
 402          * If the file was open for write and there are pages,
 403          * then if the file system was mounted using the "no-close-
 404          *      to-open" semantics, then start an asynchronous flush
 405          *      of the all of the pages in the file.
 406          * else the file system was not mounted using the "no-close-
 407          *      to-open" semantics, then do a synchronous flush and
 408          *      commit of all of the dirty and uncommitted pages.
 409          *
 410          * The asynchronous flush of the pages in the "nocto" path
 411          * mostly just associates a cred pointer with the rnode so
 412          * writes which happen later will have a better chance of
 413          * working.  It also starts the data being written to the
 414          * server, but without unnecessarily delaying the application.
 415          */
 416         if ((flag & FWRITE) && vn_has_cached_data(vp)) {
 417                 if (VTOMI(vp)->mi_flags & MI_NOCTO) {
 418                         error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC,
 419                             cr, ct);
 420                         if (error == EAGAIN)
 421                                 error = 0;
 422                 } else
 423                         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
 424                 if (!error) {
 425                         mutex_enter(&rp->r_statelock);
 426                         error = rp->r_error;
 427                         rp->r_error = 0;
 428                         mutex_exit(&rp->r_statelock);
 429                 }
 430         } else {
 431                 mutex_enter(&rp->r_statelock);
 432                 error = rp->r_error;
 433                 rp->r_error = 0;
 434                 mutex_exit(&rp->r_statelock);
 435         }
 436 
 437         /*
 438          * If RWRITEATTR is set, then issue an over the wire GETATTR to
 439          * refresh the attribute cache with a set of attributes which
 440          * weren't returned from a WRITE.  This will enable the close-
 441          * to-open processing to work.
 442          */
 443         if (rp->r_flags & RWRITEATTR)
 444                 (void) nfs3_getattr_otw(vp, &va, cr);
 445 
 446         return (error);
 447 }
 448 
 449 /* ARGSUSED */
 450 static int
 451 nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
 452 {
 453         mntinfo_t *mi;
 454         READ3args args;
 455         READ3uiores res;
 456         int tsize;
 457         offset_t offset;
 458         ssize_t count;
 459         int error;
 460         int douprintf;
 461         failinfo_t fi;
 462         char *sv_hostname;
 463 
 464         mi = VTOMI(vp);
 465         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 466         sv_hostname = VTOR(vp)->r_server->sv_hostname;
 467 
 468         douprintf = 1;
 469         args.file = *VTOFH3(vp);
 470         fi.vp = vp;
 471         fi.fhp = (caddr_t)&args.file;
 472         fi.copyproc = nfs3copyfh;
 473         fi.lookupproc = nfs3lookup;
 474         fi.xattrdirproc = acl_getxattrdir3;
 475 
 476         res.uiop = uiop;
 477 
 478         res.wlist = NULL;
 479 
 480         offset = uiop->uio_loffset;
 481         count = uiop->uio_resid;
 482 
 483         do {
 484                 if (mi->mi_io_kstats) {
 485                         mutex_enter(&mi->mi_lock);
 486                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 487                         mutex_exit(&mi->mi_lock);
 488                 }
 489 
 490                 do {
 491                         tsize = MIN(mi->mi_tsize, count);
 492                         args.offset = (offset3)offset;
 493                         args.count = (count3)tsize;
 494                         res.size = (uint_t)tsize;
 495                         args.res_uiop = uiop;
 496                         args.res_data_val_alt = NULL;
 497 
 498                         error = rfs3call(mi, NFSPROC3_READ,
 499                             xdr_READ3args, (caddr_t)&args,
 500                             xdr_READ3uiores, (caddr_t)&res, cr,
 501                             &douprintf, &res.status, 0, &fi);
 502                 } while (error == ENFS_TRYAGAIN);
 503 
 504                 if (mi->mi_io_kstats) {
 505                         mutex_enter(&mi->mi_lock);
 506                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 507                         mutex_exit(&mi->mi_lock);
 508                 }
 509 
 510                 if (error)
 511                         return (error);
 512 
 513                 error = geterrno3(res.status);
 514                 if (error)
 515                         return (error);
 516 
 517                 if (res.count != res.size) {
 518                         zcmn_err(getzoneid(), CE_WARN,
 519 "nfs3_directio_read: server %s returned incorrect amount",
 520                             sv_hostname);
 521                         return (EIO);
 522                 }
 523                 count -= res.count;
 524                 offset += res.count;
 525                 if (mi->mi_io_kstats) {
 526                         mutex_enter(&mi->mi_lock);
 527                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
 528                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
 529                         mutex_exit(&mi->mi_lock);
 530                 }
 531                 lwp_stat_update(LWP_STAT_INBLK, 1);
 532         } while (count && !res.eof);
 533 
 534         return (0);
 535 }
 536 
 537 /* ARGSUSED */
 538 static int
 539 nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 540         caller_context_t *ct)
 541 {
 542         rnode_t *rp;
 543         u_offset_t off;
 544         offset_t diff;
 545         int on;
 546         size_t n;
 547         caddr_t base;
 548         uint_t flags;
 549         int error = 0;
 550         mntinfo_t *mi;
 551 
 552         rp = VTOR(vp);
 553         mi = VTOMI(vp);
 554 
 555         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
 556 
 557         if (nfs_zone() != mi->mi_zone)
 558                 return (EIO);
 559 
 560         if (vp->v_type != VREG)
 561                 return (EISDIR);
 562 
 563         if (uiop->uio_resid == 0)
 564                 return (0);
 565 
 566         if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
 567                 return (EINVAL);
 568 
 569         /*
 570          * Bypass VM if caching has been disabled (e.g., locking) or if
 571          * using client-side direct I/O and the file is not mmap'd and
 572          * there are no cached pages.
 573          */
 574         if ((vp->v_flag & VNOCACHE) ||
 575             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 576             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 577             !vn_has_cached_data(vp))) {
 578                 return (nfs3_directio_read(vp, uiop, cr));
 579         }
 580 
 581         do {
 582                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 583                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 584                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 585 
 586                 error = nfs3_validate_caches(vp, cr);
 587                 if (error)
 588                         break;
 589 
 590                 mutex_enter(&rp->r_statelock);
 591                 while (rp->r_flags & RINCACHEPURGE) {
 592                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 593                                 mutex_exit(&rp->r_statelock);
 594                                 return (EINTR);
 595                         }
 596                 }
 597                 diff = rp->r_size - uiop->uio_loffset;
 598                 mutex_exit(&rp->r_statelock);
 599                 if (diff <= 0)
 600                         break;
 601                 if (diff < n)
 602                         n = (size_t)diff;
 603 
 604                 if (vpm_enable) {
 605                         /*
 606                          * Copy data.
 607                          */
 608                         error = vpm_data_copy(vp, off + on, n, uiop,
 609                             1, NULL, 0, S_READ);
 610                 } else {
 611                         base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
 612                             S_READ);
 613 
 614                         error = uiomove(base + on, n, UIO_READ, uiop);
 615                 }
 616 
 617                 if (!error) {
 618                         /*
 619                          * If read a whole block or read to eof,
 620                          * won't need this buffer again soon.
 621                          */
 622                         mutex_enter(&rp->r_statelock);
 623                         if (n + on == MAXBSIZE ||
 624                             uiop->uio_loffset == rp->r_size)
 625                                 flags = SM_DONTNEED;
 626                         else
 627                                 flags = 0;
 628                         mutex_exit(&rp->r_statelock);
 629                         if (vpm_enable) {
 630                                 error = vpm_sync_pages(vp, off, n, flags);
 631                         } else {
 632                                 error = segmap_release(segkmap, base, flags);
 633                         }
 634                 } else {
 635                         if (vpm_enable) {
 636                                 (void) vpm_sync_pages(vp, off, n, 0);
 637                         } else {
 638                                 (void) segmap_release(segkmap, base, 0);
 639                         }
 640                 }
 641         } while (!error && uiop->uio_resid > 0);
 642 
 643         return (error);
 644 }
 645 
 646 /* ARGSUSED */
 647 static int
 648 nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 649         caller_context_t *ct)
 650 {
 651         rlim64_t limit = uiop->uio_llimit;
 652         rnode_t *rp;
 653         u_offset_t off;
 654         caddr_t base;
 655         uint_t flags;
 656         int remainder;
 657         size_t n;
 658         int on;
 659         int error;
 660         int resid;
 661         offset_t offset;
 662         mntinfo_t *mi;
 663         uint_t bsize;
 664 
 665         rp = VTOR(vp);
 666 
 667         if (vp->v_type != VREG)
 668                 return (EISDIR);
 669 
 670         mi = VTOMI(vp);
 671         if (nfs_zone() != mi->mi_zone)
 672                 return (EIO);
 673         if (uiop->uio_resid == 0)
 674                 return (0);
 675 
 676         if (ioflag & FAPPEND) {
 677                 struct vattr va;
 678 
 679                 /*
 680                  * Must serialize if appending.
 681                  */
 682                 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 683                         nfs_rw_exit(&rp->r_rwlock);
 684                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 685                             INTR(vp)))
 686                                 return (EINTR);
 687                 }
 688 
 689                 va.va_mask = AT_SIZE;
 690                 error = nfs3getattr(vp, &va, cr);
 691                 if (error)
 692                         return (error);
 693                 uiop->uio_loffset = va.va_size;
 694         }
 695 
 696         offset = uiop->uio_loffset + uiop->uio_resid;
 697 
 698         if (uiop->uio_loffset < 0 || offset < 0)
 699                 return (EINVAL);
 700 
 701         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 702                 limit = MAXOFFSET_T;
 703 
 704         /*
 705          * Check to make sure that the process will not exceed
 706          * its limit on file size.  It is okay to write up to
 707          * the limit, but not beyond.  Thus, the write which
 708          * reaches the limit will be short and the next write
 709          * will return an error.
 710          */
 711         remainder = 0;
 712         if (offset > limit) {
 713                 remainder = offset - limit;
 714                 uiop->uio_resid = limit - uiop->uio_loffset;
 715                 if (uiop->uio_resid <= 0) {
 716                         proc_t *p = ttoproc(curthread);
 717 
 718                         uiop->uio_resid += remainder;
 719                         mutex_enter(&p->p_lock);
 720                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 721                             p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 722                         mutex_exit(&p->p_lock);
 723                         return (EFBIG);
 724                 }
 725         }
 726 
 727         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 728                 return (EINTR);
 729 
 730         /*
 731          * Bypass VM if caching has been disabled (e.g., locking) or if
 732          * using client-side direct I/O and the file is not mmap'd and
 733          * there are no cached pages.
 734          */
 735         if ((vp->v_flag & VNOCACHE) ||
 736             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 737             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 738             !vn_has_cached_data(vp))) {
 739                 size_t bufsize;
 740                 int count;
 741                 u_offset_t org_offset;
 742                 stable_how stab_comm;
 743 
 744 nfs3_fwrite:
 745                 if (rp->r_flags & RSTALE) {
 746                         resid = uiop->uio_resid;
 747                         offset = uiop->uio_loffset;
 748                         error = rp->r_error;
 749                         /*
 750                          * A close may have cleared r_error, if so,
 751                          * propagate ESTALE error return properly
 752                          */
 753                         if (error == 0)
 754                                 error = ESTALE;
 755                         goto bottom;
 756                 }
 757                 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
 758                 base = kmem_alloc(bufsize, KM_SLEEP);
 759                 do {
 760                         if (ioflag & FDSYNC)
 761                                 stab_comm = DATA_SYNC;
 762                         else
 763                                 stab_comm = FILE_SYNC;
 764                         resid = uiop->uio_resid;
 765                         offset = uiop->uio_loffset;
 766                         count = MIN(uiop->uio_resid, bufsize);
 767                         org_offset = uiop->uio_loffset;
 768                         error = uiomove(base, count, UIO_WRITE, uiop);
 769                         if (!error) {
 770                                 error = nfs3write(vp, base, org_offset,
 771                                     count, cr, &stab_comm);
 772                         }
 773                 } while (!error && uiop->uio_resid > 0);
 774                 kmem_free(base, bufsize);
 775                 goto bottom;
 776         }
 777 
 778 
 779         bsize = vp->v_vfsp->vfs_bsize;
 780 
 781         do {
 782                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 783                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 784                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 785 
 786                 resid = uiop->uio_resid;
 787                 offset = uiop->uio_loffset;
 788 
 789                 if (rp->r_flags & RSTALE) {
 790                         error = rp->r_error;
 791                         /*
 792                          * A close may have cleared r_error, if so,
 793                          * propagate ESTALE error return properly
 794                          */
 795                         if (error == 0)
 796                                 error = ESTALE;
 797                         break;
 798                 }
 799 
 800                 /*
 801                  * Don't create dirty pages faster than they
 802                  * can be cleaned so that the system doesn't
 803                  * get imbalanced.  If the async queue is
 804                  * maxed out, then wait for it to drain before
 805                  * creating more dirty pages.  Also, wait for
 806                  * any threads doing pagewalks in the vop_getattr
 807                  * entry points so that they don't block for
 808                  * long periods.
 809                  */
 810                 mutex_enter(&rp->r_statelock);
 811                 while ((mi->mi_max_threads != 0 &&
 812                     rp->r_awcount > 2 * mi->mi_max_threads) ||
 813                     rp->r_gcount > 0) {
 814                         if (INTR(vp)) {
 815                                 klwp_t *lwp = ttolwp(curthread);
 816 
 817                                 if (lwp != NULL)
 818                                         lwp->lwp_nostop++;
 819                                 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 820                                         mutex_exit(&rp->r_statelock);
 821                                         if (lwp != NULL)
 822                                                 lwp->lwp_nostop--;
 823                                         error = EINTR;
 824                                         goto bottom;
 825                                 }
 826                                 if (lwp != NULL)
 827                                         lwp->lwp_nostop--;
 828                         } else
 829                                 cv_wait(&rp->r_cv, &rp->r_statelock);
 830                 }
 831                 mutex_exit(&rp->r_statelock);
 832 
 833                 /*
 834                  * Touch the page and fault it in if it is not in core
 835                  * before segmap_getmapflt or vpm_data_copy can lock it.
 836                  * This is to avoid the deadlock if the buffer is mapped
 837                  * to the same file through mmap which we want to write.
 838                  */
 839                 uio_prefaultpages((long)n, uiop);
 840 
 841                 if (vpm_enable) {
 842                         /*
 843                          * It will use kpm mappings, so no need to
 844                          * pass an address.
 845                          */
 846                         error = writerp(rp, NULL, n, uiop, 0);
 847                 } else  {
 848                         if (segmap_kpm) {
 849                                 int pon = uiop->uio_loffset & PAGEOFFSET;
 850                                 size_t pn = MIN(PAGESIZE - pon,
 851                                     uiop->uio_resid);
 852                                 int pagecreate;
 853 
 854                                 mutex_enter(&rp->r_statelock);
 855                                 pagecreate = (pon == 0) && (pn == PAGESIZE ||
 856                                     uiop->uio_loffset + pn >= rp->r_size);
 857                                 mutex_exit(&rp->r_statelock);
 858 
 859                                 base = segmap_getmapflt(segkmap, vp, off + on,
 860                                     pn, !pagecreate, S_WRITE);
 861 
 862                                 error = writerp(rp, base + pon, n, uiop,
 863                                     pagecreate);
 864 
 865                         } else {
 866                                 base = segmap_getmapflt(segkmap, vp, off + on,
 867                                     n, 0, S_READ);
 868                                 error = writerp(rp, base + on, n, uiop, 0);
 869                         }
 870                 }
 871 
 872                 if (!error) {
 873                         if (mi->mi_flags & MI_NOAC)
 874                                 flags = SM_WRITE;
 875                         else if ((uiop->uio_loffset % bsize) == 0 ||
 876                             IS_SWAPVP(vp)) {
 877                                 /*
 878                                  * Have written a whole block.
 879                                  * Start an asynchronous write
 880                                  * and mark the buffer to
 881                                  * indicate that it won't be
 882                                  * needed again soon.
 883                                  */
 884                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 885                         } else
 886                                 flags = 0;
 887                         if ((ioflag & (FSYNC|FDSYNC)) ||
 888                             (rp->r_flags & ROUTOFSPACE)) {
 889                                 flags &= ~SM_ASYNC;
 890                                 flags |= SM_WRITE;
 891                         }
 892                         if (vpm_enable) {
 893                                 error = vpm_sync_pages(vp, off, n, flags);
 894                         } else {
 895                                 error = segmap_release(segkmap, base, flags);
 896                         }
 897                 } else {
 898                         if (vpm_enable) {
 899                                 (void) vpm_sync_pages(vp, off, n, 0);
 900                         } else {
 901                                 (void) segmap_release(segkmap, base, 0);
 902                         }
 903                         /*
 904                          * In the event that we got an access error while
 905                          * faulting in a page for a write-only file just
 906                          * force a write.
 907                          */
 908                         if (error == EACCES)
 909                                 goto nfs3_fwrite;
 910                 }
 911         } while (!error && uiop->uio_resid > 0);
 912 
 913 bottom:
 914         if (error) {
 915                 uiop->uio_resid = resid + remainder;
 916                 uiop->uio_loffset = offset;
 917         } else
 918                 uiop->uio_resid += remainder;
 919 
 920         nfs_rw_exit(&rp->r_lkserlock);
 921 
 922         return (error);
 923 }
 924 
 925 /*
 926  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 927  */
 928 static int
 929 nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
 930         int flags, cred_t *cr)
 931 {
 932         struct buf *bp;
 933         int error;
 934         page_t *savepp;
 935         uchar_t fsdata;
 936         stable_how stab_comm;
 937 
 938         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 939         bp = pageio_setup(pp, len, vp, flags);
 940         ASSERT(bp != NULL);
 941 
 942         /*
 943          * pageio_setup should have set b_addr to 0.  This
 944          * is correct since we want to do I/O on a page
 945          * boundary.  bp_mapin will use this addr to calculate
 946          * an offset, and then set b_addr to the kernel virtual
 947          * address it allocated for us.
 948          */
 949         ASSERT(bp->b_un.b_addr == 0);
 950 
 951         bp->b_edev = 0;
 952         bp->b_dev = 0;
 953         bp->b_lblkno = lbtodb(off);
 954         bp->b_file = vp;
 955         bp->b_offset = (offset_t)off;
 956         bp_mapin(bp);
 957 
 958         /*
 959          * Calculate the desired level of stability to write data
 960          * on the server and then mark all of the pages to reflect
 961          * this.
 962          */
 963         if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
 964             freemem > desfree) {
 965                 stab_comm = UNSTABLE;
 966                 fsdata = C_DELAYCOMMIT;
 967         } else {
 968                 stab_comm = FILE_SYNC;
 969                 fsdata = C_NOCOMMIT;
 970         }
 971 
 972         savepp = pp;
 973         do {
 974                 pp->p_fsdata = fsdata;
 975         } while ((pp = pp->p_next) != savepp);
 976 
 977         error = nfs3_bio(bp, &stab_comm, cr);
 978 
 979         bp_mapout(bp);
 980         pageio_done(bp);
 981 
 982         /*
 983          * If the server wrote pages in a more stable fashion than
 984          * was requested, then clear all of the marks in the pages
 985          * indicating that COMMIT operations were required.
 986          */
 987         if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
 988                 do {
 989                         pp->p_fsdata = C_NOCOMMIT;
 990                 } while ((pp = pp->p_next) != savepp);
 991         }
 992 
 993         return (error);
 994 }
 995 
 996 /*
 997  * Write to file.  Writes to remote server in largest size
 998  * chunks that the server can handle.  Write is synchronous.
 999  */
1000 static int
1001 nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
1002         stable_how *stab_comm)
1003 {
1004         mntinfo_t *mi;
1005         WRITE3args args;
1006         WRITE3res res;
1007         int error;
1008         int tsize;
1009         rnode_t *rp;
1010         int douprintf;
1011 
1012         rp = VTOR(vp);
1013         mi = VTOMI(vp);
1014 
1015         ASSERT(nfs_zone() == mi->mi_zone);
1016 
1017         args.file = *VTOFH3(vp);
1018         args.stable = *stab_comm;
1019 
1020         *stab_comm = FILE_SYNC;
1021 
1022         douprintf = 1;
1023 
1024         do {
1025                 if ((vp->v_flag & VNOCACHE) ||
1026                     (rp->r_flags & RDIRECTIO) ||
1027                     (mi->mi_flags & MI_DIRECTIO))
1028                         tsize = MIN(mi->mi_stsize, count);
1029                 else
1030                         tsize = MIN(mi->mi_curwrite, count);
1031                 args.offset = (offset3)offset;
1032                 args.count = (count3)tsize;
1033                 args.data.data_len = (uint_t)tsize;
1034                 args.data.data_val = base;
1035 
1036                 if (mi->mi_io_kstats) {
1037                         mutex_enter(&mi->mi_lock);
1038                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1039                         mutex_exit(&mi->mi_lock);
1040                 }
1041                 args.mblk = NULL;
1042                 do {
1043                         error = rfs3call(mi, NFSPROC3_WRITE,
1044                             xdr_WRITE3args, (caddr_t)&args,
1045                             xdr_WRITE3res, (caddr_t)&res, cr,
1046                             &douprintf, &res.status, 0, NULL);
1047                 } while (error == ENFS_TRYAGAIN);
1048                 if (mi->mi_io_kstats) {
1049                         mutex_enter(&mi->mi_lock);
1050                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1051                         mutex_exit(&mi->mi_lock);
1052                 }
1053 
1054                 if (error)
1055                         return (error);
1056                 error = geterrno3(res.status);
1057                 if (!error) {
1058                         if (res.resok.count > args.count) {
1059                                 zcmn_err(getzoneid(), CE_WARN,
1060                                     "nfs3write: server %s wrote %u, "
1061                                     "requested was %u",
1062                                     rp->r_server->sv_hostname,
1063                                     res.resok.count, args.count);
1064                                 return (EIO);
1065                         }
1066                         if (res.resok.committed == UNSTABLE) {
1067                                 *stab_comm = UNSTABLE;
1068                                 if (args.stable == DATA_SYNC ||
1069                                     args.stable == FILE_SYNC) {
1070                                         zcmn_err(getzoneid(), CE_WARN,
1071                         "nfs3write: server %s did not commit to stable storage",
1072                                             rp->r_server->sv_hostname);
1073                                         return (EIO);
1074                                 }
1075                         }
1076                         tsize = (int)res.resok.count;
1077                         count -= tsize;
1078                         base += tsize;
1079                         offset += tsize;
1080                         if (mi->mi_io_kstats) {
1081                                 mutex_enter(&mi->mi_lock);
1082                                 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1083                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1084                                     tsize;
1085                                 mutex_exit(&mi->mi_lock);
1086                         }
1087                         lwp_stat_update(LWP_STAT_OUBLK, 1);
1088                         mutex_enter(&rp->r_statelock);
1089                         if (rp->r_flags & RHAVEVERF) {
1090                                 if (rp->r_verf != res.resok.verf) {
1091                                         nfs3_set_mod(vp);
1092                                         rp->r_verf = res.resok.verf;
1093                                         /*
1094                                          * If the data was written UNSTABLE,
1095                                          * then might as well stop because
1096                                          * the whole block will have to get
1097                                          * rewritten anyway.
1098                                          */
1099                                         if (*stab_comm == UNSTABLE) {
1100                                                 mutex_exit(&rp->r_statelock);
1101                                                 break;
1102                                         }
1103                                 }
1104                         } else {
1105                                 rp->r_verf = res.resok.verf;
1106                                 rp->r_flags |= RHAVEVERF;
1107                         }
1108                         /*
1109                          * Mark the attribute cache as timed out and
1110                          * set RWRITEATTR to indicate that the file
1111                          * was modified with a WRITE operation and
1112                          * that the attributes can not be trusted.
1113                          */
1114                         PURGE_ATTRCACHE_LOCKED(rp);
1115                         rp->r_flags |= RWRITEATTR;
1116                         mutex_exit(&rp->r_statelock);
1117                 }
1118         } while (!error && count);
1119 
1120         return (error);
1121 }
1122 
1123 /*
1124  * Read from a file.  Reads data in largest chunks our interface can handle.
1125  */
1126 static int
1127 nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1128         size_t *residp, cred_t *cr)
1129 {
1130         mntinfo_t *mi;
1131         READ3args args;
1132         READ3vres res;
1133         int tsize;
1134         int error;
1135         int douprintf;
1136         failinfo_t fi;
1137         rnode_t *rp;
1138         struct vattr va;
1139         hrtime_t t;
1140 
1141         rp = VTOR(vp);
1142         mi = VTOMI(vp);
1143         ASSERT(nfs_zone() == mi->mi_zone);
1144         douprintf = 1;
1145 
1146         args.file = *VTOFH3(vp);
1147         fi.vp = vp;
1148         fi.fhp = (caddr_t)&args.file;
1149         fi.copyproc = nfs3copyfh;
1150         fi.lookupproc = nfs3lookup;
1151         fi.xattrdirproc = acl_getxattrdir3;
1152 
1153         res.pov.fres.vp = vp;
1154         res.pov.fres.vap = &va;
1155 
1156         res.wlist = NULL;
1157         *residp = count;
1158         do {
1159                 if (mi->mi_io_kstats) {
1160                         mutex_enter(&mi->mi_lock);
1161                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1162                         mutex_exit(&mi->mi_lock);
1163                 }
1164 
1165                 do {
1166                         if ((vp->v_flag & VNOCACHE) ||
1167                             (rp->r_flags & RDIRECTIO) ||
1168                             (mi->mi_flags & MI_DIRECTIO))
1169                                 tsize = MIN(mi->mi_tsize, count);
1170                         else
1171                                 tsize = MIN(mi->mi_curread, count);
1172                         res.data.data_val = base;
1173                         res.data.data_len = tsize;
1174                         args.offset = (offset3)offset;
1175                         args.count = (count3)tsize;
1176                         args.res_uiop = NULL;
1177                         args.res_data_val_alt = base;
1178 
1179                         t = gethrtime();
1180                         error = rfs3call(mi, NFSPROC3_READ,
1181                             xdr_READ3args, (caddr_t)&args,
1182                             xdr_READ3vres, (caddr_t)&res, cr,
1183                             &douprintf, &res.status, 0, &fi);
1184                 } while (error == ENFS_TRYAGAIN);
1185 
1186                 if (mi->mi_io_kstats) {
1187                         mutex_enter(&mi->mi_lock);
1188                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1189                         mutex_exit(&mi->mi_lock);
1190                 }
1191 
1192                 if (error)
1193                         return (error);
1194 
1195                 error = geterrno3(res.status);
1196                 if (error)
1197                         return (error);
1198 
1199                 if (res.count != res.data.data_len) {
1200                         zcmn_err(getzoneid(), CE_WARN,
1201                             "nfs3read: server %s returned incorrect amount",
1202                             rp->r_server->sv_hostname);
1203                         return (EIO);
1204                 }
1205 
1206                 count -= res.count;
1207                 *residp = count;
1208                 base += res.count;
1209                 offset += res.count;
1210                 if (mi->mi_io_kstats) {
1211                         mutex_enter(&mi->mi_lock);
1212                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1213                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1214                         mutex_exit(&mi->mi_lock);
1215                 }
1216                 lwp_stat_update(LWP_STAT_INBLK, 1);
1217         } while (count && !res.eof);
1218 
1219         if (res.pov.attributes) {
1220                 mutex_enter(&rp->r_statelock);
1221                 if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1222                         mutex_exit(&rp->r_statelock);
1223                         PURGE_ATTRCACHE(vp);
1224                 } else {
1225                         if (rp->r_mtime <= t)
1226                                 nfs_attrcache_va(vp, &va);
1227                         mutex_exit(&rp->r_statelock);
1228                 }
1229         }
1230 
1231         return (0);
1232 }
1233 
1234 /* ARGSUSED */
1235 static int
1236 nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1237         caller_context_t *ct)
1238 {
1239 
1240         if (nfs_zone() != VTOMI(vp)->mi_zone)
1241                 return (EIO);
1242         switch (cmd) {
1243                 case _FIODIRECTIO:
1244                         return (nfs_directio(vp, (int)arg, cr));
1245                 default:
1246                         return (ENOTTY);
1247         }
1248 }
1249 
1250 /* ARGSUSED */
1251 static int
1252 nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1253         caller_context_t *ct)
1254 {
1255         int error;
1256         rnode_t *rp;
1257 
1258         if (nfs_zone() != VTOMI(vp)->mi_zone)
1259                 return (EIO);
1260         /*
1261          * If it has been specified that the return value will
1262          * just be used as a hint, and we are only being asked
1263          * for size, fsid or rdevid, then return the client's
1264          * notion of these values without checking to make sure
1265          * that the attribute cache is up to date.
1266          * The whole point is to avoid an over the wire GETATTR
1267          * call.
1268          */
1269         rp = VTOR(vp);
1270         if (flags & ATTR_HINT) {
1271                 if (vap->va_mask ==
1272                     (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1273                         mutex_enter(&rp->r_statelock);
1274                         if (vap->va_mask | AT_SIZE)
1275                                 vap->va_size = rp->r_size;
1276                         if (vap->va_mask | AT_FSID)
1277                                 vap->va_fsid = rp->r_attr.va_fsid;
1278                         if (vap->va_mask | AT_RDEV)
1279                                 vap->va_rdev = rp->r_attr.va_rdev;
1280                         mutex_exit(&rp->r_statelock);
1281                         return (0);
1282                 }
1283         }
1284 
1285         /*
1286          * Only need to flush pages if asking for the mtime
1287          * and if there any dirty pages or any outstanding
1288          * asynchronous (write) requests for this file.
1289          */
1290         if (vap->va_mask & AT_MTIME) {
1291                 if (vn_has_cached_data(vp) &&
1292                     ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1293                         mutex_enter(&rp->r_statelock);
1294                         rp->r_gcount++;
1295                         mutex_exit(&rp->r_statelock);
1296                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1297                         mutex_enter(&rp->r_statelock);
1298                         if (error && (error == ENOSPC || error == EDQUOT)) {
1299                                 if (!rp->r_error)
1300                                         rp->r_error = error;
1301                         }
1302                         if (--rp->r_gcount == 0)
1303                                 cv_broadcast(&rp->r_cv);
1304                         mutex_exit(&rp->r_statelock);
1305                 }
1306         }
1307 
1308         return (nfs3getattr(vp, vap, cr));
1309 }
1310 
1311 /*ARGSUSED4*/
1312 static int
1313 nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1314                 caller_context_t *ct)
1315 {
1316         int error;
1317         struct vattr va;
1318 
1319         if (vap->va_mask & AT_NOSET)
1320                 return (EINVAL);
1321         if (nfs_zone() != VTOMI(vp)->mi_zone)
1322                 return (EIO);
1323 
1324         va.va_mask = AT_UID | AT_MODE;
1325         error = nfs3getattr(vp, &va, cr);
1326         if (error)
1327                 return (error);
1328 
1329         error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1330             vp);
1331         if (error)
1332                 return (error);
1333 
1334         return (nfs3setattr(vp, vap, flags, cr));
1335 }
1336 
1337 static int
1338 nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1339 {
1340         int error;
1341         uint_t mask;
1342         SETATTR3args args;
1343         SETATTR3res res;
1344         int douprintf;
1345         rnode_t *rp;
1346         struct vattr va;
1347         mode_t omode;
1348         vsecattr_t *vsp;
1349         hrtime_t t;
1350 
1351         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1352         mask = vap->va_mask;
1353 
1354         rp = VTOR(vp);
1355 
1356         /*
1357          * Only need to flush pages if there are any pages and
1358          * if the file is marked as dirty in some fashion.  The
1359          * file must be flushed so that we can accurately
1360          * determine the size of the file and the cached data
1361          * after the SETATTR returns.  A file is considered to
1362          * be dirty if it is either marked with RDIRTY, has
1363          * outstanding i/o's active, or is mmap'd.  In this
1364          * last case, we can't tell whether there are dirty
1365          * pages, so we flush just to be sure.
1366          */
1367         if (vn_has_cached_data(vp) &&
1368             ((rp->r_flags & RDIRTY) ||
1369             rp->r_count > 0 ||
1370             rp->r_mapcnt > 0)) {
1371                 ASSERT(vp->v_type != VCHR);
1372                 error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1373                 if (error && (error == ENOSPC || error == EDQUOT)) {
1374                         mutex_enter(&rp->r_statelock);
1375                         if (!rp->r_error)
1376                                 rp->r_error = error;
1377                         mutex_exit(&rp->r_statelock);
1378                 }
1379         }
1380 
1381         args.object = *RTOFH3(rp);
1382         /*
1383          * If the intent is for the server to set the times,
1384          * there is no point in have the mask indicating set mtime or
1385          * atime, because the vap values may be junk, and so result
1386          * in an overflow error. Remove these flags from the vap mask
1387          * before calling in this case, and restore them afterwards.
1388          */
1389         if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
1390                 /* Use server times, so don't set the args time fields */
1391                 vap->va_mask &= ~(AT_ATIME | AT_MTIME);
1392                 error = vattr_to_sattr3(vap, &args.new_attributes);
1393                 vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
1394                 if (mask & AT_ATIME) {
1395                         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1396                 }
1397                 if (mask & AT_MTIME) {
1398                         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1399                 }
1400         } else {
1401                 /* Either do not set times or use the client specified times */
1402                 error = vattr_to_sattr3(vap, &args.new_attributes);
1403         }
1404 
1405         if (error) {
1406                 /* req time field(s) overflow - return immediately */
1407                 return (error);
1408         }
1409 
1410         va.va_mask = AT_MODE | AT_CTIME;
1411         error = nfs3getattr(vp, &va, cr);
1412         if (error)
1413                 return (error);
1414         omode = va.va_mode;
1415 
1416 tryagain:
1417         if (mask & AT_SIZE) {
1418                 args.guard.check = TRUE;
1419                 args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1420                 args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1421         } else
1422                 args.guard.check = FALSE;
1423 
1424         douprintf = 1;
1425 
1426         t = gethrtime();
1427 
1428         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1429             xdr_SETATTR3args, (caddr_t)&args,
1430             xdr_SETATTR3res, (caddr_t)&res, cr,
1431             &douprintf, &res.status, 0, NULL);
1432 
1433         /*
1434          * Purge the access cache and ACL cache if changing either the
1435          * owner of the file, the group owner, or the mode.  These may
1436          * change the access permissions of the file, so purge old
1437          * information and start over again.
1438          */
1439         if (mask & (AT_UID | AT_GID | AT_MODE)) {
1440                 (void) nfs_access_purge_rp(rp);
1441                 if (rp->r_secattr != NULL) {
1442                         mutex_enter(&rp->r_statelock);
1443                         vsp = rp->r_secattr;
1444                         rp->r_secattr = NULL;
1445                         mutex_exit(&rp->r_statelock);
1446                         if (vsp != NULL)
1447                                 nfs_acl_free(vsp);
1448                 }
1449         }
1450 
1451         if (error) {
1452                 PURGE_ATTRCACHE(vp);
1453                 return (error);
1454         }
1455 
1456         error = geterrno3(res.status);
1457         if (!error) {
1458                 /*
1459                  * If changing the size of the file, invalidate
1460                  * any local cached data which is no longer part
1461                  * of the file.  We also possibly invalidate the
1462                  * last page in the file.  We could use
1463                  * pvn_vpzero(), but this would mark the page as
1464                  * modified and require it to be written back to
1465                  * the server for no particularly good reason.
1466                  * This way, if we access it, then we bring it
1467                  * back in.  A read should be cheaper than a
1468                  * write.
1469                  */
1470                 if (mask & AT_SIZE) {
1471                         nfs_invalidate_pages(vp,
1472                             (vap->va_size & PAGEMASK), cr);
1473                 }
1474                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1475                 /*
1476                  * Some servers will change the mode to clear the setuid
1477                  * and setgid bits when changing the uid or gid.  The
1478                  * client needs to compensate appropriately.
1479                  */
1480                 if (mask & (AT_UID | AT_GID)) {
1481                         int terror;
1482 
1483                         va.va_mask = AT_MODE;
1484                         terror = nfs3getattr(vp, &va, cr);
1485                         if (!terror &&
1486                             (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
1487                             (!(mask & AT_MODE) && va.va_mode != omode))) {
1488                                 va.va_mask = AT_MODE;
1489                                 if (mask & AT_MODE)
1490                                         va.va_mode = vap->va_mode;
1491                                 else
1492                                         va.va_mode = omode;
1493                                 (void) nfs3setattr(vp, &va, 0, cr);
1494                         }
1495                 }
1496         } else {
1497                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1498                 /*
1499                  * If we got back a "not synchronized" error, then
1500                  * we need to retry with a new guard value.  The
1501                  * guard value used is the change time.  If the
1502                  * server returned post_op_attr, then we can just
1503                  * retry because we have the latest attributes.
1504                  * Otherwise, we issue a GETATTR to get the latest
1505                  * attributes and then retry.  If we couldn't get
1506                  * the attributes this way either, then we give
1507                  * up because we can't complete the operation as
1508                  * required.
1509                  */
1510                 if (res.status == NFS3ERR_NOT_SYNC) {
1511                         va.va_mask = AT_CTIME;
1512                         if (nfs3getattr(vp, &va, cr) == 0)
1513                                 goto tryagain;
1514                 }
1515                 PURGE_STALE_FH(error, vp, cr);
1516         }
1517 
1518         return (error);
1519 }
1520 
1521 static int
1522 nfs3_accessx(void *vp, int mode, cred_t *cr)
1523 {
1524         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1525         return (nfs3_access(vp, mode, 0, cr, NULL));
1526 }
1527 
1528 /* ARGSUSED */
1529 static int
1530 nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1531 {
1532         int error;
1533         ACCESS3args args;
1534         ACCESS3res res;
1535         int douprintf;
1536         uint32 acc;
1537         rnode_t *rp;
1538         cred_t *cred, *ncr, *ncrfree = NULL;
1539         failinfo_t fi;
1540         nfs_access_type_t cacc;
1541         hrtime_t t;
1542 
1543         acc = 0;
1544         if (nfs_zone() != VTOMI(vp)->mi_zone)
1545                 return (EIO);
1546         if (mode & VREAD)
1547                 acc |= ACCESS3_READ;
1548         if (mode & VWRITE) {
1549                 if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1550                         return (EROFS);
1551                 if (vp->v_type == VDIR)
1552                         acc |= ACCESS3_DELETE;
1553                 acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1554         }
1555         if (mode & VEXEC) {
1556                 if (vp->v_type == VDIR)
1557                         acc |= ACCESS3_LOOKUP;
1558                 else
1559                         acc |= ACCESS3_EXECUTE;
1560         }
1561 
1562         rp = VTOR(vp);
1563         args.object = *VTOFH3(vp);
1564         if (vp->v_type == VDIR) {
1565                 args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1566                     ACCESS3_EXTEND | ACCESS3_LOOKUP;
1567         } else {
1568                 args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1569                     ACCESS3_EXECUTE;
1570         }
1571         fi.vp = vp;
1572         fi.fhp = (caddr_t)&args.object;
1573         fi.copyproc = nfs3copyfh;
1574         fi.lookupproc = nfs3lookup;
1575         fi.xattrdirproc = acl_getxattrdir3;
1576 
1577         cred = cr;
1578         /*
1579          * ncr and ncrfree both initially
1580          * point to the memory area returned
1581          * by crnetadjust();
1582          * ncrfree not NULL when exiting means
1583          * that we need to release it
1584          */
1585         ncr = crnetadjust(cred);
1586         ncrfree = ncr;
1587 tryagain:
1588         if (rp->r_acache != NULL) {
1589                 cacc = nfs_access_check(rp, acc, cred);
1590                 if (cacc == NFS_ACCESS_ALLOWED) {
1591                         if (ncrfree != NULL)
1592                                 crfree(ncrfree);
1593                         return (0);
1594                 }
1595                 if (cacc == NFS_ACCESS_DENIED) {
1596                         /*
1597                          * If the cred can be adjusted, try again
1598                          * with the new cred.
1599                          */
1600                         if (ncr != NULL) {
1601                                 cred = ncr;
1602                                 ncr = NULL;
1603                                 goto tryagain;
1604                         }
1605                         if (ncrfree != NULL)
1606                                 crfree(ncrfree);
1607                         return (EACCES);
1608                 }
1609         }
1610 
1611         douprintf = 1;
1612 
1613         t = gethrtime();
1614 
1615         error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1616             xdr_ACCESS3args, (caddr_t)&args,
1617             xdr_ACCESS3res, (caddr_t)&res, cred,
1618             &douprintf, &res.status, 0, &fi);
1619 
1620         if (error) {
1621                 if (ncrfree != NULL)
1622                         crfree(ncrfree);
1623                 return (error);
1624         }
1625 
1626         error = geterrno3(res.status);
1627         if (!error) {
1628                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1629                 nfs_access_cache(rp, args.access, res.resok.access, cred);
1630                 /*
1631                  * we just cached results with cred; if cred is the
1632                  * adjusted credentials from crnetadjust, we do not want
1633                  * to release them before exiting: hence setting ncrfree
1634                  * to NULL
1635                  */
1636                 if (cred != cr)
1637                         ncrfree = NULL;
1638                 if ((acc & res.resok.access) != acc) {
1639                         /*
1640                          * If the cred can be adjusted, try again
1641                          * with the new cred.
1642                          */
1643                         if (ncr != NULL) {
1644                                 cred = ncr;
1645                                 ncr = NULL;
1646                                 goto tryagain;
1647                         }
1648                         error = EACCES;
1649                 }
1650         } else {
1651                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1652                 PURGE_STALE_FH(error, vp, cr);
1653         }
1654 
1655         if (ncrfree != NULL)
1656                 crfree(ncrfree);
1657 
1658         return (error);
1659 }
1660 
1661 static int nfs3_do_symlink_cache = 1;
1662 
1663 /* ARGSUSED */
1664 static int
1665 nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1666 {
1667         int error;
1668         READLINK3args args;
1669         READLINK3res res;
1670         nfspath3 resdata_backup;
1671         rnode_t *rp;
1672         int douprintf;
1673         int len;
1674         failinfo_t fi;
1675         hrtime_t t;
1676 
1677         /*
1678          * Can't readlink anything other than a symbolic link.
1679          */
1680         if (vp->v_type != VLNK)
1681                 return (EINVAL);
1682         if (nfs_zone() != VTOMI(vp)->mi_zone)
1683                 return (EIO);
1684 
1685         rp = VTOR(vp);
1686         if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1687                 error = nfs3_validate_caches(vp, cr);
1688                 if (error)
1689                         return (error);
1690                 mutex_enter(&rp->r_statelock);
1691                 if (rp->r_symlink.contents != NULL) {
1692                         error = uiomove(rp->r_symlink.contents,
1693                             rp->r_symlink.len, UIO_READ, uiop);
1694                         mutex_exit(&rp->r_statelock);
1695                         return (error);
1696                 }
1697                 mutex_exit(&rp->r_statelock);
1698         }
1699 
1700         args.symlink = *VTOFH3(vp);
1701         fi.vp = vp;
1702         fi.fhp = (caddr_t)&args.symlink;
1703         fi.copyproc = nfs3copyfh;
1704         fi.lookupproc = nfs3lookup;
1705         fi.xattrdirproc = acl_getxattrdir3;
1706 
1707         res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1708 
1709         resdata_backup = res.resok.data;
1710 
1711         douprintf = 1;
1712 
1713         t = gethrtime();
1714 
1715         error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1716             xdr_READLINK3args, (caddr_t)&args,
1717             xdr_READLINK3res, (caddr_t)&res, cr,
1718             &douprintf, &res.status, 0, &fi);
1719 
1720         if (res.resok.data == nfs3nametoolong)
1721                 error = EINVAL;
1722 
1723         if (error) {
1724                 kmem_free(resdata_backup, MAXPATHLEN);
1725                 return (error);
1726         }
1727 
1728         error = geterrno3(res.status);
1729         if (!error) {
1730                 nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1731                     cr);
1732                 len = strlen(res.resok.data);
1733                 error = uiomove(res.resok.data, len, UIO_READ, uiop);
1734                 if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1735                         mutex_enter(&rp->r_statelock);
1736                                 if (rp->r_symlink.contents == NULL) {
1737                                 rp->r_symlink.contents = res.resok.data;
1738                                 rp->r_symlink.len = len;
1739                                 rp->r_symlink.size = MAXPATHLEN;
1740                                 mutex_exit(&rp->r_statelock);
1741                         } else {
1742                                 mutex_exit(&rp->r_statelock);
1743 
1744                                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1745                         }
1746                 } else {
1747                         kmem_free((void *)res.resok.data, MAXPATHLEN);
1748                 }
1749         } else {
1750                 nfs3_cache_post_op_attr(vp,
1751                     &res.resfail.symlink_attributes, t, cr);
1752                 PURGE_STALE_FH(error, vp, cr);
1753 
1754                 kmem_free((void *)res.resok.data, MAXPATHLEN);
1755 
1756         }
1757 
1758         /*
1759          * The over the wire error for attempting to readlink something
1760          * other than a symbolic link is ENXIO.  However, we need to
1761          * return EINVAL instead of ENXIO, so we map it here.
1762          */
1763         return (error == ENXIO ? EINVAL : error);
1764 }
1765 
1766 /*
1767  * Flush local dirty pages to stable storage on the server.
1768  *
1769  * If FNODSYNC is specified, then there is nothing to do because
1770  * metadata changes are not cached on the client before being
1771  * sent to the server.
1772  */
1773 /* ARGSUSED */
1774 static int
1775 nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1776 {
1777         int error;
1778 
1779         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1780                 return (0);
1781         if (nfs_zone() != VTOMI(vp)->mi_zone)
1782                 return (EIO);
1783 
1784         error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
1785         if (!error)
1786                 error = VTOR(vp)->r_error;
1787         return (error);
1788 }
1789 
1790 /*
1791  * Weirdness: if the file was removed or the target of a rename
1792  * operation while it was open, it got renamed instead.  Here we
1793  * remove the renamed file.
1794  */
1795 /* ARGSUSED */
1796 static void
1797 nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1798 {
1799         rnode_t *rp;
1800 
1801         ASSERT(vp != DNLC_NO_VNODE);
1802 
1803         /*
1804          * If this is coming from the wrong zone, we let someone in the right
1805          * zone take care of it asynchronously.  We can get here due to
1806          * VN_RELE() being called from pageout() or fsflush().  This call may
1807          * potentially turn into an expensive no-op if, for instance, v_count
1808          * gets incremented in the meantime, but it's still correct.
1809          */
1810         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1811                 nfs_async_inactive(vp, cr, nfs3_inactive);
1812                 return;
1813         }
1814 
1815         rp = VTOR(vp);
1816 redo:
1817         if (rp->r_unldvp != NULL) {
1818                 /*
1819                  * Save the vnode pointer for the directory where the
1820                  * unlinked-open file got renamed, then set it to NULL
1821                  * to prevent another thread from getting here before
1822                  * we're done with the remove.  While we have the
1823                  * statelock, make local copies of the pertinent rnode
1824                  * fields.  If we weren't to do this in an atomic way, the
1825                  * the unl* fields could become inconsistent with respect
1826                  * to each other due to a race condition between this
1827                  * code and nfs_remove().  See bug report 1034328.
1828                  */
1829                 mutex_enter(&rp->r_statelock);
1830                 if (rp->r_unldvp != NULL) {
1831                         vnode_t *unldvp;
1832                         char *unlname;
1833                         cred_t *unlcred;
1834                         REMOVE3args args;
1835                         REMOVE3res res;
1836                         int douprintf;
1837                         int error;
1838                         hrtime_t t;
1839 
1840                         unldvp = rp->r_unldvp;
1841                         rp->r_unldvp = NULL;
1842                         unlname = rp->r_unlname;
1843                         rp->r_unlname = NULL;
1844                         unlcred = rp->r_unlcred;
1845                         rp->r_unlcred = NULL;
1846                         mutex_exit(&rp->r_statelock);
1847 
1848                         /*
1849                          * If there are any dirty pages left, then flush
1850                          * them.  This is unfortunate because they just
1851                          * may get thrown away during the remove operation,
1852                          * but we have to do this for correctness.
1853                          */
1854                         if (vn_has_cached_data(vp) &&
1855                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1856                                 ASSERT(vp->v_type != VCHR);
1857                                 error = nfs3_putpage(vp, (offset_t)0, 0, 0,
1858                                     cr, ct);
1859                                 if (error) {
1860                                         mutex_enter(&rp->r_statelock);
1861                                         if (!rp->r_error)
1862                                                 rp->r_error = error;
1863                                         mutex_exit(&rp->r_statelock);
1864                                 }
1865                         }
1866 
1867                         /*
1868                          * Do the remove operation on the renamed file
1869                          */
1870                         setdiropargs3(&args.object, unlname, unldvp);
1871 
1872                         douprintf = 1;
1873 
1874                         t = gethrtime();
1875 
1876                         error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1877                             xdr_diropargs3, (caddr_t)&args,
1878                             xdr_REMOVE3res, (caddr_t)&res, unlcred,
1879                             &douprintf, &res.status, 0, NULL);
1880 
1881                         if (error) {
1882                                 PURGE_ATTRCACHE(unldvp);
1883                         } else {
1884                                 error = geterrno3(res.status);
1885                                 if (!error) {
1886                                         nfs3_cache_wcc_data(unldvp,
1887                                             &res.resok.dir_wcc, t, cr);
1888                                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1889                                                 nfs_purge_rddir_cache(unldvp);
1890                                 } else {
1891                                         nfs3_cache_wcc_data(unldvp,
1892                                             &res.resfail.dir_wcc, t, cr);
1893                                         PURGE_STALE_FH(error, unldvp, cr);
1894                                 }
1895                         }
1896 
1897                         /*
1898                          * Release stuff held for the remove
1899                          */
1900                         VN_RELE(unldvp);
1901                         kmem_free(unlname, MAXNAMELEN);
1902                         crfree(unlcred);
1903                         goto redo;
1904                 }
1905                 mutex_exit(&rp->r_statelock);
1906         }
1907 
1908         rp_addfree(rp, cr);
1909 }
1910 
1911 /*
1912  * Remote file system operations having to do with directory manipulation.
1913  */
1914 
1915 /* ARGSUSED */
1916 static int
1917 nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1918         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1919         int *direntflags, pathname_t *realpnp)
1920 {
1921         int error;
1922         vnode_t *vp;
1923         vnode_t *avp = NULL;
1924         rnode_t *drp;
1925 
1926         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1927                 return (EPERM);
1928 
1929         drp = VTOR(dvp);
1930 
1931         /*
1932          * Are we looking up extended attributes?  If so, "dvp" is
1933          * the file or directory for which we want attributes, and
1934          * we need a lookup of the hidden attribute directory
1935          * before we lookup the rest of the path.
1936          */
1937         if (flags & LOOKUP_XATTR) {
1938                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1939                 mntinfo_t *mi;
1940 
1941                 mi = VTOMI(dvp);
1942                 if (!(mi->mi_flags & MI_EXTATTR))
1943                         return (EINVAL);
1944 
1945                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1946                         return (EINTR);
1947 
1948                 (void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1949                 if (avp == NULL)
1950                         error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1951                 else
1952                         error = 0;
1953 
1954                 nfs_rw_exit(&drp->r_rwlock);
1955 
1956                 if (error) {
1957                         if (mi->mi_flags & MI_EXTATTR)
1958                                 return (error);
1959                         return (EINVAL);
1960                 }
1961                 dvp = avp;
1962                 drp = VTOR(dvp);
1963         }
1964 
1965         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1966                 error = EINTR;
1967                 goto out;
1968         }
1969 
1970         error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1971 
1972         nfs_rw_exit(&drp->r_rwlock);
1973 
1974         /*
1975          * If vnode is a device, create special vnode.
1976          */
1977         if (!error && IS_DEVVP(*vpp)) {
1978                 vp = *vpp;
1979                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1980                 VN_RELE(vp);
1981         }
1982 
1983 out:
1984         if (avp != NULL)
1985                 VN_RELE(avp);
1986 
1987         return (error);
1988 }
1989 
/*
 * When non-zero, nfs3lookup_otw() enters a negative (DNLC_NO_VNODE) entry
 * into the DNLC for a name the server reported as ENOENT.
 */
static int nfs3_lookup_neg_cache = 1;

#ifdef DEBUG
/*
 * DEBUG-only statistics maintained by nfs3lookup_dnlc():
 *
 *      hits            a vnode was found in the DNLC and returned
 *      misses          the name was not in the DNLC at all
 *      neg_hits        a negative entry was found; ENOENT was returned
 *      disappears      the entry was gone after the caches were validated
 *      lookups         total number of calls
 */
static int nfs3_lookup_dnlc_hits = 0;
static int nfs3_lookup_dnlc_misses = 0;
static int nfs3_lookup_dnlc_neg_hits = 0;
static int nfs3_lookup_dnlc_disappears = 0;
static int nfs3_lookup_dnlc_lookups = 0;
#endif
1999 
2000 /* ARGSUSED */
2001 int
2002 nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
2003         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
2004 {
2005         int error;
2006         rnode_t *drp;
2007 
2008         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2009         /*
2010          * If lookup is for "", just return dvp.  Don't need
2011          * to send it over the wire, look it up in the dnlc,
2012          * or perform any access checks.
2013          */
2014         if (*nm == '\0') {
2015                 VN_HOLD(dvp);
2016                 *vpp = dvp;
2017                 return (0);
2018         }
2019 
2020         /*
2021          * Can't do lookups in non-directories.
2022          */
2023         if (dvp->v_type != VDIR)
2024                 return (ENOTDIR);
2025 
2026         /*
2027          * If we're called with RFSCALL_SOFT, it's important that
2028          * the only rfscall is one we make directly; if we permit
2029          * an access call because we're looking up "." or validating
2030          * a dnlc hit, we'll deadlock because that rfscall will not
2031          * have the RFSCALL_SOFT set.
2032          */
2033         if (rfscall_flags & RFSCALL_SOFT)
2034                 goto callit;
2035 
2036         /*
2037          * If lookup is for ".", just return dvp.  Don't need
2038          * to send it over the wire or look it up in the dnlc,
2039          * just need to check access.
2040          */
2041         if (strcmp(nm, ".") == 0) {
2042                 error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2043                 if (error)
2044                         return (error);
2045                 VN_HOLD(dvp);
2046                 *vpp = dvp;
2047                 return (0);
2048         }
2049 
2050         drp = VTOR(dvp);
2051         if (!(drp->r_flags & RLOOKUP)) {
2052                 mutex_enter(&drp->r_statelock);
2053                 drp->r_flags |= RLOOKUP;
2054                 mutex_exit(&drp->r_statelock);
2055         }
2056 
2057         /*
2058          * Lookup this name in the DNLC.  If there was a valid entry,
2059          * then return the results of the lookup.
2060          */
2061         error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2062         if (error || *vpp != NULL)
2063                 return (error);
2064 
2065 callit:
2066         error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2067 
2068         return (error);
2069 }
2070 
2071 static int
2072 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2073 {
2074         int error;
2075         vnode_t *vp;
2076 
2077         ASSERT(*nm != '\0');
2078         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2079         /*
2080          * Lookup this name in the DNLC.  If successful, then validate
2081          * the caches and then recheck the DNLC.  The DNLC is rechecked
2082          * just in case this entry got invalidated during the call
2083          * to nfs3_validate_caches.
2084          *
2085          * An assumption is being made that it is safe to say that a
2086          * file exists which may not on the server.  Any operations to
2087          * the server will fail with ESTALE.
2088          */
2089 #ifdef DEBUG
2090         nfs3_lookup_dnlc_lookups++;
2091 #endif
2092         vp = dnlc_lookup(dvp, nm);
2093         if (vp != NULL) {
2094                 VN_RELE(vp);
2095                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2096                         PURGE_ATTRCACHE(dvp);
2097                 }
2098                 error = nfs3_validate_caches(dvp, cr);
2099                 if (error)
2100                         return (error);
2101                 vp = dnlc_lookup(dvp, nm);
2102                 if (vp != NULL) {
2103                         error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2104                         if (error) {
2105                                 VN_RELE(vp);
2106                                 return (error);
2107                         }
2108                         if (vp == DNLC_NO_VNODE) {
2109                                 VN_RELE(vp);
2110 #ifdef DEBUG
2111                                 nfs3_lookup_dnlc_neg_hits++;
2112 #endif
2113                                 return (ENOENT);
2114                         }
2115                         *vpp = vp;
2116 #ifdef DEBUG
2117                         nfs3_lookup_dnlc_hits++;
2118 #endif
2119                         return (0);
2120                 }
2121 #ifdef DEBUG
2122                 nfs3_lookup_dnlc_disappears++;
2123 #endif
2124         }
2125 #ifdef DEBUG
2126         else
2127                 nfs3_lookup_dnlc_misses++;
2128 #endif
2129 
2130         *vpp = NULL;
2131 
2132         return (0);
2133 }
2134 
2135 static int
2136 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2137         int rfscall_flags)
2138 {
2139         int error;
2140         LOOKUP3args args;
2141         LOOKUP3vres res;
2142         int douprintf;
2143         struct vattr vattr;
2144         struct vattr dvattr;
2145         vnode_t *vp;
2146         failinfo_t fi;
2147         hrtime_t t;
2148 
2149         ASSERT(*nm != '\0');
2150         ASSERT(dvp->v_type == VDIR);
2151         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2152 
2153         setdiropargs3(&args.what, nm, dvp);
2154 
2155         fi.vp = dvp;
2156         fi.fhp = (caddr_t)&args.what.dir;
2157         fi.copyproc = nfs3copyfh;
2158         fi.lookupproc = nfs3lookup;
2159         fi.xattrdirproc = acl_getxattrdir3;
2160         res.obj_attributes.fres.vp = dvp;
2161         res.obj_attributes.fres.vap = &vattr;
2162         res.dir_attributes.fres.vp = dvp;
2163         res.dir_attributes.fres.vap = &dvattr;
2164 
2165         douprintf = 1;
2166 
2167         t = gethrtime();
2168 
2169         error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2170             xdr_diropargs3, (caddr_t)&args,
2171             xdr_LOOKUP3vres, (caddr_t)&res, cr,
2172             &douprintf, &res.status, rfscall_flags, &fi);
2173 
2174         if (error)
2175                 return (error);
2176 
2177         nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2178 
2179         error = geterrno3(res.status);
2180         if (error) {
2181                 PURGE_STALE_FH(error, dvp, cr);
2182                 if (error == ENOENT && nfs3_lookup_neg_cache)
2183                         dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2184                 return (error);
2185         }
2186 
2187         if (res.obj_attributes.attributes) {
2188                 vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2189                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2190         } else {
2191                 vp = makenfs3node_va(&res.object, NULL,
2192                     dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2193                 if (vp->v_type == VNON) {
2194                         vattr.va_mask = AT_TYPE;
2195                         error = nfs3getattr(vp, &vattr, cr);
2196                         if (error) {
2197                                 VN_RELE(vp);
2198                                 return (error);
2199                         }
2200                         vp->v_type = vattr.va_type;
2201                 }
2202         }
2203 
2204         if (!(rfscall_flags & RFSCALL_SOFT))
2205                 dnlc_update(dvp, nm, vp);
2206 
2207         *vpp = vp;
2208 
2209         return (error);
2210 }
2211 
#ifdef DEBUG
/* Statistic that exists only in DEBUG builds. */
static int nfs3_create_misses = 0;
#endif
2215 
2216 /* ARGSUSED */
2217 static int
2218 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2219         int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
2220         vsecattr_t *vsecp)
2221 {
2222         int error;
2223         vnode_t *vp;
2224         rnode_t *rp;
2225         struct vattr vattr;
2226         rnode_t *drp;
2227         vnode_t *tempvp;
2228 
2229         drp = VTOR(dvp);
2230         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2231                 return (EPERM);
2232         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2233                 return (EINTR);
2234 
2235 top:
2236         /*
2237          * We make a copy of the attributes because the caller does not
2238          * expect us to change what va points to.
2239          */
2240         vattr = *va;
2241 
2242         /*
2243          * If the pathname is "", just use dvp.  Don't need
2244          * to send it over the wire, look it up in the dnlc,
2245          * or perform any access checks.
2246          */
2247         if (*nm == '\0') {
2248                 error = 0;
2249                 VN_HOLD(dvp);
2250                 vp = dvp;
2251         /*
2252          * If the pathname is ".", just use dvp.  Don't need
2253          * to send it over the wire or look it up in the dnlc,
2254          * just need to check access.
2255          */
2256         } else if (strcmp(nm, ".") == 0) {
2257                 error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2258                 if (error) {
2259                         nfs_rw_exit(&drp->r_rwlock);
2260                         return (error);
2261                 }
2262                 VN_HOLD(dvp);
2263                 vp = dvp;
2264         /*
2265          * We need to go over the wire, just to be sure whether the
2266          * file exists or not.  Using the DNLC can be dangerous in
2267          * this case when making a decision regarding existence.
2268          */
2269         } else {
2270                 error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2271         }
2272         if (!error) {
2273                 if (exclusive == EXCL)
2274                         error = EEXIST;
2275                 else if (vp->v_type == VDIR && (mode & VWRITE))
2276                         error = EISDIR;
2277                 else {
2278                         /*
2279                          * If vnode is a device, create special vnode.
2280                          */
2281                         if (IS_DEVVP(vp)) {
2282                                 tempvp = vp;
2283                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2284                                 VN_RELE(tempvp);
2285                         }
2286                         if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2287                                 if ((vattr.va_mask & AT_SIZE) &&
2288                                     vp->v_type == VREG) {
2289                                         rp = VTOR(vp);
2290                                         /*
2291                                          * Check here for large file handled
2292                                          * by LF-unaware process (as
2293                                          * ufs_create() does)
2294                                          */
2295                                         if (!(lfaware & FOFFMAX)) {
2296                                                 mutex_enter(&rp->r_statelock);
2297                                                 if (rp->r_size > MAXOFF32_T)
2298                                                         error = EOVERFLOW;
2299                                                 mutex_exit(&rp->r_statelock);
2300                                         }
2301                                         if (!error) {
2302                                                 vattr.va_mask = AT_SIZE;
2303                                                 error = nfs3setattr(vp,
2304                                                     &vattr, 0, cr);
2305 
2306                                                 /*
2307                                                  * Existing file was truncated;
2308                                                  * emit a create event.
2309                                                  */
2310                                                 vnevent_create(vp, ct);
2311                                         }
2312                                 }
2313                         }
2314                 }
2315                 nfs_rw_exit(&drp->r_rwlock);
2316                 if (error) {
2317                         VN_RELE(vp);
2318                 } else {
2319                         *vpp = vp;
2320                 }
2321 
2322                 return (error);
2323         }
2324 
2325         dnlc_remove(dvp, nm);
2326 
2327         /*
2328          * Decide what the group-id of the created file should be.
2329          * Set it in attribute list as advisory...
2330          */
2331         error = setdirgid(dvp, &vattr.va_gid, cr);
2332         if (error) {
2333                 nfs_rw_exit(&drp->r_rwlock);
2334                 return (error);
2335         }
2336         vattr.va_mask |= AT_GID;
2337 
2338         ASSERT(vattr.va_mask & AT_TYPE);
2339         if (vattr.va_type == VREG) {
2340                 ASSERT(vattr.va_mask & AT_MODE);
2341                 if (MANDMODE(vattr.va_mode)) {
2342                         nfs_rw_exit(&drp->r_rwlock);
2343                         return (EACCES);
2344                 }
2345                 error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr,
2346                     lfaware);
2347                 /*
2348                  * If this is not an exclusive create, then the CREATE
2349                  * request will be made with the GUARDED mode set.  This
2350                  * means that the server will return EEXIST if the file
2351                  * exists.  The file could exist because of a retransmitted
2352                  * request.  In this case, we recover by starting over and
2353                  * checking to see whether the file exists.  This second
2354                  * time through it should and a CREATE request will not be
2355                  * sent.
2356                  *
2357                  * This handles the problem of a dangling CREATE request
2358                  * which contains attributes which indicate that the file
2359                  * should be truncated.  This retransmitted request could
2360                  * possibly truncate valid data in the file if not caught
2361                  * by the duplicate request mechanism on the server or if
2362                  * not caught by other means.  The scenario is:
2363                  *
2364                  * Client transmits CREATE request with size = 0
2365                  * Client times out, retransmits request.
2366                  * Response to the first request arrives from the server
2367                  *  and the client proceeds on.
2368                  * Client writes data to the file.
2369                  * The server now processes retransmitted CREATE request
2370                  *  and truncates file.
2371                  *
2372                  * The use of the GUARDED CREATE request prevents this from
2373                  * happening because the retransmitted CREATE would fail
2374                  * with EEXIST and would not truncate the file.
2375                  */
2376                 if (error == EEXIST && exclusive == NONEXCL) {
2377 #ifdef DEBUG
2378                         nfs3_create_misses++;
2379 #endif
2380                         goto top;
2381                 }
2382                 nfs_rw_exit(&drp->r_rwlock);
2383                 return (error);
2384         }
2385         error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2386         nfs_rw_exit(&drp->r_rwlock);
2387         return (error);
2388 }
2389 
/*
 * Send an NFSv3 CREATE request for the name nm in directory dvp and, on
 * success, hand back the vnode of the new file through *vpp.
 *
 *   dvp        directory in which the file is created
 *   nm         name of the new file
 *   va         requested attributes; va->va_mask is modified by this
 *              function (AT_MTIME/AT_ATIME are or'ed in on exclusive
 *              create, AT_GID/AT_SIZE may be cleared, and the mask may be
 *              replaced by AT_GID), so callers should pass a scratch copy
 *   exclusive  EXCL selects an EXCLUSIVE create carrying a verifier;
 *              anything else selects a GUARDED create carrying the
 *              attributes converted from va
 *   mode       not referenced in this function
 *   vpp        out: resulting vnode (a specvp() vnode when IS_DEVVP())
 *   cr         credentials used for all over-the-wire calls
 *   lfaware    open flags; when FOFFMAX is clear and AT_SIZE was
 *              requested, a regular file larger than MAXOFF32_T yields
 *              EOVERFLOW
 *
 * Returns 0 on success.  Otherwise returns the error from
 * vattr_to_sattr3(), rfs3call(), geterrno3(res.status), nfs3lookup(),
 * nfs3getattr(), nfs3excl_create_settimes() or nfs3setattr(), or
 * EOVERFLOW.  *vpp is only written on success.
 */
/* ARGSUSED */
static int
nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
        int mode, vnode_t **vpp, cred_t *cr, int lfaware)
{
        int error;
        CREATE3args args;
        CREATE3res res;
        int douprintf;
        vnode_t *vp;
        struct vattr vattr;
        nfstime3 *verfp;
        rnode_t *rp;
        timestruc_t now;
        hrtime_t t;

        ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
        setdiropargs3(&args.where, nm, dvp);
        if (exclusive == EXCL) {
                args.how.mode = EXCLUSIVE;
                /*
                 * Construct the create verifier.  This verifier needs
                 * to be unique between different clients.  It also needs
                 * to vary for each exclusive create request generated
                 * from the client to the server.
                 *
                 * The first attempt is made to use the hostid and a
                 * unique number on the client.  If the hostid has not
                 * been set, the high resolution time that the exclusive
                 * create request is being made is used.  This will work
                 * unless two different clients, both with the hostid
                 * not set, attempt an exclusive create request on the
                 * same file, at exactly the same clock time.  The
                 * chances of this happening seem small enough to be
                 * reasonable.
                 */
                /* The verifier storage is filled in through an nfstime3 view */
                verfp = (nfstime3 *)&args.how.createhow3_u.verf;
                verfp->seconds = zone_get_hostid(NULL);
                if (verfp->seconds != 0)
                        verfp->nseconds = newnum();
                else {
                        gethrestime(&now);
                        verfp->seconds = now.tv_sec;
                        verfp->nseconds = now.tv_nsec;
                }
                /*
                 * Since the server will use this value for the mtime,
                 * make sure that it can't overflow. Zero out the MSB.
                 * The actual value does not matter here, only its uniqueness.
                 *
                 * NOTE(review): the statement below reduces the value
                 * modulo INT32_MAX rather than masking off the top bit.
                 */
                verfp->seconds %= INT32_MAX;
        } else {
                /*
                 * Issue the non-exclusive create in guarded mode.  This
                 * may result in some false EEXIST responses for
                 * retransmitted requests, but these will be handled at
                 * a higher level.  By using GUARDED, duplicate requests
                 * to do file truncation and possible access problems
                 * can be avoided.
                 */
                args.how.mode = GUARDED;
                error = vattr_to_sattr3(va,
                    &args.how.createhow3_u.obj_attributes);
                if (error) {
                        /* req time field(s) overflow - return immediately */
                        return (error);
                }
        }

        douprintf = 1;

        /* Timestamp taken before the call; passed to the caching routines */
        t = gethrtime();

        error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
            xdr_CREATE3args, (caddr_t)&args,
            xdr_CREATE3res, (caddr_t)&res, cr,
            &douprintf, &res.status, 0, NULL);

        if (error) {
                /* The call itself failed; drop dvp's cached attributes */
                PURGE_ATTRCACHE(dvp);
                return (error);
        }

        /* The call completed; translate the NFS status in the reply */
        error = geterrno3(res.status);
        if (!error) {
                nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
                if (HAVE_RDDIR_CACHE(VTOR(dvp)))
                        nfs_purge_rddir_cache(dvp);

                /*
                 * On exclusive create the times need to be explicitly
                 * set to clear any potential verifier that may be stored
                 * in one of these fields (see comment below).  This
                 * is done here to cover the case where no post op attrs
                 * were returned or an 'invalid' time was returned in
                 * the attributes.
                 */
                if (exclusive == EXCL)
                        va->va_mask |= (AT_MTIME | AT_ATIME);

                if (!res.resok.obj.handle_follows) {
                        /*
                         * The reply carried no file handle, so look the
                         * new name up to obtain the vnode.
                         */
                        error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
                        if (error)
                                return (error);
                } else {
                        if (res.resok.obj_attributes.attributes) {
                                vp = makenfs3node(&res.resok.obj.handle,
                                    &res.resok.obj_attributes.attr,
                                    dvp->v_vfsp, t, cr, NULL, NULL);
                        } else {
                                vp = makenfs3node(&res.resok.obj.handle, NULL,
                                    dvp->v_vfsp, t, cr, NULL, NULL);

                                /*
                                 * On an exclusive create, it is possible
                                 * that attributes were returned but those
                                 * postop attributes failed to decode
                                 * properly.  If this is the case,
                                 * then most likely the atime or mtime
                                 * were invalid for our client; this
                                 * is caused by the server storing the
                                 * create verifier in one of the time
                                 * fields (most likely mtime).
                                 * So... we are going to setattr just the
                                 * atime/mtime to clear things up.
                                 */
                                if (exclusive == EXCL) {
                                        if (error =
                                            nfs3excl_create_settimes(vp,
                                            va, cr)) {
                                                /*
                                                 * Setting the times failed.
                                                 * Remove the file and return
                                                 * the error.
                                                 *
                                                 * NOTE(review): nfs3_remove()
                                                 * takes dvp's r_rwlock as
                                                 * writer; presumably the lock
                                                 * tolerates re-entry by a
                                                 * caller that already holds
                                                 * it - confirm.
                                                 */
                                                VN_RELE(vp);
                                                (void) nfs3_remove(dvp,
                                                    nm, cr, NULL, 0);
                                                return (error);
                                        }
                                }

                                /*
                                 * This handles the non-exclusive case
                                 * and the exclusive case where no post op
                                 * attrs were returned.
                                 */
                                if (vp->v_type == VNON) {
                                        vattr.va_mask = AT_TYPE;
                                        error = nfs3getattr(vp, &vattr, cr);
                                        if (error) {
                                                VN_RELE(vp);
                                                return (error);
                                        }
                                        vp->v_type = vattr.va_type;
                                }
                        }
                        /* Enter the new name -> vnode mapping in the DNLC */
                        dnlc_update(dvp, nm, vp);
                }

                rp = VTOR(vp);

                /*
                 * Check here for large file handled by
                 * LF-unaware process (as ufs_create() does)
                 */
                if ((va->va_mask & AT_SIZE) && vp->v_type == VREG &&
                    !(lfaware & FOFFMAX)) {
                        mutex_enter(&rp->r_statelock);
                        if (rp->r_size > MAXOFF32_T) {
                                mutex_exit(&rp->r_statelock);
                                VN_RELE(vp);
                                return (EOVERFLOW);
                        }
                        mutex_exit(&rp->r_statelock);
                }

                if (exclusive == EXCL &&
                    (va->va_mask & ~(AT_GID | AT_SIZE))) {
                        /*
                         * If doing an exclusive create, then generate
                         * a SETATTR to set the initial attributes.
                         * Try to set the mtime and the atime to the
                         * server's current time.  It is somewhat
                         * expected that these fields will be used to
                         * store the exclusive create cookie.  If not,
                         * server implementors will need to know that
                         * a SETATTR will follow an exclusive create
                         * and the cookie should be destroyed if
                         * appropriate. This work may have been done
                         * earlier in this function if post op attrs
                         * were not available.
                         *
                         * The AT_GID and AT_SIZE bits are turned off
                         * so that the SETATTR request will not attempt
                         * to process these.  The gid will be set
                         * separately if appropriate.  The size is turned
                         * off because it is assumed that a new file will
                         * be created empty and if the file wasn't empty,
                         * then the exclusive create will have failed
                         * because the file must have existed already.
                         * Therefore, no truncate operation is needed.
                         */
                        va->va_mask &= ~(AT_GID | AT_SIZE);
                        error = nfs3setattr(vp, va, 0, cr);
                        if (error) {
                                /*
                                 * Couldn't correct the attributes of
                                 * the newly created file and the
                                 * attributes are wrong.  Remove the
                                 * file and return an error to the
                                 * application.
                                 */
                                VN_RELE(vp);
                                (void) nfs3_remove(dvp, nm, cr, NULL, 0);
                                return (error);
                        }
                }

                if (va->va_gid != rp->r_attr.va_gid) {
                        /*
                         * If the gid on the file isn't right, then
                         * generate a SETATTR to attempt to change
                         * it.  This may or may not work, depending
                         * upon the server's semantics for allowing
                         * file ownership changes.
                         *
                         * The result is deliberately ignored (best effort).
                         */
                        va->va_mask = AT_GID;
                        (void) nfs3setattr(vp, va, 0, cr);
                }

                /*
                 * If vnode is a device create special vnode
                 */
                if (IS_DEVVP(vp)) {
                        *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
                        VN_RELE(vp);
                } else
                        *vpp = vp;
        } else {
                /* Server rejected the create; still cache the directory wcc */
                nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
                PURGE_STALE_FH(error, dvp, cr);
        }

        return (error);
}
2636 
2637 /*
2638  * Special setattr function to take care of rest of atime/mtime
2639  * after successful exclusive create.  This function exists to avoid
2640  * handling attributes from the server; exclusive the atime/mtime fields
2641  * may be 'invalid' in client's view and therefore can not be trusted.
2642  */
2643 static int
2644 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2645 {
2646         int error;
2647         uint_t mask;
2648         SETATTR3args args;
2649         SETATTR3res res;
2650         int douprintf;
2651         rnode_t *rp;
2652         hrtime_t t;
2653 
2654         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2655         /* save the caller's mask so that it can be reset later */
2656         mask = vap->va_mask;
2657 
2658         rp = VTOR(vp);
2659 
2660         args.object = *RTOFH3(rp);
2661         args.guard.check = FALSE;
2662 
2663         /* Use the mask to initialize the arguments */
2664         vap->va_mask = 0;
2665         error = vattr_to_sattr3(vap, &args.new_attributes);
2666 
2667         /* We want to set just atime/mtime on this request */
2668         args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2669         args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2670 
2671         douprintf = 1;
2672 
2673         t = gethrtime();
2674 
2675         error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2676             xdr_SETATTR3args, (caddr_t)&args,
2677             xdr_SETATTR3res, (caddr_t)&res, cr,
2678             &douprintf, &res.status, 0, NULL);
2679 
2680         if (error) {
2681                 vap->va_mask = mask;
2682                 return (error);
2683         }
2684 
2685         error = geterrno3(res.status);
2686         if (!error) {
2687                 /*
2688                  * It is important to pick up the attributes.
2689                  * Since this is the exclusive create path, the
2690                  * attributes on the initial create were ignored
2691                  * and we need these to have the correct info.
2692                  */
2693                 nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2694                 /*
2695                  * No need to do the atime/mtime work again so clear
2696                  * the bits.
2697                  */
2698                 mask &= ~(AT_ATIME | AT_MTIME);
2699         } else {
2700                 nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2701         }
2702 
2703         vap->va_mask = mask;
2704 
2705         return (error);
2706 }
2707 
2708 /* ARGSUSED */
2709 static int
2710 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2711         int mode, vnode_t **vpp, cred_t *cr)
2712 {
2713         int error;
2714         MKNOD3args args;
2715         MKNOD3res res;
2716         int douprintf;
2717         vnode_t *vp;
2718         struct vattr vattr;
2719         hrtime_t t;
2720 
2721         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2722         switch (va->va_type) {
2723         case VCHR:
2724         case VBLK:
2725                 setdiropargs3(&args.where, nm, dvp);
2726                 args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2727                 error = vattr_to_sattr3(va,
2728                     &args.what.mknoddata3_u.device.dev_attributes);
2729                 if (error) {
2730                         /* req time field(s) overflow - return immediately */
2731                         return (error);
2732                 }
2733                 args.what.mknoddata3_u.device.spec.specdata1 =
2734                     getmajor(va->va_rdev);
2735                 args.what.mknoddata3_u.device.spec.specdata2 =
2736                     getminor(va->va_rdev);
2737                 break;
2738 
2739         case VFIFO:
2740         case VSOCK:
2741                 setdiropargs3(&args.where, nm, dvp);
2742                 args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2743                 error = vattr_to_sattr3(va,
2744                     &args.what.mknoddata3_u.pipe_attributes);
2745                 if (error) {
2746                         /* req time field(s) overflow - return immediately */
2747                         return (error);
2748                 }
2749                 break;
2750 
2751         default:
2752                 return (EINVAL);
2753         }
2754 
2755         douprintf = 1;
2756 
2757         t = gethrtime();
2758 
2759         error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2760             xdr_MKNOD3args, (caddr_t)&args,
2761             xdr_MKNOD3res, (caddr_t)&res, cr,
2762             &douprintf, &res.status, 0, NULL);
2763 
2764         if (error) {
2765                 PURGE_ATTRCACHE(dvp);
2766                 return (error);
2767         }
2768 
2769         error = geterrno3(res.status);
2770         if (!error) {
2771                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2772                 if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2773                         nfs_purge_rddir_cache(dvp);
2774 
2775                 if (!res.resok.obj.handle_follows) {
2776                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2777                         if (error)
2778                                 return (error);
2779                 } else {
2780                         if (res.resok.obj_attributes.attributes) {
2781                                 vp = makenfs3node(&res.resok.obj.handle,
2782                                     &res.resok.obj_attributes.attr,
2783                                     dvp->v_vfsp, t, cr, NULL, NULL);
2784                         } else {
2785                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
2786                                     dvp->v_vfsp, t, cr, NULL, NULL);
2787                                 if (vp->v_type == VNON) {
2788                                         vattr.va_mask = AT_TYPE;
2789                                         error = nfs3getattr(vp, &vattr, cr);
2790                                         if (error) {
2791                                                 VN_RELE(vp);
2792                                                 return (error);
2793                                         }
2794                                         vp->v_type = vattr.va_type;
2795                                 }
2796 
2797                         }
2798                         dnlc_update(dvp, nm, vp);
2799                 }
2800 
2801                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2802                         va->va_mask = AT_GID;
2803                         (void) nfs3setattr(vp, va, 0, cr);
2804                 }
2805 
2806                 /*
2807                  * If vnode is a device create special vnode
2808                  */
2809                 if (IS_DEVVP(vp)) {
2810                         *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2811                         VN_RELE(vp);
2812                 } else
2813                         *vpp = vp;
2814         } else {
2815                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2816                 PURGE_STALE_FH(error, dvp, cr);
2817         }
2818         return (error);
2819 }
2820 
2821 /*
2822  * Weirdness: if the vnode to be removed is open
2823  * we rename it instead of removing it and nfs_inactive
2824  * will remove the new name.
2825  */
2826 /* ARGSUSED */
2827 static int
2828 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2829 {
2830         int error;
2831         REMOVE3args args;
2832         REMOVE3res res;
2833         vnode_t *vp;
2834         char *tmpname;
2835         int douprintf;
2836         rnode_t *rp;
2837         rnode_t *drp;
2838         hrtime_t t;
2839 
2840         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2841                 return (EPERM);
2842         drp = VTOR(dvp);
2843         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2844                 return (EINTR);
2845 
2846         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2847         if (error) {
2848                 nfs_rw_exit(&drp->r_rwlock);
2849                 return (error);
2850         }
2851 
2852         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2853                 VN_RELE(vp);
2854                 nfs_rw_exit(&drp->r_rwlock);
2855                 return (EPERM);
2856         }
2857 
2858         /*
2859          * First just remove the entry from the name cache, as it
2860          * is most likely the only entry for this vp.
2861          */
2862         dnlc_remove(dvp, nm);
2863 
2864         /*
2865          * If the file has a v_count > 1 then there may be more than one
2866          * entry in the name cache due multiple links or an open file,
2867          * but we don't have the real reference count so flush all
2868          * possible entries.
2869          */
2870         if (vp->v_count > 1)
2871                 dnlc_purge_vp(vp);
2872 
2873         /*
2874          * Now we have the real reference count on the vnode
2875          */
2876         rp = VTOR(vp);
2877         mutex_enter(&rp->r_statelock);
2878         if (vp->v_count > 1 &&
2879             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2880                 mutex_exit(&rp->r_statelock);
2881                 tmpname = newname();
2882                 error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2883                 if (error)
2884                         kmem_free(tmpname, MAXNAMELEN);
2885                 else {
2886                         mutex_enter(&rp->r_statelock);
2887                         if (rp->r_unldvp == NULL) {
2888                                 VN_HOLD(dvp);
2889                                 rp->r_unldvp = dvp;
2890                                 if (rp->r_unlcred != NULL)
2891                                         crfree(rp->r_unlcred);
2892                                 crhold(cr);
2893                                 rp->r_unlcred = cr;
2894                                 rp->r_unlname = tmpname;
2895                         } else {
2896                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2897                                 rp->r_unlname = tmpname;
2898                         }
2899                         mutex_exit(&rp->r_statelock);
2900                 }
2901         } else {
2902                 mutex_exit(&rp->r_statelock);
2903                 /*
2904                  * We need to flush any dirty pages which happen to
2905                  * be hanging around before removing the file.  This
2906                  * shouldn't happen very often and mostly on file
2907                  * systems mounted "nocto".
2908                  */
2909                 if (vn_has_cached_data(vp) &&
2910                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2911                         error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2912                         if (error && (error == ENOSPC || error == EDQUOT)) {
2913                                 mutex_enter(&rp->r_statelock);
2914                                 if (!rp->r_error)
2915                                         rp->r_error = error;
2916                                 mutex_exit(&rp->r_statelock);
2917                         }
2918                 }
2919 
2920                 setdiropargs3(&args.object, nm, dvp);
2921 
2922                 douprintf = 1;
2923 
2924                 t = gethrtime();
2925 
2926                 error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2927                     xdr_diropargs3, (caddr_t)&args,
2928                     xdr_REMOVE3res, (caddr_t)&res, cr,
2929                     &douprintf, &res.status, 0, NULL);
2930 
2931                 /*
2932                  * The xattr dir may be gone after last attr is removed,
2933                  * so flush it from dnlc.
2934                  */
2935                 if (dvp->v_flag & V_XATTRDIR)
2936                         dnlc_purge_vp(dvp);
2937 
2938                 PURGE_ATTRCACHE(vp);
2939 
2940                 if (error) {
2941                         PURGE_ATTRCACHE(dvp);
2942                 } else {
2943                         error = geterrno3(res.status);
2944                         if (!error) {
2945                                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2946                                     cr);
2947                                 if (HAVE_RDDIR_CACHE(drp))
2948                                         nfs_purge_rddir_cache(dvp);
2949                         } else {
2950                                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2951                                     t, cr);
2952                                 PURGE_STALE_FH(error, dvp, cr);
2953                         }
2954                 }
2955         }
2956 
2957         if (error == 0) {
2958                 vnevent_remove(vp, dvp, nm, ct);
2959         }
2960         VN_RELE(vp);
2961 
2962         nfs_rw_exit(&drp->r_rwlock);
2963 
2964         return (error);
2965 }
2966 
2967 /* ARGSUSED */
2968 static int
2969 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2970         caller_context_t *ct, int flags)
2971 {
2972         int error;
2973         LINK3args args;
2974         LINK3res res;
2975         vnode_t *realvp;
2976         int douprintf;
2977         mntinfo_t *mi;
2978         rnode_t *tdrp;
2979         hrtime_t t;
2980 
2981         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2982                 return (EPERM);
2983         if (VOP_REALVP(svp, &realvp, ct) == 0)
2984                 svp = realvp;
2985 
2986         mi = VTOMI(svp);
2987 
2988         if (!(mi->mi_flags & MI_LINK))
2989                 return (EOPNOTSUPP);
2990 
2991         args.file = *VTOFH3(svp);
2992         setdiropargs3(&args.link, tnm, tdvp);
2993 
2994         tdrp = VTOR(tdvp);
2995         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2996                 return (EINTR);
2997 
2998         dnlc_remove(tdvp, tnm);
2999 
3000         douprintf = 1;
3001 
3002         t = gethrtime();
3003 
3004         error = rfs3call(mi, NFSPROC3_LINK,
3005             xdr_LINK3args, (caddr_t)&args,
3006             xdr_LINK3res, (caddr_t)&res, cr,
3007             &douprintf, &res.status, 0, NULL);
3008 
3009         if (error) {
3010                 PURGE_ATTRCACHE(tdvp);
3011                 PURGE_ATTRCACHE(svp);
3012                 nfs_rw_exit(&tdrp->r_rwlock);
3013                 return (error);
3014         }
3015 
3016         error = geterrno3(res.status);
3017 
3018         if (!error) {
3019                 nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
3020                 nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
3021                 if (HAVE_RDDIR_CACHE(tdrp))
3022                         nfs_purge_rddir_cache(tdvp);
3023                 dnlc_update(tdvp, tnm, svp);
3024         } else {
3025                 nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
3026                     cr);
3027                 nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
3028                 if (error == EOPNOTSUPP) {
3029                         mutex_enter(&mi->mi_lock);
3030                         mi->mi_flags &= ~MI_LINK;
3031                         mutex_exit(&mi->mi_lock);
3032                 }
3033         }
3034 
3035         nfs_rw_exit(&tdrp->r_rwlock);
3036 
3037         if (!error) {
3038                 /*
3039                  * Notify the source file of this link operation.
3040                  */
3041                 vnevent_link(svp, ct);
3042         }
3043         return (error);
3044 }
3045 
3046 /* ARGSUSED */
3047 static int
3048 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3049         caller_context_t *ct, int flags)
3050 {
3051         vnode_t *realvp;
3052 
3053         if (nfs_zone() != VTOMI(odvp)->mi_zone)
3054                 return (EPERM);
3055         if (VOP_REALVP(ndvp, &realvp, ct) == 0)
3056                 ndvp = realvp;
3057 
3058         return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3059 }
3060 
/*
 * nfs3rename does the real work of renaming in NFS Version 3.
 *
 * Renames the entry onm in directory odvp to nnm in directory ndvp.
 *
 * Both directories' r_rwlock are held as writer for the duration of the
 * call.  If the target name already exists, is not a directory and has
 * v_count > 1 after the name cache has been purged, it is first linked
 * (or, if LINK is unsupported, renamed) to a temporary name and that
 * name is recorded in the target's rnode (r_unldvp, r_unlname,
 * r_unlcred).
 *
 * Returns 0 on success, otherwise an errno:
 *      EINVAL   - onm or nnm is "." or ".."
 *      EINTR    - nfs_rw_enter_sig() failed on either directory lock
 *      EBUSY    - the existing target is a mount point
 *      ENOTDIR  - source is a directory but the active target is not
 *      EEXIST   - the server reported ENOTEMPTY for the target
 *      other    - error from lookup, link, the RPC, or the reply status
 */
static int
nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
        int error;
        RENAME3args args;
        RENAME3res res;
        int douprintf;
        vnode_t *nvp = NULL;
        vnode_t *ovp = NULL;
        char *tmpname;
        rnode_t *rp;
        rnode_t *odrp;
        rnode_t *ndrp;
        hrtime_t t;

        ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);

        if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
            strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
                return (EINVAL);

        /*
         * Lock both directories as writer, taking the lower-addressed
         * rnode first (presumably so that concurrent renames between the
         * same two directories always lock in the same order).
         *
         * NOTE(review): when odvp == ndvp the else branch enters the same
         * lock twice, and every exit path below releases it twice; this
         * appears to rely on nfs_rw_enter_sig() allowing re-entry by the
         * current writer - confirm against its implementation.
         */
        odrp = VTOR(odvp);
        ndrp = VTOR(ndvp);
        if ((intptr_t)odrp < (intptr_t)ndrp) {
                if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
                        return (EINTR);
                if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
                        nfs_rw_exit(&odrp->r_rwlock);
                        return (EINTR);
                }
        } else {
                if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
                        return (EINTR);
                if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
                        nfs_rw_exit(&ndrp->r_rwlock);
                        return (EINTR);
                }
        }

        /*
         * Lookup the target file.  If it exists, it needs to be
         * checked to see whether it is a mount point and whether
         * it is active (open).  A failed lookup is not an error here;
         * it simply leaves nvp NULL and the rename proceeds.
         */
        error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
        if (!error) {
                /*
                 * If this file has been mounted on, then just
                 * return busy because renaming to it would remove
                 * the mounted file system from the name space.
                 */
                if (vn_mountedvfs(nvp) != NULL) {
                        VN_RELE(nvp);
                        nfs_rw_exit(&odrp->r_rwlock);
                        nfs_rw_exit(&ndrp->r_rwlock);
                        return (EBUSY);
                }

                /*
                 * Purge the name cache of all references to this vnode
                 * so that we can check the reference count to infer
                 * whether it is active or not.
                 */
                /*
                 * First just remove the entry from the name cache, as it
                 * is most likely the only entry for this vp.
                 */
                dnlc_remove(ndvp, nnm);
                /*
                 * If the file has a v_count > 1 then there may be more
                 * than one entry in the name cache due to multiple links
                 * or an open file, but we don't have the real reference
                 * count so flush all possible entries.
                 */
                if (nvp->v_count > 1)
                        dnlc_purge_vp(nvp);

                /*
                 * If the vnode is active and is not a directory,
                 * arrange to rename it to a
                 * temporary file so that it will continue to be
                 * accessible.  This implements the "unlink-open-file"
                 * semantics for the target of a rename operation.
                 * Before doing this though, make sure that the
                 * source and target files are not already the same.
                 *
                 * v_count is deliberately re-read here: the purge above
                 * may have dropped the extra references.
                 */
                if (nvp->v_count > 1 && nvp->v_type != VDIR) {
                        /*
                         * Lookup the source name.
                         */
                        error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
                            cr, 0);

                        /*
                         * The source name *should* already exist.
                         */
                        if (error) {
                                VN_RELE(nvp);
                                nfs_rw_exit(&odrp->r_rwlock);
                                nfs_rw_exit(&ndrp->r_rwlock);
                                return (error);
                        }

                        /*
                         * Compare the two vnodes.  If they are the same,
                         * just release all held vnodes and return success.
                         */
                        if (ovp == nvp) {
                                VN_RELE(ovp);
                                VN_RELE(nvp);
                                nfs_rw_exit(&odrp->r_rwlock);
                                nfs_rw_exit(&ndrp->r_rwlock);
                                return (0);
                        }

                        /*
                         * Can't mix and match directories and non-
                         * directories in rename operations.  We already
                         * know that the target is not a directory.  If
                         * the source is a directory, return an error.
                         */
                        if (ovp->v_type == VDIR) {
                                VN_RELE(ovp);
                                VN_RELE(nvp);
                                nfs_rw_exit(&odrp->r_rwlock);
                                nfs_rw_exit(&ndrp->r_rwlock);
                                return (ENOTDIR);
                        }

                        /*
                         * The target file exists, is not the same as
                         * the source file, and is active.  Link it
                         * to a temporary filename to avoid having
                         * the server removing the file completely.
                         * If the mount does not support LINK, fall back
                         * to renaming the target itself to the temporary
                         * name within the same directory.
                         */
                        tmpname = newname();
                        error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
                        if (error == EOPNOTSUPP) {
                                error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
                                    cr, NULL, 0);
                        }
                        if (error) {
                                /* tmpname was never published; free it. */
                                kmem_free(tmpname, MAXNAMELEN);
                                VN_RELE(ovp);
                                VN_RELE(nvp);
                                nfs_rw_exit(&odrp->r_rwlock);
                                nfs_rw_exit(&ndrp->r_rwlock);
                                return (error);
                        }
                        /*
                         * Record the temporary name on the target's rnode.
                         * From here on r_unlname owns the tmpname buffer.
                         * If no unlinked-name state exists yet, also take
                         * a hold on the directory and on the credential;
                         * otherwise only the name is replaced and the
                         * previously recorded directory and credential
                         * are left as they were.
                         */
                        rp = VTOR(nvp);
                        mutex_enter(&rp->r_statelock);
                        if (rp->r_unldvp == NULL) {
                                VN_HOLD(ndvp);
                                rp->r_unldvp = ndvp;
                                if (rp->r_unlcred != NULL)
                                        crfree(rp->r_unlcred);
                                crhold(cr);
                                rp->r_unlcred = cr;
                                rp->r_unlname = tmpname;
                        } else {
                                kmem_free(rp->r_unlname, MAXNAMELEN);
                                rp->r_unlname = tmpname;
                        }
                        mutex_exit(&rp->r_statelock);
                }
        }

        if (ovp == NULL) {
                /*
                 * When renaming directories to be a subdirectory of a
                 * different parent, the dnlc entry for ".." will no
                 * longer be valid, so it must be removed.
                 *
                 * We do a lookup here to determine whether we are renaming
                 * a directory and we need to check if we are renaming
                 * an unlinked file.  This might have already been done
                 * in previous code, so we check ovp == NULL to avoid
                 * doing it twice.
                 */

                error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
                /*
                 * The source name *should* already exist.
                 */
                if (error) {
                        nfs_rw_exit(&odrp->r_rwlock);
                        nfs_rw_exit(&ndrp->r_rwlock);
                        if (nvp) {
                                VN_RELE(nvp);
                        }
                        return (error);
                }
                ASSERT(ovp != NULL);
        }

        /*
         * Drop the cached entries for both names before asking the
         * server to perform the rename.
         */
        dnlc_remove(odvp, onm);
        dnlc_remove(ndvp, nnm);

        setdiropargs3(&args.from, onm, odvp);
        setdiropargs3(&args.to, nnm, ndvp);

        douprintf = 1;

        /* Timestamp taken before the call and passed to the cache updates. */
        t = gethrtime();

        error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
            xdr_RENAME3args, (caddr_t)&args,
            xdr_RENAME3res, (caddr_t)&res, cr,
            &douprintf, &res.status, 0, NULL);

        if (error) {
                /*
                 * The call itself failed, so there is no reply to update
                 * the caches from; invalidate both directories' attributes.
                 */
                PURGE_ATTRCACHE(odvp);
                PURGE_ATTRCACHE(ndvp);
                VN_RELE(ovp);
                nfs_rw_exit(&odrp->r_rwlock);
                nfs_rw_exit(&ndrp->r_rwlock);
                if (nvp) {
                        VN_RELE(nvp);
                }
                return (error);
        }

        /* The call completed; translate the server's status to an errno. */
        error = geterrno3(res.status);

        if (!error) {
                nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
                if (HAVE_RDDIR_CACHE(odrp))
                        nfs_purge_rddir_cache(odvp);
                if (ndvp != odvp) {
                        nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
                        if (HAVE_RDDIR_CACHE(ndrp))
                                nfs_purge_rddir_cache(ndvp);
                }
                /*
                 * when renaming directories to be a subdirectory of a
                 * different parent, the dnlc entry for ".." will no
                 * longer be valid, so it must be removed
                 */
                rp = VTOR(ovp);
                if (ndvp != odvp) {
                        if (ovp->v_type == VDIR) {
                                dnlc_remove(ovp, "..");
                                if (HAVE_RDDIR_CACHE(rp))
                                        nfs_purge_rddir_cache(ovp);
                        }
                }

                /*
                 * If we are renaming the unlinked file, update the
                 * r_unldvp and r_unlname as needed.  The name is copied
                 * into the existing r_unlname buffer and explicitly
                 * terminated because strncpy() does not guarantee it.
                 */
                mutex_enter(&rp->r_statelock);
                if (rp->r_unldvp != NULL) {
                        if (strcmp(rp->r_unlname, onm) == 0) {
                                (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
                                rp->r_unlname[MAXNAMELEN - 1] = '\0';

                                if (ndvp != rp->r_unldvp) {
                                        VN_RELE(rp->r_unldvp);
                                        rp->r_unldvp = ndvp;
                                        VN_HOLD(ndvp);
                                }
                        }
                }
                mutex_exit(&rp->r_statelock);
        } else {
                nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
                if (ndvp != odvp) {
                        nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
                            cr);
                }
                /*
                 * System V defines rename to return EEXIST, not
                 * ENOTEMPTY if the target directory is not empty.
                 * Over the wire, the error is NFSERR_ENOTEMPTY
                 * which geterrno maps to ENOTEMPTY.
                 */
                if (error == ENOTEMPTY)
                        error = EEXIST;
        }

        /*
         * On success, post the rename events: for the replaced target
         * (if there was one), for the destination directory when it
         * differs from the source directory, and for the source.
         */
        if (error == 0) {
                if (nvp)
                        vnevent_rename_dest(nvp, ndvp, nnm, ct);

                if (odvp != ndvp)
                        vnevent_rename_dest_dir(ndvp, ct);
                ASSERT(ovp != NULL);
                vnevent_rename_src(ovp, odvp, onm, ct);
        }

        /* Common exit: drop the vnode holds, then both directory locks. */
        if (nvp) {
                VN_RELE(nvp);
        }
        VN_RELE(ovp);

        nfs_rw_exit(&odrp->r_rwlock);
        nfs_rw_exit(&ndrp->r_rwlock);

        return (error);
}
3366 
3367 /* ARGSUSED */
3368 static int
3369 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3370         caller_context_t *ct, int flags, vsecattr_t *vsecp)
3371 {
3372         int error;
3373         MKDIR3args args;
3374         MKDIR3res res;
3375         int douprintf;
3376         struct vattr vattr;
3377         vnode_t *vp;
3378         rnode_t *drp;
3379         hrtime_t t;
3380 
3381         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3382                 return (EPERM);
3383         setdiropargs3(&args.where, nm, dvp);
3384 
3385         /*
3386          * Decide what the group-id and set-gid bit of the created directory
3387          * should be.  May have to do a setattr to get the gid right.
3388          */
3389         error = setdirgid(dvp, &va->va_gid, cr);
3390         if (error)
3391                 return (error);
3392         error = setdirmode(dvp, &va->va_mode, cr);
3393         if (error)
3394                 return (error);
3395         va->va_mask |= AT_MODE|AT_GID;
3396 
3397         error = vattr_to_sattr3(va, &args.attributes);
3398         if (error) {
3399                 /* req time field(s) overflow - return immediately */
3400                 return (error);
3401         }
3402 
3403         drp = VTOR(dvp);
3404         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3405                 return (EINTR);
3406 
3407         dnlc_remove(dvp, nm);
3408 
3409         douprintf = 1;
3410 
3411         t = gethrtime();
3412 
3413         error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3414             xdr_MKDIR3args, (caddr_t)&args,
3415             xdr_MKDIR3res, (caddr_t)&res, cr,
3416             &douprintf, &res.status, 0, NULL);
3417 
3418         if (error) {
3419                 PURGE_ATTRCACHE(dvp);
3420                 nfs_rw_exit(&drp->r_rwlock);
3421                 return (error);
3422         }
3423 
3424         error = geterrno3(res.status);
3425         if (!error) {
3426                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3427                 if (HAVE_RDDIR_CACHE(drp))
3428                         nfs_purge_rddir_cache(dvp);
3429 
3430                 if (!res.resok.obj.handle_follows) {
3431                         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3432                         if (error) {
3433                                 nfs_rw_exit(&drp->r_rwlock);
3434                                 return (error);
3435                         }
3436                 } else {
3437                         if (res.resok.obj_attributes.attributes) {
3438                                 vp = makenfs3node(&res.resok.obj.handle,
3439                                     &res.resok.obj_attributes.attr,
3440                                     dvp->v_vfsp, t, cr, NULL, NULL);
3441                         } else {
3442                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3443                                     dvp->v_vfsp, t, cr, NULL, NULL);
3444                                 if (vp->v_type == VNON) {
3445                                         vattr.va_mask = AT_TYPE;
3446                                         error = nfs3getattr(vp, &vattr, cr);
3447                                         if (error) {
3448                                                 VN_RELE(vp);
3449                                                 nfs_rw_exit(&drp->r_rwlock);
3450                                                 return (error);
3451                                         }
3452                                         vp->v_type = vattr.va_type;
3453                                 }
3454                         }
3455                         dnlc_update(dvp, nm, vp);
3456                 }
3457                 if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3458                         va->va_mask = AT_GID;
3459                         (void) nfs3setattr(vp, va, 0, cr);
3460                 }
3461                 *vpp = vp;
3462         } else {
3463                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3464                 PURGE_STALE_FH(error, dvp, cr);
3465         }
3466 
3467         nfs_rw_exit(&drp->r_rwlock);
3468 
3469         return (error);
3470 }
3471 
3472 /* ARGSUSED */
3473 static int
3474 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3475         caller_context_t *ct, int flags)
3476 {
3477         int error;
3478         RMDIR3args args;
3479         RMDIR3res res;
3480         vnode_t *vp;
3481         int douprintf;
3482         rnode_t *drp;
3483         hrtime_t t;
3484 
3485         if (nfs_zone() != VTOMI(dvp)->mi_zone)
3486                 return (EPERM);
3487         drp = VTOR(dvp);
3488         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3489                 return (EINTR);
3490 
3491         /*
3492          * Attempt to prevent a rmdir(".") from succeeding.
3493          */
3494         error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3495         if (error) {
3496                 nfs_rw_exit(&drp->r_rwlock);
3497                 return (error);
3498         }
3499 
3500         if (vp == cdir) {
3501                 VN_RELE(vp);
3502                 nfs_rw_exit(&drp->r_rwlock);
3503                 return (EINVAL);
3504         }
3505 
3506         setdiropargs3(&args.object, nm, dvp);
3507 
3508         /*
3509          * First just remove the entry from the name cache, as it
3510          * is most likely an entry for this vp.
3511          */
3512         dnlc_remove(dvp, nm);
3513 
3514         /*
3515          * If there vnode reference count is greater than one, then
3516          * there may be additional references in the DNLC which will
3517          * need to be purged.  First, trying removing the entry for
3518          * the parent directory and see if that removes the additional
3519          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3520          * to completely remove any references to the directory which
3521          * might still exist in the DNLC.
3522          */
3523         if (vp->v_count > 1) {
3524                 dnlc_remove(vp, "..");
3525                 if (vp->v_count > 1)
3526                         dnlc_purge_vp(vp);
3527         }
3528 
3529         douprintf = 1;
3530 
3531         t = gethrtime();
3532 
3533         error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3534             xdr_diropargs3, (caddr_t)&args,
3535             xdr_RMDIR3res, (caddr_t)&res, cr,
3536             &douprintf, &res.status, 0, NULL);
3537 
3538         PURGE_ATTRCACHE(vp);
3539 
3540         if (error) {
3541                 PURGE_ATTRCACHE(dvp);
3542                 VN_RELE(vp);
3543                 nfs_rw_exit(&drp->r_rwlock);
3544                 return (error);
3545         }
3546 
3547         error = geterrno3(res.status);
3548         if (!error) {
3549                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3550                 if (HAVE_RDDIR_CACHE(drp))
3551                         nfs_purge_rddir_cache(dvp);
3552                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
3553                         nfs_purge_rddir_cache(vp);
3554         } else {
3555                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3556                 PURGE_STALE_FH(error, dvp, cr);
3557                 /*
3558                  * System V defines rmdir to return EEXIST, not
3559                  * ENOTEMPTY if the directory is not empty.  Over
3560                  * the wire, the error is NFSERR_ENOTEMPTY which
3561                  * geterrno maps to ENOTEMPTY.
3562                  */
3563                 if (error == ENOTEMPTY)
3564                         error = EEXIST;
3565         }
3566 
3567         if (error == 0) {
3568                 vnevent_rmdir(vp, dvp, nm, ct);
3569         }
3570         VN_RELE(vp);
3571 
3572         nfs_rw_exit(&drp->r_rwlock);
3573 
3574         return (error);
3575 }
3576 
3577 /* ARGSUSED */
3578 static int
3579 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3580         caller_context_t *ct, int flags)
3581 {
3582         int error;
3583         SYMLINK3args args;
3584         SYMLINK3res res;
3585         int douprintf;
3586         mntinfo_t *mi;
3587         vnode_t *vp;
3588         rnode_t *rp;
3589         char *contents;
3590         rnode_t *drp;
3591         hrtime_t t;
3592 
3593         mi = VTOMI(dvp);
3594 
3595         if (nfs_zone() != mi->mi_zone)
3596                 return (EPERM);
3597         if (!(mi->mi_flags & MI_SYMLINK))
3598                 return (EOPNOTSUPP);
3599 
3600         setdiropargs3(&args.where, lnm, dvp);
3601         error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3602         if (error) {
3603                 /* req time field(s) overflow - return immediately */
3604                 return (error);
3605         }
3606         args.symlink.symlink_data = tnm;
3607 
3608         drp = VTOR(dvp);
3609         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3610                 return (EINTR);
3611 
3612         dnlc_remove(dvp, lnm);
3613 
3614         douprintf = 1;
3615 
3616         t = gethrtime();
3617 
3618         error = rfs3call(mi, NFSPROC3_SYMLINK,
3619             xdr_SYMLINK3args, (caddr_t)&args,
3620             xdr_SYMLINK3res, (caddr_t)&res, cr,
3621             &douprintf, &res.status, 0, NULL);
3622 
3623         if (error) {
3624                 PURGE_ATTRCACHE(dvp);
3625                 nfs_rw_exit(&drp->r_rwlock);
3626                 return (error);
3627         }
3628 
3629         error = geterrno3(res.status);
3630         if (!error) {
3631                 nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3632                 if (HAVE_RDDIR_CACHE(drp))
3633                         nfs_purge_rddir_cache(dvp);
3634 
3635                 if (res.resok.obj.handle_follows) {
3636                         if (res.resok.obj_attributes.attributes) {
3637                                 vp = makenfs3node(&res.resok.obj.handle,
3638                                     &res.resok.obj_attributes.attr,
3639                                     dvp->v_vfsp, t, cr, NULL, NULL);
3640                         } else {
3641                                 vp = makenfs3node(&res.resok.obj.handle, NULL,
3642                                     dvp->v_vfsp, t, cr, NULL, NULL);
3643                                 vp->v_type = VLNK;
3644                                 vp->v_rdev = 0;
3645                         }
3646                         dnlc_update(dvp, lnm, vp);
3647                         rp = VTOR(vp);
3648                         if (nfs3_do_symlink_cache &&
3649                             rp->r_symlink.contents == NULL) {
3650 
3651                                 contents = kmem_alloc(MAXPATHLEN,
3652                                     KM_NOSLEEP);
3653 
3654                                 if (contents != NULL) {
3655                                         mutex_enter(&rp->r_statelock);
3656                                         if (rp->r_symlink.contents == NULL) {
3657                                                 rp->r_symlink.len = strlen(tnm);
3658                                                 bcopy(tnm, contents,
3659                                                     rp->r_symlink.len);
3660                                                 rp->r_symlink.contents =
3661                                                     contents;
3662                                                 rp->r_symlink.size = MAXPATHLEN;
3663                                                 mutex_exit(&rp->r_statelock);
3664                                         } else {
3665                                                 mutex_exit(&rp->r_statelock);
3666                                                 kmem_free((void *)contents,
3667                                                     MAXPATHLEN);
3668                                         }
3669                                 }
3670                         }
3671                         VN_RELE(vp);
3672                 }
3673         } else {
3674                 nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3675                 PURGE_STALE_FH(error, dvp, cr);
3676                 if (error == EOPNOTSUPP) {
3677                         mutex_enter(&mi->mi_lock);
3678                         mi->mi_flags &= ~MI_SYMLINK;
3679                         mutex_exit(&mi->mi_lock);
3680                 }
3681         }
3682 
3683         nfs_rw_exit(&drp->r_rwlock);
3684 
3685         return (error);
3686 }
3687 
#ifdef DEBUG
/*
 * Readdir cache statistics, present only when DEBUG is defined.
 * nfs3_readdir() bumps nfs3_readdir_cache_shorts when it short-circuits
 * a read at the end-of-directory cookie, and nfs3_readdir_cache_waits
 * when it has to wait for a cache entry that is still being filled in.
 * The remaining counters presumably track cache hits, cache misses and
 * readahead requests respectively - confirm against their users.
 */
static int nfs3_readdir_cache_hits = 0;
static int nfs3_readdir_cache_shorts = 0;
static int nfs3_readdir_cache_waits = 0;
static int nfs3_readdir_cache_misses = 0;
static int nfs3_readdir_readahead = 0;
#endif

/*
 * When nonzero, nfs3_readdir() limits the size of each request to 1024
 * bytes instead of MAXBSIZE, as a workaround for servers that cannot
 * handle large READDIR or READDIRPLUS requests.
 */
static int nfs3_shrinkreaddir = 0;
3697 
3698 /*
3699  * Read directory entries.
3700  * There are some weird things to look out for here.  The uio_loffset
3701  * field is either 0 or it is the offset returned from a previous
3702  * readdir.  It is an opaque value used by the server to find the
3703  * correct directory block to read. The count field is the number
3704  * of blocks to read on the server.  This is advisory only, the server
3705  * may return only one block's worth of entries.  Entries may be compressed
3706  * on the server.
3707  */
3708 /* ARGSUSED */
3709 static int
3710 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3711         caller_context_t *ct, int flags)
3712 {
3713         int error;
3714         size_t count;
3715         rnode_t *rp;
3716         rddir_cache *rdc;
3717         rddir_cache *nrdc;
3718         rddir_cache *rrdc;
3719 #ifdef DEBUG
3720         int missed;
3721 #endif
3722         int doreadahead;
3723         rddir_cache srdc;
3724         avl_index_t where;
3725 
3726         if (nfs_zone() != VTOMI(vp)->mi_zone)
3727                 return (EIO);
3728         rp = VTOR(vp);
3729 
3730         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3731 
3732         /*
3733          * Make sure that the directory cache is valid.
3734          */
3735         if (HAVE_RDDIR_CACHE(rp)) {
3736                 if (nfs_disable_rddir_cache) {
3737                         /*
3738                          * Setting nfs_disable_rddir_cache in /etc/system
3739                          * allows interoperability with servers that do not
3740                          * properly update the attributes of directories.
3741                          * Any cached information gets purged before an
3742                          * access is made to it.
3743                          */
3744                         nfs_purge_rddir_cache(vp);
3745                 } else {
3746                         error = nfs3_validate_caches(vp, cr);
3747                         if (error)
3748                                 return (error);
3749                 }
3750         }
3751 
3752         /*
3753          * It is possible that some servers may not be able to correctly
3754          * handle a large READDIR or READDIRPLUS request due to bugs in
3755          * their implementation.  In order to continue to interoperate
3756          * with them, this workaround is provided to limit the maximum
3757          * size of a READDIRPLUS request to 1024.  In any case, the request
3758          * size is limited to MAXBSIZE.
3759          */
3760         count = MIN(uiop->uio_iov->iov_len,
3761             nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3762 
3763         nrdc = NULL;
3764 #ifdef DEBUG
3765         missed = 0;
3766 #endif
3767 top:
3768         /*
3769          * Short circuit last readdir which always returns 0 bytes.
3770          * This can be done after the directory has been read through
3771          * completely at least once.  This will set r_direof which
3772          * can be used to find the value of the last cookie.
3773          */
3774         mutex_enter(&rp->r_statelock);
3775         if (rp->r_direof != NULL &&
3776             uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3777                 mutex_exit(&rp->r_statelock);
3778 #ifdef DEBUG
3779                 nfs3_readdir_cache_shorts++;
3780 #endif
3781                 if (eofp)
3782                         *eofp = 1;
3783                 if (nrdc != NULL)
3784                         rddir_cache_rele(nrdc);
3785                 return (0);
3786         }
3787         /*
3788          * Look for a cache entry.  Cache entries are identified
3789          * by the NFS cookie value and the byte count requested.
3790          */
3791         srdc.nfs3_cookie = uiop->uio_loffset;
3792         srdc.buflen = count;
3793         rdc = avl_find(&rp->r_dir, &srdc, &where);
3794         if (rdc != NULL) {
3795                 rddir_cache_hold(rdc);
3796                 /*
3797                  * If the cache entry is in the process of being
3798                  * filled in, wait until this completes.  The
3799                  * RDDIRWAIT bit is set to indicate that someone
3800                  * is waiting and then the thread currently
3801                  * filling the entry is done, it should do a
3802                  * cv_broadcast to wakeup all of the threads
3803                  * waiting for it to finish.
3804                  */
3805                 if (rdc->flags & RDDIR) {
3806                         nfs_rw_exit(&rp->r_rwlock);
3807                         rdc->flags |= RDDIRWAIT;
3808 #ifdef DEBUG
3809                         nfs3_readdir_cache_waits++;
3810 #endif
3811                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3812                                 /*
3813                                  * We got interrupted, probably
3814                                  * the user typed ^C or an alarm
3815                                  * fired.  We free the new entry
3816                                  * if we allocated one.
3817                                  */
3818                                 mutex_exit(&rp->r_statelock);
3819                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3820                                     RW_READER, FALSE);
3821                                 rddir_cache_rele(rdc);
3822                                 if (nrdc != NULL)
3823                                         rddir_cache_rele(nrdc);
3824                                 return (EINTR);
3825                         }
3826                         mutex_exit(&rp->r_statelock);
3827                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3828                             RW_READER, FALSE);
3829                         rddir_cache_rele(rdc);
3830                         goto top;
3831                 }
3832                 /*
3833                  * Check to see if a readdir is required to
3834                  * fill the entry.  If so, mark this entry
3835                  * as being filled, remove our reference,
3836                  * and branch to the code to fill the entry.
3837                  */
3838                 if (rdc->flags & RDDIRREQ) {
3839                         rdc->flags &= ~RDDIRREQ;
3840                         rdc->flags |= RDDIR;
3841                         if (nrdc != NULL)
3842                                 rddir_cache_rele(nrdc);
3843                         nrdc = rdc;
3844                         mutex_exit(&rp->r_statelock);
3845                         goto bottom;
3846                 }
3847 #ifdef DEBUG
3848                 if (!missed)
3849                         nfs3_readdir_cache_hits++;
3850 #endif
3851                 /*
3852                  * If an error occurred while attempting
3853                  * to fill the cache entry, just return it.
3854                  */
3855                 if (rdc->error) {
3856                         error = rdc->error;
3857                         mutex_exit(&rp->r_statelock);
3858                         rddir_cache_rele(rdc);
3859                         if (nrdc != NULL)
3860                                 rddir_cache_rele(nrdc);
3861                         return (error);
3862                 }
3863 
3864                 /*
3865                  * The cache entry is complete and good,
3866                  * copyout the dirent structs to the calling
3867                  * thread.
3868                  */
3869                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3870 
3871                 /*
3872                  * If no error occurred during the copyout,
3873                  * update the offset in the uio struct to
3874                  * contain the value of the next cookie
3875                  * and set the eof value appropriately.
3876                  */
3877                 if (!error) {
3878                         uiop->uio_loffset = rdc->nfs3_ncookie;
3879                         if (eofp)
3880                                 *eofp = rdc->eof;
3881                 }
3882 
3883                 /*
3884                  * Decide whether to do readahead.
3885                  *
3886                  * Don't if have already read to the end of
3887                  * directory.  There is nothing more to read.
3888                  *
3889                  * Don't if the application is not doing
3890                  * lookups in the directory.  The readahead
3891                  * is only effective if the application can
3892                  * be doing work while an async thread is
3893                  * handling the over the wire request.
3894                  */
3895                 if (rdc->eof) {
3896                         rp->r_direof = rdc;
3897                         doreadahead = FALSE;
3898                 } else if (!(rp->r_flags & RLOOKUP))
3899                         doreadahead = FALSE;
3900                 else
3901                         doreadahead = TRUE;
3902 
3903                 if (!doreadahead) {
3904                         mutex_exit(&rp->r_statelock);
3905                         rddir_cache_rele(rdc);
3906                         if (nrdc != NULL)
3907                                 rddir_cache_rele(nrdc);
3908                         return (error);
3909                 }
3910 
3911                 /*
3912                  * Check to see whether we found an entry
3913                  * for the readahead.  If so, we don't need
3914                  * to do anything further, so free the new
3915                  * entry if one was allocated.  Otherwise,
3916                  * allocate a new entry, add it to the cache,
3917                  * and then initiate an asynchronous readdir
3918                  * operation to fill it.
3919                  */
3920                 srdc.nfs3_cookie = rdc->nfs3_ncookie;
3921                 srdc.buflen = count;
3922                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3923                 if (rrdc != NULL) {
3924                         if (nrdc != NULL)
3925                                 rddir_cache_rele(nrdc);
3926                 } else {
3927                         if (nrdc != NULL)
3928                                 rrdc = nrdc;
3929                         else {
3930                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3931                         }
3932                         if (rrdc != NULL) {
3933                                 rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3934                                 rrdc->buflen = count;
3935                                 avl_insert(&rp->r_dir, rrdc, where);
3936                                 rddir_cache_hold(rrdc);
3937                                 mutex_exit(&rp->r_statelock);
3938                                 rddir_cache_rele(rdc);
3939 #ifdef DEBUG
3940                                 nfs3_readdir_readahead++;
3941 #endif
3942                                 nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3943                                 return (error);
3944                         }
3945                 }
3946 
3947                 mutex_exit(&rp->r_statelock);
3948                 rddir_cache_rele(rdc);
3949                 return (error);
3950         }
3951 
3952         /*
3953          * Didn't find an entry in the cache.  Construct a new empty
3954          * entry and link it into the cache.  Other processes attempting
3955          * to access this entry will need to wait until it is filled in.
3956          *
3957          * Since kmem_alloc may block, another pass through the cache
3958          * will need to be taken to make sure that another process
3959          * hasn't already added an entry to the cache for this request.
3960          */
3961         if (nrdc == NULL) {
3962                 mutex_exit(&rp->r_statelock);
3963                 nrdc = rddir_cache_alloc(KM_SLEEP);
3964                 nrdc->nfs3_cookie = uiop->uio_loffset;
3965                 nrdc->buflen = count;
3966                 goto top;
3967         }
3968 
3969         /*
3970          * Add this entry to the cache.
3971          */
3972         avl_insert(&rp->r_dir, nrdc, where);
3973         rddir_cache_hold(nrdc);
3974         mutex_exit(&rp->r_statelock);
3975 
3976 bottom:
3977 #ifdef DEBUG
3978         missed = 1;
3979         nfs3_readdir_cache_misses++;
3980 #endif
3981         /*
3982          * Do the readdir.  This routine decides whether to use
3983          * READDIR or READDIRPLUS.
3984          */
3985         error = do_nfs3readdir(vp, nrdc, cr);
3986 
3987         /*
3988          * If this operation failed, just return the error which occurred.
3989          */
3990         if (error != 0)
3991                 return (error);
3992 
3993         /*
3994          * Since the RPC operation will have taken sometime and blocked
3995          * this process, another pass through the cache will need to be
3996          * taken to find the correct cache entry.  It is possible that
3997          * the correct cache entry will not be there (although one was
3998          * added) because the directory changed during the RPC operation
3999          * and the readdir cache was flushed.  In this case, just start
4000          * over.  It is hoped that this will not happen too often... :-)
4001          */
4002         nrdc = NULL;
4003         goto top;
4004         /* NOTREACHED */
4005 }
4006 
     /*
      * Fill the readdir cache entry rdc for directory vp with an over the
      * wire request and publish the outcome to any threads waiting on it.
      *
      * vp  - directory vnode; must belong to the caller's zone (asserted).
      * rdc - cache entry to fill.  One reference on it is released before
      *       returning, so the caller gives up the hold it passed in.
      * cr  - credentials handed to nfs3readdir()/nfs3readdirplus().
      *
      * Returns the value left in rdc->error by the fill routine (0 on
      * success).  When that value is non-zero the entry is flagged RDDIRREQ;
      * nfs3_readdir() treats that flag as a request to fill the entry again.
      */
4007 static int
4008 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4009 {
4010         int error;
4011         rnode_t *rp;
4012         mntinfo_t *mi;
4013 
4014         rp = VTOR(vp);
4015         mi = VTOMI(vp);
4016         ASSERT(nfs_zone() == mi->mi_zone);
4017         /*
4018          * Issue the proper request.
4019          *
4020          * If the server does not support READDIRPLUS, then use READDIR.
4021          *
4022          * Otherwise --
4023          * Issue a READDIRPLUS if reading to fill an empty cache or if
4024          * an application has performed a lookup in the directory which
4025          * required an over the wire lookup.  The use of READDIRPLUS
4026          * will help to (re)populate the DNLC.
4027          */
4028         if (!(mi->mi_flags & MI_READDIRONLY) &&
4029             (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) {
                     /*
                      * RREADDIRPLUS is being acted upon, so clear it.  Only
                      * the update is done under r_statelock; the tests of
                      * r_flags above and here are made without the lock.
                      */
4030                 if (rp->r_flags & RREADDIRPLUS) {
4031                         mutex_enter(&rp->r_statelock);
4032                         rp->r_flags &= ~RREADDIRPLUS;
4033                         mutex_exit(&rp->r_statelock);
4034                 }
4035                 nfs3readdirplus(vp, rdc, cr);
                     /*
                      * The fill routines report their result in rdc->error.
                      * EOPNOTSUPP from READDIRPLUS means this request has to
                      * be redone with a plain READDIR.
                      */
4036                 if (rdc->error == EOPNOTSUPP)
4037                         nfs3readdir(vp, rdc, cr);
4038         } else
4039                 nfs3readdir(vp, rdc, cr);
4040 
             /*
              * The fill attempt is over: clear RDDIR and, if some thread set
              * RDDIRWAIT, clear that too and wake everything sleeping on
              * rdc->cv.  On failure leave RDDIRREQ set on the entry.
              */
4041         mutex_enter(&rp->r_statelock);
4042         rdc->flags &= ~RDDIR;
4043         if (rdc->flags & RDDIRWAIT) {
4044                 rdc->flags &= ~RDDIRWAIT;
4045                 cv_broadcast(&rdc->cv);
4046         }
4047         error = rdc->error;
4048         if (error)
4049                 rdc->flags |= RDDIRREQ;
4050         mutex_exit(&rp->r_statelock);
4051 
             /* Drop the reference that was handed in with rdc. */
4052         rddir_cache_rele(rdc);
4053 
4054         return (error);
4055 }
4056 
     /*
      * Issue one NFSPROC3_READDIR request for directory vp and store the
      * outcome in the cache entry rdc.
      *
      * The request starts at rdc->nfs3_cookie and asks for rdc->buflen bytes.
      * Nothing is returned; callers inspect rdc->error:
      *
      *   success - rdc->error is 0, rdc->entries points at a buffer of
      *             rdc->buflen bytes, rdc->entlen is the size reported in the
      *             result, rdc->nfs3_ncookie is the cookie to continue from
      *             and rdc->eof is 0 or 1.  The rnode's cookie verifier is
      *             updated from the reply.
      *   failure - the buffer is freed, rdc->entries is NULL and rdc->error
      *             holds the error.
      */
4057 static void
4058 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4059 {
4060         int error;
4061         READDIR3args args;
4062         READDIR3vres res;
4063         vattr_t dva;
4064         rnode_t *rp;
4065         int douprintf;
4066         failinfo_t fi, *fip = NULL;
4067         mntinfo_t *mi;
4068         hrtime_t t;
4069 
4070         rp = VTOR(vp);
4071         mi = VTOMI(vp);
4072         ASSERT(nfs_zone() == mi->mi_zone);
4073 
4074         args.dir = *RTOFH3(rp);
4075         args.cookie = (cookie3)rdc->nfs3_cookie;
4076         args.cookieverf = rp->r_cookieverf;
4077         args.count = rdc->buflen;
4078 
4079         /*
4080          * NFS client failover support
4081          * suppress failover unless we have a zero cookie
4082          * (fip stays NULL for any non-zero cookie).
4083          */
4083         if (args.cookie == (cookie3) 0) {
4084                 fi.vp = vp;
4085                 fi.fhp = (caddr_t)&args.dir;
4086                 fi.copyproc = nfs3copyfh;
4087                 fi.lookupproc = nfs3lookup;
4088                 fi.xattrdirproc = acl_getxattrdir3;
4089                 fip = &fi;
4090         }
4091 
             /*
              * Allocate the buffer that receives the directory entries.  It
              * stays attached to rdc on success and is freed at err: below.
              */
4092 #ifdef DEBUG
4093         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4094 #else
4095         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4096 #endif
4097 
             /*
              * Describe the entry buffer, the destination for the directory
              * attributes and the starting cookie in the result structure
              * that is passed to rfs3call().
              */
4098         res.entries = (dirent64_t *)rdc->entries;
4099         res.entries_size = rdc->buflen;
4100         res.dir_attributes.fres.vap = &dva;
4101         res.dir_attributes.fres.vp = vp;
4102         res.loff = rdc->nfs3_cookie;
4103 
4104         douprintf = 1;
4105 
             /* Account for the request in the mount's I/O kstats, if any. */
4106         if (mi->mi_io_kstats) {
4107                 mutex_enter(&mi->mi_lock);
4108                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4109                 mutex_exit(&mi->mi_lock);
4110         }
4111 
             /*
              * Timestamp taken before the call; it is handed to
              * nfs3_cache_post_op_vattr() together with the reply attributes.
              */
4112         t = gethrtime();
4113 
4114         error = rfs3call(VTOMI(vp), NFSPROC3_READDIR,
4115             xdr_READDIR3args, (caddr_t)&args,
4116             xdr_READDIR3vres, (caddr_t)&res, cr,
4117             &douprintf, &res.status, 0, fip);
4118 
4119         if (mi->mi_io_kstats) {
4120                 mutex_enter(&mi->mi_lock);
4121                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4122                 mutex_exit(&mi->mi_lock);
4123         }
4124 
             /*
              * error is the rfs3call() return value at this point; the NFS
              * status in the reply is converted and checked separately below.
              */
4125         if (error)
4126                 goto err;
4127 
4128         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr);
4129 
4130         error = geterrno3(res.status);
4131         if (error) {
4132                 PURGE_STALE_FH(error, vp, cr);
4133                 goto err;
4134         }
4135 
             /* Count the successful read in the mount's I/O kstats. */
4136         if (mi->mi_io_kstats) {
4137                 mutex_enter(&mi->mi_lock);
4138                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4139                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4140                 mutex_exit(&mi->mi_lock);
4141         }
4142 
             /*
              * Success: record the next cookie, the cookie verifier, the EOF
              * indication and the amount of entry data from the result.
              */
4143         rdc->nfs3_ncookie = res.loff;
4144         rp->r_cookieverf = res.cookieverf;
4145         rdc->eof = res.eof ? 1 : 0;
4146         rdc->entlen = res.size;
4147         ASSERT(rdc->entlen <= rdc->buflen);
4148         rdc->error = 0;
4149         return;
4150 
4151 err:
             /*
              * Failure: release the entry buffer and record the error.
              * NOTE(review): under DEBUG the buffer was obtained from
              * rddir_cache_buf_alloc() but is released with plain
              * kmem_free() here -- confirm that this pairing is intended.
              */
4152         kmem_free(rdc->entries, rdc->buflen);
4153         rdc->entries = NULL;
4154         rdc->error = error;
4155 }
4156 
4157 /*
4158  * Read directory entries.
4159  * There are some weird things to look out for here.  The uio_loffset
4160  * field is either 0 or it is the offset returned from a previous
4161  * readdir.  It is an opaque value used by the server to find the
4162  * correct directory block to read. The count field is the number
4163  * of blocks to read on the server.  This is advisory only, the server
4164  * may return only one block's worth of entries.  Entries may be compressed
4165  * on the server.
      *
      * This routine issues one NFSPROC3_READDIRPLUS request for directory vp,
      * starting at rdc->nfs3_cookie, with dircount set to rdc->buflen and
      * maxcount set to the mount's mi_tsize.  The outcome is stored in rdc;
      * nothing is returned:
      *
      *   success - rdc->error is 0, rdc->entries points at a buffer of
      *             rdc->buflen bytes, rdc->entlen is the size reported in the
      *             result, rdc->nfs3_ncookie is the cookie to continue from
      *             and rdc->eof is 0 or 1.  The rnode's cookie verifier is
      *             updated from the reply.
      *   failure - the buffer is freed, rdc->entries is NULL and rdc->error
      *             holds the error.  If the error is EOPNOTSUPP the mount is
      *             also flagged MI_READDIRONLY, which do_nfs3readdir() checks
      *             before choosing READDIRPLUS.
4166  */
4167 static void
4168 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4169 {
4170         int error;
4171         READDIRPLUS3args args;
4172         READDIRPLUS3vres res;
4173         vattr_t dva;
4174         rnode_t *rp;
4175         mntinfo_t *mi;
4176         int douprintf;
4177         failinfo_t fi, *fip = NULL;
4178 
4179         rp = VTOR(vp);
4180         mi = VTOMI(vp);
4181         ASSERT(nfs_zone() == mi->mi_zone);
4182 
4183         args.dir = *RTOFH3(rp);
4184         args.cookie = (cookie3)rdc->nfs3_cookie;
4185         args.cookieverf = rp->r_cookieverf;
4186         args.dircount = rdc->buflen;
4187         args.maxcount = mi->mi_tsize;
4188 
4189         /*
4190          * NFS client failover support
4191          * suppress failover unless we have a zero cookie
4192          * (fip stays NULL for any non-zero cookie).
4192          */
4193         if (args.cookie == (cookie3)0) {
4194                 fi.vp = vp;
4195                 fi.fhp = (caddr_t)&args.dir;
4196                 fi.copyproc = nfs3copyfh;
4197                 fi.lookupproc = nfs3lookup;
4198                 fi.xattrdirproc = acl_getxattrdir3;
4199                 fip = &fi;
4200         }
4201 
             /*
              * Allocate the buffer that receives the directory entries.  It
              * stays attached to rdc on success and is freed at err: below.
              */
4202 #ifdef DEBUG
4203         rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4204 #else
4205         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4206 #endif
4207 
             /*
              * Describe the entry buffer, the destination for the directory
              * attributes, the starting cookie and the caller's credentials
              * in the result structure that is passed to rfs3call().
              */
4208         res.entries = (dirent64_t *)rdc->entries;
4209         res.entries_size = rdc->buflen;
4210         res.dir_attributes.fres.vap = &dva;
4211         res.dir_attributes.fres.vp = vp;
4212         res.loff = rdc->nfs3_cookie;
4213         res.credentials = cr;
4214 
4215         douprintf = 1;
4216 
             /* Account for the request in the mount's I/O kstats, if any. */
4217         if (mi->mi_io_kstats) {
4218                 mutex_enter(&mi->mi_lock);
4219                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4220                 mutex_exit(&mi->mi_lock);
4221         }
4222 
             /*
              * Timestamp taken before the call; it is stored in the result
              * structure and later handed to nfs3_cache_post_op_vattr().
              */
4223         res.time = gethrtime();
4224 
4225         error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4226             xdr_READDIRPLUS3args, (caddr_t)&args,
4227             xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4228             &douprintf, &res.status, 0, fip);
4229 
4230         if (mi->mi_io_kstats) {
4231                 mutex_enter(&mi->mi_lock);
4232                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4233                 mutex_exit(&mi->mi_lock);
4234         }
4235 
             /*
              * error is the rfs3call() return value at this point; the NFS
              * status in the reply is converted and checked separately below.
              */
4236         if (error) {
4237                 goto err;
4238         }
4239 
4240         nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr);
4241 
4242         error = geterrno3(res.status);
4243         if (error) {
4244                 PURGE_STALE_FH(error, vp, cr);
                     /*
                      * Remember on the mount that READDIRPLUS is not
                      * supported so that later requests use READDIR.
                      */
4245                 if (error == EOPNOTSUPP) {
4246                         mutex_enter(&mi->mi_lock);
4247                         mi->mi_flags |= MI_READDIRONLY;
4248                         mutex_exit(&mi->mi_lock);
4249                 }
4250                 goto err;
4251         }
4252 
             /* Count the successful read in the mount's I/O kstats. */
4253         if (mi->mi_io_kstats) {
4254                 mutex_enter(&mi->mi_lock);
4255                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4256                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4257                 mutex_exit(&mi->mi_lock);
4258         }
4259 
             /*
              * Success: record the next cookie, the cookie verifier, the EOF
              * indication and the amount of entry data from the result.
              */
4260         rdc->nfs3_ncookie = res.loff;
4261         rp->r_cookieverf = res.cookieverf;
4262         rdc->eof = res.eof ? 1 : 0;
4263         rdc->entlen = res.size;
4264         ASSERT(rdc->entlen <= rdc->buflen);
4265         rdc->error = 0;
4266 
4267         return;
4268 
4269 err:
             /*
              * Failure: release the entry buffer and record the error.
              * NOTE(review): under DEBUG the buffer was obtained from
              * rddir_cache_buf_alloc() but is released with plain
              * kmem_free() here -- confirm that this pairing is intended.
              */
4270         kmem_free(rdc->entries, rdc->buflen);
4271         rdc->entries = NULL;
4272         rdc->error = error;
4273 }
4274 
4275 #ifdef DEBUG
     /*
      * Debugging aid for nfs3_bio(): when set to a non-zero value, a zero
      * length write makes nfs3_bio() call debug_enter() after logging it.
      */
4276 static int nfs3_bio_do_stop = 0;
4277 #endif
4278 
     /*
      * Carry out the transfer described by bp against the server: a read via
      * nfs3read() when B_READ is set in b_flags, otherwise a write via
      * nfs3write().
      *
      * bp        - buffer describing the transfer; the file offset is derived
      *             from b_lblkno, b_error is set from the transfer result and
      *             B_ERROR is set in b_flags for any error other than NFS_EOF.
      * stab_comm - passed through unchanged to nfs3write(); unused for reads.
      * cr        - the caller's credentials.
      *
      * Credentials: the transfer is first attempted with the credentials
      * cached in the rnode (r_cred); if none are cached, cr is installed as
      * r_cred and used.  If the attempt fails with EACCES and the cached
      * credentials were not cr, r_cred is replaced by cr and the transfer is
      * retried; the retry cannot repeat because cred then equals cr.
      *
      * Returns 0, an errno value, or NFS_EOF when a read returned no data at
      * an offset at or beyond r_size.  Writes on an rnode already marked
      * RSTALE are not attempted; r_error (or ESTALE if that is 0) is returned.
      */
4279 static int
4280 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
4281 {
4282         rnode_t *rp = VTOR(bp->b_vp);
4283         int count;
4284         int error;
4285         cred_t *cred;
4286         offset_t offset;
4287 
4288         ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
4289         offset = ldbtob(bp->b_lblkno);
4290 
4291         DTRACE_IO1(start, struct buf *, bp);
4292 
4293         if (bp->b_flags & B_READ) {
                     /*
                      * Pick the credentials for the first attempt and take a
                      * hold on them for the duration of the call.  When cr
                      * is installed as r_cred it is held twice: once for the
                      * rnode and once for this function.
                      */
4294                 mutex_enter(&rp->r_statelock);
4295                 if (rp->r_cred != NULL) {
4296                         cred = rp->r_cred;
4297                         crhold(cred);
4298                 } else {
4299                         rp->r_cred = cr;
4300                         crhold(cr);
4301                         cred = cr;
4302                         crhold(cred);
4303                 }
4304                 mutex_exit(&rp->r_statelock);
4305         read_again:
4306                 error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr,
4307                     offset, bp->b_bcount, &bp->b_resid, cred);
                     /*
                      * This function's hold is dropped right away; below,
                      * cred is only compared against cr, not dereferenced.
                      */
4308                 crfree(cred);
4309                 if (!error) {
4310                         if (bp->b_resid) {
4311                                 /*
4312                                  * Didn't get it all because we hit EOF,
4313                                  * zero all the memory beyond the EOF.
4314                                  */
4315                                 /* bzero(rdaddr + */
4316                                 bzero(bp->b_un.b_addr +
4317                                     bp->b_bcount - bp->b_resid, bp->b_resid);
4318                         }
4319                         mutex_enter(&rp->r_statelock);
4320                         if (bp->b_resid == bp->b_bcount &&
4321                             offset >= rp->r_size) {
4322                                 /*
4323                                  * We didn't read anything at all as we are
4324                                  * past EOF.  Return an error indicator back
4325                                  * but don't destroy the pages (yet).
4326                                  * Note that b_error was set from nfs3read()
4327                                  * above and is not changed to NFS_EOF.
4326                                  */
4327                                 error = NFS_EOF;
4328                         }
4329                         mutex_exit(&rp->r_statelock);
4330                 } else if (error == EACCES) {
                             /*
                              * The cached credentials were refused.  If they
                              * were not the caller's, make the caller's the
                              * cached ones and try the read once more.
                              */
4331                         mutex_enter(&rp->r_statelock);
4332                         if (cred != cr) {
4333                                 if (rp->r_cred != NULL)
4334                                         crfree(rp->r_cred);
4335                                 rp->r_cred = cr;
4336                                 crhold(cr);
4337                                 cred = cr;
4338                                 crhold(cred);
4339                                 mutex_exit(&rp->r_statelock);
4340                                 goto read_again;
4341                         }
4342                         mutex_exit(&rp->r_statelock);
4343                 }
4344         } else {
                     /* r_flags is tested here without holding r_statelock. */
4345                 if (!(rp->r_flags & RSTALE)) {
                             /*
                              * Same credential selection as for reads; here
                              * the hold is kept until the crfree() at the
                              * end of this branch (or until the credentials
                              * are swapped for a retry).
                              */
4346                         mutex_enter(&rp->r_statelock);
4347                         if (rp->r_cred != NULL) {
4348                                 cred = rp->r_cred;
4349                                 crhold(cred);
4350                         } else {
4351                                 rp->r_cred = cr;
4352                                 crhold(cr);
4353                                 cred = cr;
4354                                 crhold(cred);
4355                         }
4356                         mutex_exit(&rp->r_statelock);
4357                 write_again:
                             /*
                              * Never write beyond r_size: clip the transfer
                              * to what lies between offset and r_size.  A
                              * negative result is treated as fatal.
                              */
4358                         mutex_enter(&rp->r_statelock);
4359                         count = MIN(bp->b_bcount, rp->r_size - offset);
4360                         mutex_exit(&rp->r_statelock);
4361                         if (count < 0)
4362                                 cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
4363 #ifdef DEBUG
4364                         if (count == 0) {
4365                                 zcmn_err(getzoneid(), CE_WARN,
4366                                     "nfs3_bio: zero length write at %lld",
4367                                     offset);
4368                                 nfs_printfhandle(&rp->r_fh);
4369                                 if (nfs3_bio_do_stop)
4370                                         debug_enter("nfs3_bio");
4371                         }
4372 #endif
4373                         error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset,
4374                             count, cred, stab_comm);
4375                         if (error == EACCES) {
                                     /*
                                      * As for reads: switch the cached
                                      * credentials to the caller's and retry
                                      * once if they differed.
                                      */
4376                                 mutex_enter(&rp->r_statelock);
4377                                 if (cred != cr) {
4378                                         if (rp->r_cred != NULL)
4379                                                 crfree(rp->r_cred);
4380                                         rp->r_cred = cr;
4381                                         crhold(cr);
4382                                         crfree(cred);
4383                                         cred = cr;
4384                                         crhold(cred);
4385                                         mutex_exit(&rp->r_statelock);
4386                                         goto write_again;
4387                                 }
4388                                 mutex_exit(&rp->r_statelock);
4389                         }
4390                         bp->b_error = error;
4391                         if (error && error != EINTR) {
4392                                 /*
4393                                  * Don't print EDQUOT errors on the console.
4394                                  * Don't print asynchronous EACCES errors.
4395                                  * Don't print EFBIG errors.
4396                                  * Print all other write errors.
4397                                  */
4398                                 if (error != EDQUOT && error != EFBIG &&
4399                                     (error != EACCES ||
4400                                     !(bp->b_flags & B_ASYNC)))
4401                                         nfs_write_error(bp->b_vp, error, cred);
4402                                 /*
4403                                  * Update r_error and r_flags as appropriate.
4404                                  * If the error was ESTALE, then mark the
4405                                  * rnode as not being writeable and save
4406                                  * the error status.  Otherwise, save any
4407                                  * errors which occur from asynchronous
4408                                  * page invalidations.  Any errors occurring
4409                                  * from other operations should be saved
4410                                  * by the caller.
4411                                  */
4412                                 mutex_enter(&rp->r_statelock);
4413                                 if (error == ESTALE) {
4414                                         rp->r_flags |= RSTALE;
4415                                         if (!rp->r_error)
4416                                                 rp->r_error = error;
4417                                 } else if (!rp->r_error &&
4418                                     (bp->b_flags &
4419                                     (B_INVAL|B_FORCE|B_ASYNC)) ==
4420                                     (B_INVAL|B_FORCE|B_ASYNC)) {
4421                                         rp->r_error = error;
4422                                 }
4423                                 mutex_exit(&rp->r_statelock);
4424                         }
4425                         crfree(cred);
4426                 } else {
4427                         error = rp->r_error;
4428                         /*
4429                          * A close may have cleared r_error, if so,
4430                          * propagate ESTALE error return properly
4431                          */
4432                         if (error == 0)
4433                                 error = ESTALE;
4434                 }
4435         }
4436 
             /* NFS_EOF is an indicator for the caller, not an I/O failure. */
4437         if (error != 0 && error != NFS_EOF)
4438                 bp->b_flags |= B_ERROR;
4439 
4440         DTRACE_IO1(done, struct buf *, bp);
4441 
4442         return (error);
4443 }
4444 
     /*
      * Copy the file handle of vp into the caller supplied fid.
      *
      * On entry fidp->fid_len is the space available in fidp->fid_data.  On
      * return it is the length of the file handle, whether or not the copy
      * took place.
      *
      * Returns EIO if called from a zone other than the mount's zone, ENOSPC
      * if the supplied fid is too small for the file handle, otherwise 0.
      * The ct argument is not used.
      */
4445 /* ARGSUSED */
4446 static int
4447 nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4448 {
4449         rnode_t *rp;
4450 
4451         if (nfs_zone() != VTOMI(vp)->mi_zone)
4452                 return (EIO);
4453         rp = VTOR(vp);
4454 
             /* Too small: report the length that is needed and fail. */
4455         if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) {
4456                 fidp->fid_len = rp->r_fh.fh_len;
4457                 return (ENOSPC);
4458         }
4459         fidp->fid_len = rp->r_fh.fh_len;
4460         bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4461         return (0);
4462 }
4463 
     /*
      * Acquire the rnode's r_rwlock on behalf of a read or write.
      *
      * write_lock == 0: the lock is taken as RW_READER.
      * write_lock != 0: the lock is normally taken as RW_WRITER.  If direct
      * I/O is enabled on the rnode (RDIRECTIO) or the mount (MI_DIRECTIO) and
      * the file has no mappings and no cached data, it is taken as RW_READER
      * instead.
      *
      * Returns V_WRITELOCK_TRUE when the lock is held as writer and
      * V_WRITELOCK_FALSE when it is held as reader.  The result of
      * nfs_rw_enter_sig() is discarded.  The ctp argument is not used.
      */
4464 /* ARGSUSED2 */
4465 static int
4466 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4467 {
4468         rnode_t *rp = VTOR(vp);
4469 
4470         if (!write_lock) {
4471                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4472                 return (V_WRITELOCK_FALSE);
4473         }
4474 
             /*
              * Direct I/O: r_mapcnt and the cached-data state are checked
              * with the reader lock held.  If either check fails, drop the
              * reader lock and fall through to take the lock as writer.
              */
4475         if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
4476                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4477                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
4478                         return (V_WRITELOCK_FALSE);
4479                 nfs_rw_exit(&rp->r_rwlock);
4480         }
4481 
4482         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
4483         return (V_WRITELOCK_TRUE);
4484 }
4485 
     /*
      * Release the rnode's r_rwlock taken by nfs3_rwlock().  The same call is
      * made whichever way the lock was acquired, so write_lock (like ctp) is
      * not used.
      */
4486 /* ARGSUSED */
4487 static void
4488 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4489 {
4490         rnode_t *rp = VTOR(vp);
4491 
4492         nfs_rw_exit(&rp->r_rwlock);
4493 }
4494 
     /*
      * Validate a proposed new file offset.
      *
      * Any offset is accepted for a directory.  For every other vnode type a
      * negative *noffp is rejected with EINVAL.  *noffp is never modified, and
      * ooff and ct are not used.
      *
      * Returns 0 if the offset is acceptable, EINVAL otherwise.
      */
4495 /* ARGSUSED */
4496 static int
4497 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4498 {
4499 
4500         /*
4501          * Because we stuff the readdir cookie into the offset field
4502          * someone may attempt to do an lseek with the cookie which
4503          * we want to succeed.
4504          */
4505         if (vp->v_type == VDIR)
4506                 return (0);
4507         if (*noffp < 0)
4508                 return (EINVAL);
4509         return (0);
4510 }
4511 
4512 /*
4513  * Number of nfs3_bsize blocks to read ahead.
      * NOTE(review): the code that consumes this value is not in the part of
      * the file shown with this definition -- presumably the page-in path;
      * confirm before changing it.
4514  */
4515 static int nfs3_nra = 4;
4516 
4517 #ifdef DEBUG
4518 static int nfs3_lostpage = 0;   /* DEBUG only: times we lost the original page */
4519 #endif
4520 
4521 /*
4522  * Return all the pages from [off..off+len) in file
      *
      * A request of at most PAGESIZE bytes is passed straight to
      * nfs3_getapage(); a larger one goes through pvn_getpages() with
      * nfs3_getapage as the per-page routine.  If protp is not NULL,
      * *protp is set to PROT_ALL.  The ct argument is not used.
      *
      * Returns:
      *   ENOSYS - the vnode is flagged VNOMAP.
      *   EIO    - called from a zone other than the mount's zone.
      *   EFAULT - the range extends beyond r_size (rounded up by PAGEOFFSET)
      *            and the request does not come from segkmap.
      *   otherwise the error from nfs3_validate_caches() or from the page
      *   routine, 0 on success.  NFS_EOF from the page routine is not
      *   returned: the caches are purged and the request is retried.
4523  */
4524 /* ARGSUSED */
4525 static int
4526 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4527         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4528         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4529 {
4530         rnode_t *rp;
4531         int error;
4532         mntinfo_t *mi;
4533 
4534         if (vp->v_flag & VNOMAP)
4535                 return (ENOSYS);
4536 
4537         if (nfs_zone() != VTOMI(vp)->mi_zone)
4538                 return (EIO);
4539         if (protp != NULL)
4540                 *protp = PROT_ALL;
4541 
4542         /*
4543          * Now validate that the caches are up to date.
4544          */
4545         error = nfs3_validate_caches(vp, cr);
4546         if (error)
4547                 return (error);
4548 
4549         rp = VTOR(vp);
4550         mi = VTOMI(vp);
4551 retry:
4552         mutex_enter(&rp->r_statelock);
4553 
4554         /*
4555          * Don't create dirty pages faster than they
4556          * can be cleaned so that the system doesn't
4557          * get imbalanced.  If the async queue is
4558          * maxed out, then wait for it to drain before
4559          * creating more dirty pages.  Also, wait for
4560          * any threads doing pagewalks in the vop_getattr
4561          * entry points so that they don't block for
4562          * long periods.
4563          */
4564         if (rw == S_CREATE) {
4565                 while ((mi->mi_max_threads != 0 &&
4566                     rp->r_awcount > 2 * mi->mi_max_threads) ||
4567                     rp->r_gcount > 0)
4568                         cv_wait(&rp->r_cv, &rp->r_statelock);
4569         }
4570 
4571         /*
4572          * If we are getting called as a side effect of an nfs_write()
4573          * operation the local file size might not be extended yet.
4574          * In this case we want to be able to return pages of zeroes.
4575          */
4576         if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
4577                 mutex_exit(&rp->r_statelock);
4578                 return (EFAULT);                /* beyond EOF */
4579         }
4580 
4581         mutex_exit(&rp->r_statelock);
4582 
4583         if (len <= PAGESIZE) {
4584                 error = nfs3_getapage(vp, off, len, protp, pl, plsz,
4585                     seg, addr, rw, cr);
4586         } else {
4587                 error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4588                     pl, plsz, seg, addr, rw, cr);
4589         }
4590 
             /*
              * NFS_EOF: purge the caches (keeping the DNLC) and start again
              * from the size check.  ESTALE: let PURGE_STALE_FH() deal with
              * the stale file handle, then return the error like any other.
              */
4591         switch (error) {
4592         case NFS_EOF:
4593                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4594                 goto retry;
4595         case ESTALE:
4596                 PURGE_STALE_FH(error, vp, cr);
4597         }
4598 
4599         return (error);
4600 }
4601 
4602 /*
4603  * Called from pvn_getpages or nfs3_getpage to get a particular page.
4604  */
4605 /* ARGSUSED */
4606 static int
4607 nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
4608         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4609         enum seg_rw rw, cred_t *cr)
4610 {
4611         rnode_t *rp;
4612         uint_t bsize;
4613         struct buf *bp;
4614         page_t *pp;
4615         u_offset_t lbn;
4616         u_offset_t io_off;
4617         u_offset_t blkoff;
4618         u_offset_t rablkoff;
4619         size_t io_len;
4620         uint_t blksize;
4621         int error;
4622         int readahead;
4623         int readahead_issued = 0;
4624         int ra_window; /* readahead window */
4625         page_t *pagefound;
4626         page_t *savepp;
4627 
4628         if (nfs_zone() != VTOMI(vp)->mi_zone)
4629                 return (EIO);
4630         rp = VTOR(vp);
4631         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4632 
4633 reread:
4634         bp = NULL;
4635         pp = NULL;
4636         pagefound = NULL;
4637 
4638         if (pl != NULL)
4639                 pl[0] = NULL;
4640 
4641         error = 0;
4642         lbn = off / bsize;
4643         blkoff = lbn * bsize;
4644 
4645         /*
4646          * Queueing up the readahead before doing the synchronous read
4647          * results in a significant increase in read throughput because
4648          * of the increased parallelism between the async threads and
4649          * the process context.
4650          */
4651         if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
4652             rw != S_CREATE &&
4653             !(vp->v_flag & VNOCACHE)) {
4654                 mutex_enter(&rp->r_statelock);
4655 
4656                 /*
4657                  * Calculate the number of readaheads to do.
4658                  * a) No readaheads at offset = 0.
4659                  * b) Do maximum(nfs3_nra) readaheads when the readahead
4660                  *    window is closed.
4661                  * c) Do readaheads between 1 to (nfs3_nra - 1) depending
4662                  *    upon how far the readahead window is open or close.
4663                  * d) No readaheads if rp->r_nextr is not within the scope
4664                  *    of the readahead window (random i/o).
4665                  */
4666 
4667                 if (off == 0)
4668                         readahead = 0;
4669                 else if (blkoff == rp->r_nextr)
4670                         readahead = nfs3_nra;
4671                 else if (rp->r_nextr > blkoff &&
4672                     ((ra_window = (rp->r_nextr - blkoff) / bsize)
4673                     <= (nfs3_nra - 1)))
4674                         readahead = nfs3_nra - ra_window;
4675                 else
4676                         readahead = 0;
4677 
4678                 rablkoff = rp->r_nextr;
4679                 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
4680                         mutex_exit(&rp->r_statelock);
4681                         if (nfs_async_readahead(vp, rablkoff + bsize,
4682                             addr + (rablkoff + bsize - off), seg, cr,
4683                             nfs3_readahead) < 0) {
4684                                 mutex_enter(&rp->r_statelock);
4685                                 break;
4686                         }
4687                         readahead--;
4688                         rablkoff += bsize;
4689                         /*
4690                          * Indicate that we did a readahead so
4691                          * readahead offset is not updated
4692                          * by the synchronous read below.
4693                          */
4694                         readahead_issued = 1;
4695                         mutex_enter(&rp->r_statelock);
4696                         /*
4697                          * set readahead offset to
4698                          * offset of last async readahead
4699                          * request.
4700                          */
4701                         rp->r_nextr = rablkoff;
4702                 }
4703                 mutex_exit(&rp->r_statelock);
4704         }
4705 
4706 again:
4707         if ((pagefound = page_exists(vp, off)) == NULL) {
4708                 if (pl == NULL) {
4709                         (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
4710                             nfs3_readahead);
4711                 } else if (rw == S_CREATE) {
4712                         /*
4713                          * Block for this page is not allocated, or the offset
4714                          * is beyond the current allocation size, or we're
4715                          * allocating a swap slot and the page was not found,
4716                          * so allocate it and return a zero page.
4717                          */
4718                         if ((pp = page_create_va(vp, off,
4719                             PAGESIZE, PG_WAIT, seg, addr)) == NULL)
4720                                 cmn_err(CE_PANIC, "nfs3_getapage: page_create");
4721                         io_len = PAGESIZE;
4722                         mutex_enter(&rp->r_statelock);
4723                         rp->r_nextr = off + PAGESIZE;
4724                         mutex_exit(&rp->r_statelock);
4725                 } else {
4726                         /*
4727                          * Need to go to server to get a BLOCK, exception to
4728                          * that being while reading at offset = 0 or doing
4729                          * random i/o, in that case read only a PAGE.
4730                          */
4731                         mutex_enter(&rp->r_statelock);
4732                         if (blkoff < rp->r_size &&
4733                             blkoff + bsize >= rp->r_size) {
4734                                 /*
4735                                  * If only a block or less is left in
4736                                  * the file, read all that is remaining.
4737                                  */
4738                                 if (rp->r_size <= off) {
4739                                         /*
4740                                          * Trying to access beyond EOF,
4741                                          * set up to get at least one page.
4742                                          */
4743                                         blksize = off + PAGESIZE - blkoff;
4744                                 } else
4745                                         blksize = rp->r_size - blkoff;
4746                         } else if ((off == 0) ||
4747                             (off != rp->r_nextr && !readahead_issued)) {
4748                                 blksize = PAGESIZE;
4749                                 blkoff = off; /* block = page here */
4750                         } else
4751                                 blksize = bsize;
4752                         mutex_exit(&rp->r_statelock);
4753 
4754                         pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4755                             &io_len, blkoff, blksize, 0);
4756 
4757                         /*
4758                          * Some other thread has entered the page,
4759                          * so just use it.
4760                          */
4761                         if (pp == NULL)
4762                                 goto again;
4763 
4764                         /*
4765                          * Now round the request size up to page boundaries.
4766                          * This ensures that the entire page will be
4767                          * initialized to zeroes if EOF is encountered.
4768                          */
4769                         io_len = ptob(btopr(io_len));
4770 
4771                         bp = pageio_setup(pp, io_len, vp, B_READ);
4772                         ASSERT(bp != NULL);
4773 
4774                         /*
4775                          * pageio_setup should have set b_addr to 0.  This
4776                          * is correct since we want to do I/O on a page
4777                          * boundary.  bp_mapin will use this addr to calculate
4778                          * an offset, and then set b_addr to the kernel virtual
4779                          * address it allocated for us.
4780                          */
4781                         ASSERT(bp->b_un.b_addr == 0);
4782 
4783                         bp->b_edev = 0;
4784                         bp->b_dev = 0;
4785                         bp->b_lblkno = lbtodb(io_off);
4786                         bp->b_file = vp;
4787                         bp->b_offset = (offset_t)off;
4788                         bp_mapin(bp);
4789 
4790                         /*
4791                          * If doing a write beyond what we believe is EOF,
4792                          * don't bother trying to read the pages from the
4793                          * server, we'll just zero the pages here.  We
4794                          * don't check that the rw flag is S_WRITE here
4795                          * because some implementations may attempt a
4796                          * read access to the buffer before copying data.
4797                          */
4798                         mutex_enter(&rp->r_statelock);
4799                         if (io_off >= rp->r_size && seg == segkmap) {
4800                                 mutex_exit(&rp->r_statelock);
4801                                 bzero(bp->b_un.b_addr, io_len);
4802                         } else {
4803                                 mutex_exit(&rp->r_statelock);
4804                                 error = nfs3_bio(bp, NULL, cr);
4805                         }
4806 
4807                         /*
4808                          * Unmap the buffer before freeing it.
4809                          */
4810                         bp_mapout(bp);
4811                         pageio_done(bp);
4812 
4813                         savepp = pp;
4814                         do {
4815                                 pp->p_fsdata = C_NOCOMMIT;
4816                         } while ((pp = pp->p_next) != savepp);
4817 
4818                         if (error == NFS_EOF) {
4819                                 /*
4820                                  * If doing a write system call just return
4821                                  * zeroed pages, else user tried to get pages
4822                                  * beyond EOF, return error.  We don't check
4823                                  * that the rw flag is S_WRITE here because
4824                                  * some implementations may attempt a read
4825                                  * access to the buffer before copying data.
4826                                  */
4827                                 if (seg == segkmap)
4828                                         error = 0;
4829                                 else
4830                                         error = EFAULT;
4831                         }
4832 
4833                         if (!readahead_issued && !error) {
4834                                 mutex_enter(&rp->r_statelock);
4835                                 rp->r_nextr = io_off + io_len;
4836                                 mutex_exit(&rp->r_statelock);
4837                         }
4838                 }
4839         }
4840 
4841 out:
4842         if (pl == NULL)
4843                 return (error);
4844 
4845         if (error) {
4846                 if (pp != NULL)
4847                         pvn_read_done(pp, B_ERROR);
4848                 return (error);
4849         }
4850 
4851         if (pagefound) {
4852                 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
4853 
4854                 /*
4855                  * Page exists in the cache, acquire the appropriate lock.
4856                  * If this fails, start all over again.
4857                  */
4858                 if ((pp = page_lookup(vp, off, se)) == NULL) {
4859 #ifdef DEBUG
4860                         nfs3_lostpage++;
4861 #endif
4862                         goto reread;
4863                 }
4864                 pl[0] = pp;
4865                 pl[1] = NULL;
4866                 return (0);
4867         }
4868 
4869         if (pp != NULL)
4870                 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4871 
4872         return (error);
4873 }
4874 
4875 static void
4876 nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
4877         cred_t *cr)
4878 {
4879         int error;
4880         page_t *pp;
4881         u_offset_t io_off;
4882         size_t io_len;
4883         struct buf *bp;
4884         uint_t bsize, blksize;
4885         rnode_t *rp = VTOR(vp);
4886         page_t *savepp;
4887 
4888         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4889         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4890 
4891         mutex_enter(&rp->r_statelock);
4892         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4893                 /*
4894                  * If less than a block left in file read less
4895                  * than a block.
4896                  */
4897                 blksize = rp->r_size - blkoff;
4898         } else
4899                 blksize = bsize;
4900         mutex_exit(&rp->r_statelock);
4901 
4902         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4903             &io_off, &io_len, blkoff, blksize, 1);
4904         /*
4905          * The isra flag passed to the kluster function is 1, we may have
4906          * gotten a return value of NULL for a variety of reasons (# of free
4907          * pages < minfree, someone entered the page on the vnode etc). In all
4908          * cases, we want to punt on the readahead.
4909          */
4910         if (pp == NULL)
4911                 return;
4912 
4913         /*
4914          * Now round the request size up to page boundaries.
4915          * This ensures that the entire page will be
4916          * initialized to zeroes if EOF is encountered.
4917          */
4918         io_len = ptob(btopr(io_len));
4919 
4920         bp = pageio_setup(pp, io_len, vp, B_READ);
4921         ASSERT(bp != NULL);
4922 
4923         /*
4924          * pageio_setup should have set b_addr to 0.  This is correct since
4925          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4926          * to calculate an offset, and then set b_addr to the kernel virtual
4927          * address it allocated for us.
4928          */
4929         ASSERT(bp->b_un.b_addr == 0);
4930 
4931         bp->b_edev = 0;
4932         bp->b_dev = 0;
4933         bp->b_lblkno = lbtodb(io_off);
4934         bp->b_file = vp;
4935         bp->b_offset = (offset_t)blkoff;
4936         bp_mapin(bp);
4937 
4938         /*
4939          * If doing a write beyond what we believe is EOF, don't bother trying
4940          * to read the pages from the server, we'll just zero the pages here.
4941          * We don't check that the rw flag is S_WRITE here because some
4942          * implementations may attempt a read access to the buffer before
4943          * copying data.
4944          */
4945         mutex_enter(&rp->r_statelock);
4946         if (io_off >= rp->r_size && seg == segkmap) {
4947                 mutex_exit(&rp->r_statelock);
4948                 bzero(bp->b_un.b_addr, io_len);
4949                 error = 0;
4950         } else {
4951                 mutex_exit(&rp->r_statelock);
4952                 error = nfs3_bio(bp, NULL, cr);
4953                 if (error == NFS_EOF)
4954                         error = 0;
4955         }
4956 
4957         /*
4958          * Unmap the buffer before freeing it.
4959          */
4960         bp_mapout(bp);
4961         pageio_done(bp);
4962 
4963         savepp = pp;
4964         do {
4965                 pp->p_fsdata = C_NOCOMMIT;
4966         } while ((pp = pp->p_next) != savepp);
4967 
4968         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4969 
4970         /*
4971          * In case of error set readahead offset
4972          * to the lowest offset.
4973          * pvn_read_done() calls VN_DISPOSE to destroy the pages
4974          */
4975         if (error && rp->r_nextr > io_off) {
4976                 mutex_enter(&rp->r_statelock);
4977                 if (rp->r_nextr > io_off)
4978                         rp->r_nextr = io_off;
4979                 mutex_exit(&rp->r_statelock);
4980         }
4981 }
4982 
4983 /*
4984  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4985  * If len == 0, do from off to EOF.
4986  *
4987  * The normal cases should be len == 0 && off == 0 (entire vp list),
4988  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4989  * (from pageout).
4990  */
4991 /* ARGSUSED */
4992 static int
4993 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4994         caller_context_t *ct)
4995 {
4996         int error;
4997         rnode_t *rp;
4998 
4999         ASSERT(cr != NULL);
5000 
5001         /*
5002          * XXX - Why should this check be made here?
5003          */
5004         if (vp->v_flag & VNOMAP)
5005                 return (ENOSYS);
5006         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
5007                 return (0);
5008         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5009                 return (EIO);
5010 
5011         rp = VTOR(vp);
5012         mutex_enter(&rp->r_statelock);
5013         rp->r_count++;
5014         mutex_exit(&rp->r_statelock);
5015         error = nfs_putpages(vp, off, len, flags, cr);
5016         mutex_enter(&rp->r_statelock);
5017         rp->r_count--;
5018         cv_broadcast(&rp->r_cv);
5019         mutex_exit(&rp->r_statelock);
5020 
5021         return (error);
5022 }
5023 
5024 /*
5025  * Write out a single page, possibly klustering adjacent dirty pages.
5026  */
5027 int
5028 nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
5029         int flags, cred_t *cr)
5030 {
5031         u_offset_t io_off;
5032         u_offset_t lbn_off;
5033         u_offset_t lbn;
5034         size_t io_len;
5035         uint_t bsize;
5036         int error;
5037         rnode_t *rp;
5038 
5039         ASSERT(!vn_is_readonly(vp));
5040         ASSERT(pp != NULL);
5041         ASSERT(cr != NULL);
5042         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
5043 
5044         rp = VTOR(vp);
5045         ASSERT(rp->r_count > 0);
5046 
5047         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
5048         lbn = pp->p_offset / bsize;
5049         lbn_off = lbn * bsize;
5050 
5051         /*
5052          * Find a kluster that fits in one block, or in
5053          * one page if pages are bigger than blocks.  If
5054          * there is less file space allocated than a whole
5055          * page, we'll shorten the i/o request below.
5056          */
5057         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
5058             roundup(bsize, PAGESIZE), flags);
5059 
5060         /*
5061          * pvn_write_kluster shouldn't have returned a page with offset
5062          * behind the original page we were given.  Verify that.
5063          */
5064         ASSERT((pp->p_offset / bsize) >= lbn);
5065 
5066         /*
5067          * Now pp will have the list of kept dirty pages marked for
5068          * write back.  It will also handle invalidation and freeing
5069          * of pages that are not dirty.  Check for page length rounding
5070          * problems.
5071          */
5072         if (io_off + io_len > lbn_off + bsize) {
5073                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5074                 io_len = lbn_off + bsize - io_off;
5075         }
5076         /*
5077          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5078          * consistent value of r_size. RMODINPROGRESS is set in writerp().
5079          * When RMODINPROGRESS is set it indicates that a uiomove() is in
5080          * progress and the r_size has not been made consistent with the
5081          * new size of the file. When the uiomove() completes the r_size is
5082          * updated and the RMODINPROGRESS flag is cleared.
5083          *
5084          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5085          * consistent value of r_size. Without this handshaking, it is
5086          * possible that nfs(3)_bio() picks  up the old value of r_size
5087          * before the uiomove() in writerp() completes. This will result
5088          * in the write through nfs(3)_bio() being dropped.
5089          *
5090          * More precisely, there is a window between the time the uiomove()
5091          * completes and the time the r_size is updated. If a VOP_PUTPAGE()
5092          * operation intervenes in this window, the page will be picked up,
5093          * because it is dirty (it will be unlocked, unless it was
5094          * pagecreate'd). When the page is picked up as dirty, the dirty
5095          * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5096          * checked. This will still be the old size. Therefore the page will
5097          * not be written out. When segmap_release() calls VOP_PUTPAGE(),
5098          * the page will be found to be clean and the write will be dropped.
5099          */
5100         if (rp->r_flags & RMODINPROGRESS) {
5101                 mutex_enter(&rp->r_statelock);
5102                 if ((rp->r_flags & RMODINPROGRESS) &&
5103                     rp->r_modaddr + MAXBSIZE > io_off &&
5104                     rp->r_modaddr < io_off + io_len) {
5105                         page_t *plist;
5106                         /*
5107                          * A write is in progress for this region of the file.
5108                          * If we did not detect RMODINPROGRESS here then this
5109                          * path through nfs_putapage() would eventually go to
5110                          * nfs(3)_bio() and may not write out all of the data
5111                          * in the pages. We end up losing data. So we decide
5112                          * to set the modified bit on each page in the page
5113                          * list and mark the rnode with RDIRTY. This write
5114                          * will be restarted at some later time.
5115                          */
5116                         plist = pp;
5117                         while (plist != NULL) {
5118                                 pp = plist;
5119                                 page_sub(&plist, pp);
5120                                 hat_setmod(pp);
5121                                 page_io_unlock(pp);
5122                                 page_unlock(pp);
5123                         }
5124                         rp->r_flags |= RDIRTY;
5125                         mutex_exit(&rp->r_statelock);
5126                         if (offp)
5127                                 *offp = io_off;
5128                         if (lenp)
5129                                 *lenp = io_len;
5130                         return (0);
5131                 }
5132                 mutex_exit(&rp->r_statelock);
5133         }
5134 
5135         if (flags & B_ASYNC) {
5136                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5137                     nfs3_sync_putapage);
5138         } else
5139                 error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5140 
5141         if (offp)
5142                 *offp = io_off;
5143         if (lenp)
5144                 *lenp = io_len;
5145         return (error);
5146 }
5147 
5148 static int
5149 nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5150         int flags, cred_t *cr)
5151 {
5152         int error;
5153         rnode_t *rp;
5154 
5155         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5156 
5157         flags |= B_WRITE;
5158 
5159         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5160 
5161         rp = VTOR(vp);
5162 
5163         if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5164             error == EACCES) &&
5165             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
5166                 if (!(rp->r_flags & ROUTOFSPACE)) {
5167                         mutex_enter(&rp->r_statelock);
5168                         rp->r_flags |= ROUTOFSPACE;
5169                         mutex_exit(&rp->r_statelock);
5170                 }
5171                 flags |= B_ERROR;
5172                 pvn_write_done(pp, flags);
5173                 /*
5174                  * If this was not an async thread, then try again to
5175                  * write out the pages, but this time, also destroy
5176                  * them whether or not the write is successful.  This
5177                  * will prevent memory from filling up with these
5178                  * pages and destroying them is the only alternative
5179                  * if they can't be written out.
5180                  *
5181                  * Don't do this if this is an async thread because
5182                  * when the pages are unlocked in pvn_write_done,
5183                  * some other thread could have come along, locked
5184                  * them, and queued for an async thread.  It would be
5185                  * possible for all of the async threads to be tied
5186                  * up waiting to lock the pages again and they would
5187                  * all already be locked and waiting for an async
5188                  * thread to handle them.  Deadlock.
5189                  */
5190                 if (!(flags & B_ASYNC)) {
5191                         error = nfs3_putpage(vp, io_off, io_len,
5192                             B_INVAL | B_FORCE, cr, NULL);
5193                 }
5194         } else {
5195                 if (error)
5196                         flags |= B_ERROR;
5197                 else if (rp->r_flags & ROUTOFSPACE) {
5198                         mutex_enter(&rp->r_statelock);
5199                         rp->r_flags &= ~ROUTOFSPACE;
5200                         mutex_exit(&rp->r_statelock);
5201                 }
5202                 pvn_write_done(pp, flags);
5203                 if (freemem < desfree)
5204                         (void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr);
5205         }
5206 
5207         return (error);
5208 }
5209 
5210 /* ARGSUSED */
5211 static int
5212 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5213         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5214         cred_t *cr, caller_context_t *ct)
5215 {
5216         struct segvn_crargs vn_a;
5217         int error;
5218         rnode_t *rp;
5219         struct vattr va;
5220 
5221         if (nfs_zone() != VTOMI(vp)->mi_zone)
5222                 return (EIO);
5223 
5224         if (vp->v_flag & VNOMAP)
5225                 return (ENOSYS);
5226 
5227         if (off < 0 || off + len < 0)
5228                 return (ENXIO);
5229 
5230         if (vp->v_type != VREG)
5231                 return (ENODEV);
5232 
5233         /*
5234          * If there is cached data and if close-to-open consistency
5235          * checking is not turned off and if the file system is not
5236          * mounted readonly, then force an over the wire getattr.
5237          * Otherwise, just invoke nfs3getattr to get a copy of the
5238          * attributes.  The attribute cache will be used unless it
5239          * is timed out and if it is, then an over the wire getattr
5240          * will be issued.
5241          */
5242         va.va_mask = AT_ALL;
5243         if (vn_has_cached_data(vp) &&
5244             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5245                 error = nfs3_getattr_otw(vp, &va, cr);
5246         else
5247                 error = nfs3getattr(vp, &va, cr);
5248         if (error)
5249                 return (error);
5250 
5251         /*
5252          * Check to see if the vnode is currently marked as not cachable.
5253          * This means portions of the file are locked (through VOP_FRLOCK).
5254          * In this case the map request must be refused.  We use
5255          * rp->r_lkserlock to avoid a race with concurrent lock requests.
5256          */
5257         rp = VTOR(vp);
5258 
5259         /*
5260          * Atomically increment r_inmap after acquiring r_rwlock. The
5261          * idea here is to acquire r_rwlock to block read/write and
5262          * not to protect r_inmap. r_inmap will inform nfs3_read/write()
5263          * that we are in nfs3_map(). Now, r_rwlock is acquired in order
5264          * and we can prevent the deadlock that would have occurred
5265          * when nfs3_addmap() would have acquired it out of order.
5266          *
5267          * Since we are not protecting r_inmap by any lock, we do not
5268          * hold any lock when we decrement it. We atomically decrement
5269          * r_inmap after we release r_lkserlock.
5270          */
5271 
5272         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5273                 return (EINTR);
5274         atomic_add_int(&rp->r_inmap, 1);
5275         nfs_rw_exit(&rp->r_rwlock);
5276 
5277         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
5278                 atomic_add_int(&rp->r_inmap, -1);
5279                 return (EINTR);
5280         }
5281 
5282         if (vp->v_flag & VNOCACHE) {
5283                 error = EAGAIN;
5284                 goto done;
5285         }
5286 
5287         /*
5288          * Don't allow concurrent locks and mapping if mandatory locking is
5289          * enabled.
5290          */
5291         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5292             MANDLOCK(vp, va.va_mode)) {
5293                 error = EAGAIN;
5294                 goto done;
5295         }
5296 
5297         as_rangelock(as);
5298         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5299         if (error != 0) {
5300                 as_rangeunlock(as);
5301                 goto done;
5302         }
5303 
5304         vn_a.vp = vp;
5305         vn_a.offset = off;
5306         vn_a.type = (flags & MAP_TYPE);
5307         vn_a.prot = (uchar_t)prot;
5308         vn_a.maxprot = (uchar_t)maxprot;
5309         vn_a.flags = (flags & ~MAP_TYPE);
5310         vn_a.cred = cr;
5311         vn_a.amp = NULL;
5312         vn_a.szc = 0;
5313         vn_a.lgrp_mem_policy_flags = 0;
5314 
5315         error = as_map(as, *addrp, len, segvn_create, &vn_a);
5316         as_rangeunlock(as);
5317 
5318 done:
5319         nfs_rw_exit(&rp->r_lkserlock);
5320         atomic_add_int(&rp->r_inmap, -1);
5321         return (error);
5322 }
5323 
5324 /* ARGSUSED */
5325 static int
5326 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5327         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5328         cred_t *cr, caller_context_t *ct)
5329 {
5330         rnode_t *rp;
5331 
5332         if (vp->v_flag & VNOMAP)
5333                 return (ENOSYS);
5334         if (nfs_zone() != VTOMI(vp)->mi_zone)
5335                 return (EIO);
5336 
5337         rp = VTOR(vp);
5338         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5339 
5340         return (0);
5341 }
5342 
5343 /* ARGSUSED */
5344 static int
5345 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5346         offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5347         caller_context_t *ct)
5348 {
5349         netobj lm_fh3;
5350         int rc;
5351         u_offset_t start, end;
5352         rnode_t *rp;
5353         int error = 0, intr = INTR(vp);
5354 
5355         if (nfs_zone() != VTOMI(vp)->mi_zone)
5356                 return (EIO);
5357         /* check for valid cmd parameter */
5358         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5359                 return (EINVAL);
5360 
5361         /* Verify l_type. */
5362         switch (bfp->l_type) {
5363         case F_RDLCK:
5364                 if (cmd != F_GETLK && !(flag & FREAD))
5365                         return (EBADF);
5366                 break;
5367         case F_WRLCK:
5368                 if (cmd != F_GETLK && !(flag & FWRITE))
5369                         return (EBADF);
5370                 break;
5371         case F_UNLCK:
5372                 intr = 0;
5373                 break;
5374 
5375         default:
5376                 return (EINVAL);
5377         }
5378 
5379         /* check the validity of the lock range */
5380         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5381                 return (rc);
5382         if (rc = flk_check_lock_data(start, end, MAXEND))
5383                 return (rc);
5384 
5385         /*
5386          * If the filesystem is mounted using local locking, pass the
5387          * request off to the local locking code.
5388          */
5389         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5390                 if (cmd == F_SETLK || cmd == F_SETLKW) {
5391                         /*
5392                          * For complete safety, we should be holding
5393                          * r_lkserlock.  However, we can't call
5394                          * lm_safelock and then fs_frlock while
5395                          * holding r_lkserlock, so just invoke
5396                          * lm_safelock and expect that this will
5397                          * catch enough of the cases.
5398                          */
5399                         if (!lm_safelock(vp, bfp, cr))
5400                                 return (EAGAIN);
5401                 }
5402                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5403         }
5404 
5405         rp = VTOR(vp);
5406 
5407         /*
5408          * Check whether the given lock request can proceed, given the
5409          * current file mappings.
5410          */
5411         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5412                 return (EINTR);
5413         if (cmd == F_SETLK || cmd == F_SETLKW) {
5414                 if (!lm_safelock(vp, bfp, cr)) {
5415                         rc = EAGAIN;
5416                         goto done;
5417                 }
5418         }
5419 
5420         /*
5421          * Flush the cache after waiting for async I/O to finish.  For new
5422          * locks, this is so that the process gets the latest bits from the
5423          * server.  For unlocks, this is so that other clients see the
5424          * latest bits once the file has been unlocked.  If currently dirty
5425          * pages can't be flushed, then don't allow a lock to be set.  But
5426          * allow unlocks to succeed, to avoid having orphan locks on the
5427          * server.
5428          */
5429         if (cmd != F_GETLK) {
5430                 mutex_enter(&rp->r_statelock);
5431                 while (rp->r_count > 0) {
5432                         if (intr) {
5433                                 klwp_t *lwp = ttolwp(curthread);
5434 
5435                                 if (lwp != NULL)
5436                                         lwp->lwp_nostop++;
5437                                 if (cv_wait_sig(&rp->r_cv,
5438                                     &rp->r_statelock) == 0) {
5439                                         if (lwp != NULL)
5440                                                 lwp->lwp_nostop--;
5441                                         rc = EINTR;
5442                                         break;
5443                                 }
5444                                 if (lwp != NULL)
5445                                         lwp->lwp_nostop--;
5446                         } else
5447                                 cv_wait(&rp->r_cv, &rp->r_statelock);
5448                 }
5449                 mutex_exit(&rp->r_statelock);
5450                 if (rc != 0)
5451                         goto done;
5452                 error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
5453                 if (error) {
5454                         if (error == ENOSPC || error == EDQUOT) {
5455                                 mutex_enter(&rp->r_statelock);
5456                                 if (!rp->r_error)
5457                                         rp->r_error = error;
5458                                 mutex_exit(&rp->r_statelock);
5459                         }
5460                         if (bfp->l_type != F_UNLCK) {
5461                                 rc = ENOLCK;
5462                                 goto done;
5463                         }
5464                 }
5465         }
5466 
5467         lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5468         lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5469 
5470         /*
5471          * Call the lock manager to do the real work of contacting
5472          * the server and obtaining the lock.
5473          */
5474         rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5475 
5476         if (rc == 0)
5477                 nfs_lockcompletion(vp, cmd);
5478 
5479 done:
5480         nfs_rw_exit(&rp->r_lkserlock);
5481         return (rc);
5482 }
5483 
5484 /*
5485  * Free storage space associated with the specified vnode.  The portion
5486  * to be freed is specified by bfp->l_start and bfp->l_len (already
5487  * normalized to a "whence" of 0).
5488  *
5489  * This is an experimental facility whose continued existence is not
5490  * guaranteed.  Currently, we only support the special case
5491  * of l_len == 0, meaning free to end of file.
5492  */
5493 /* ARGSUSED */
5494 static int
5495 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5496         offset_t offset, cred_t *cr, caller_context_t *ct)
5497 {
5498         int error;
5499 
5500         ASSERT(vp->v_type == VREG);
5501         if (cmd != F_FREESP)
5502                 return (EINVAL);
5503         if (nfs_zone() != VTOMI(vp)->mi_zone)
5504                 return (EIO);
5505 
5506         error = convoff(vp, bfp, 0, offset);
5507         if (!error) {
5508                 ASSERT(bfp->l_start >= 0);
5509                 if (bfp->l_len == 0) {
5510                         struct vattr va;
5511 
5512                         /*
5513                          * ftruncate should not change the ctime and
5514                          * mtime if we truncate the file to its
5515                          * previous size.
5516                          */
5517                         va.va_mask = AT_SIZE;
5518                         error = nfs3getattr(vp, &va, cr);
5519                         if (error || va.va_size == bfp->l_start)
5520                                 return (error);
5521                         va.va_mask = AT_SIZE;
5522                         va.va_size = bfp->l_start;
5523                         error = nfs3setattr(vp, &va, 0, cr);
5524                 } else
5525                         error = EINVAL;
5526         }
5527 
5528         return (error);
5529 }
5530 
5531 /* ARGSUSED */
5532 static int
5533 nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5534 {
5535 
5536         return (EINVAL);
5537 }
5538 
5539 /*
5540  * Setup and add an address space callback to do the work of the delmap call.
5541  * The callback will (and must be) deleted in the actual callback function.
5542  *
5543  * This is done in order to take care of the problem that we have with holding
5544  * the address space's a_lock for a long period of time (e.g. if the NFS server
5545  * is down).  Callbacks will be executed in the address space code while the
5546  * a_lock is not held.  Holding the address space's a_lock causes things such
5547  * as ps and fork to hang because they are trying to acquire this lock as well.
5548  */
5549 /* ARGSUSED */
5550 static int
5551 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5552         size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5553         cred_t *cr, caller_context_t *ct)
5554 {
5555         int                     caller_found;
5556         int                     error;
5557         rnode_t                 *rp;
5558         nfs_delmap_args_t       *dmapp;
5559         nfs_delmapcall_t        *delmap_call;
5560 
5561         if (vp->v_flag & VNOMAP)
5562                 return (ENOSYS);
5563         /*
5564          * A process may not change zones if it has NFS pages mmap'ed
5565          * in, so we can't legitimately get here from the wrong zone.
5566          */
5567         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5568 
5569         rp = VTOR(vp);
5570 
5571         /*
5572          * The way that the address space of this process deletes its mapping
5573          * of this file is via the following call chains:
5574          * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5575          * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5576          *
5577          * With the use of address space callbacks we are allowed to drop the
5578          * address space lock, a_lock, while executing the NFS operations that
5579          * need to go over the wire.  Returning EAGAIN to the caller of this
5580          * function is what drives the execution of the callback that we add
5581          * below.  The callback will be executed by the address space code
5582          * after dropping the a_lock.  When the callback is finished, since
5583          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5584          * is called again on the same segment to finish the rest of the work
5585          * that needs to happen during unmapping.
5586          *
5587          * This action of calling back into the segment driver causes
5588          * nfs3_delmap() to get called again, but since the callback was
5589          * already executed at this point, it already did the work and there
5590          * is nothing left for us to do.
5591          *
5592          * To Summarize:
5593          * - The first time nfs3_delmap is called by the current thread is when
5594          * we add the caller associated with this delmap to the delmap caller
5595          * list, add the callback, and return EAGAIN.
5596          * - The second time in this call chain when nfs3_delmap is called we
5597          * will find this caller in the delmap caller list and realize there
5598          * is no more work to do thus removing this caller from the list and
5599          * returning the error that was set in the callback execution.
5600          */
5601         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5602         if (caller_found) {
5603                 /*
5604                  * 'error' is from the actual delmap operations.  To avoid
5605                  * hangs, we need to handle the return of EAGAIN differently
5606                  * since this is what drives the callback execution.
5607                  * In this case, we don't want to return EAGAIN and do the
5608                  * callback execution because there are none to execute.
5609                  */
5610                 if (error == EAGAIN)
5611                         return (0);
5612                 else
5613                         return (error);
5614         }
5615 
5616         /* current caller was not in the list */
5617         delmap_call = nfs_init_delmapcall();
5618 
5619         mutex_enter(&rp->r_statelock);
5620         list_insert_tail(&rp->r_indelmap, delmap_call);
5621         mutex_exit(&rp->r_statelock);
5622 
5623         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5624 
5625         dmapp->vp = vp;
5626         dmapp->off = off;
5627         dmapp->addr = addr;
5628         dmapp->len = len;
5629         dmapp->prot = prot;
5630         dmapp->maxprot = maxprot;
5631         dmapp->flags = flags;
5632         dmapp->cr = cr;
5633         dmapp->caller = delmap_call;
5634 
5635         error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5636             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5637 
5638         return (error ? error : EAGAIN);
5639 }
5640 
5641 /*
5642  * Remove some pages from an mmap'd vnode.  Just update the
5643  * count of pages.  If doing close-to-open, then flush and
5644  * commit all of the pages associated with this file.
5645  * Otherwise, start an asynchronous page flush to write out
5646  * any dirty pages.  This will also associate a credential
5647  * with the rnode which can be used to write the pages.
5648  */
5649 /* ARGSUSED */
5650 static void
5651 nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5652 {
5653         int                     error;
5654         rnode_t                 *rp;
5655         mntinfo_t               *mi;
5656         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
5657 
5658         rp = VTOR(dmapp->vp);
5659         mi = VTOMI(dmapp->vp);
5660 
5661         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5662         ASSERT(rp->r_mapcnt >= 0);
5663 
5664         /*
5665          * Initiate a page flush and potential commit if there are
5666          * pages, the file system was not mounted readonly, the segment
5667          * was mapped shared, and the pages themselves were writeable.
5668          */
5669         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5670             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5671                 mutex_enter(&rp->r_statelock);
5672                 rp->r_flags |= RDIRTY;
5673                 mutex_exit(&rp->r_statelock);
5674                 /*
5675                  * If this is a cross-zone access a sync putpage won't work, so
5676                  * the best we can do is try an async putpage.  That seems
5677                  * better than something more draconian such as discarding the
5678                  * dirty pages.
5679                  */
5680                 if ((mi->mi_flags & MI_NOCTO) ||
5681                     nfs_zone() != mi->mi_zone)
5682                         error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5683                             B_ASYNC, dmapp->cr, NULL);
5684                 else
5685                         error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5686                             dmapp->len, dmapp->cr);
5687                 if (!error) {
5688                         mutex_enter(&rp->r_statelock);
5689                         error = rp->r_error;
5690                         rp->r_error = 0;
5691                         mutex_exit(&rp->r_statelock);
5692                 }
5693         } else
5694                 error = 0;
5695 
5696         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5697                 (void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5698                     B_INVAL, dmapp->cr, NULL);
5699 
5700         dmapp->caller->error = error;
5701         (void) as_delete_callback(as, arg);
5702         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5703 }
5704 
5705 static int nfs3_pathconf_disable_cache = 0;
5706 
5707 #ifdef DEBUG
5708 static int nfs3_pathconf_cache_hits = 0;
5709 static int nfs3_pathconf_cache_misses = 0;
5710 #endif
5711 
5712 /* ARGSUSED */
5713 static int
5714 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5715         caller_context_t *ct)
5716 {
5717         int error;
5718         PATHCONF3args args;
5719         PATHCONF3res res;
5720         int douprintf;
5721         failinfo_t fi;
5722         rnode_t *rp;
5723         hrtime_t t;
5724 
5725         if (nfs_zone() != VTOMI(vp)->mi_zone)
5726                 return (EIO);
5727         /*
5728          * Large file spec - need to base answer on info stored
5729          * on original FSINFO response.
5730          */
5731         if (cmd == _PC_FILESIZEBITS) {
5732                 unsigned long long ll;
5733                 long l = 1;
5734 
5735                 ll = VTOMI(vp)->mi_maxfilesize;
5736 
5737                 if (ll == 0) {
5738                         *valp = 0;
5739                         return (0);
5740                 }
5741 
5742                 if (ll & 0xffffffff00000000) {
5743                         l += 32; ll >>= 32;
5744                 }
5745                 if (ll & 0xffff0000) {
5746                         l += 16; ll >>= 16;
5747                 }
5748                 if (ll & 0xff00) {
5749                         l += 8; ll >>= 8;
5750                 }
5751                 if (ll & 0xf0) {
5752                         l += 4; ll >>= 4;
5753                 }
5754                 if (ll & 0xc) {
5755                         l += 2; ll >>= 2;
5756                 }
5757                 if (ll & 0x2)
5758                         l += 2;
5759                 else if (ll & 0x1)
5760                         l += 1;
5761                 *valp = l;
5762                 return (0);
5763         }
5764 
5765         if (cmd == _PC_ACL_ENABLED) {
5766                 *valp = _ACL_ACLENT_ENABLED;
5767                 return (0);
5768         }
5769 
5770         if (cmd == _PC_XATTR_EXISTS) {
5771                 error = 0;
5772                 *valp = 0;
5773                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5774                         vnode_t *avp;
5775                         rnode_t *rp;
5776                         int error = 0;
5777                         mntinfo_t *mi = VTOMI(vp);
5778 
5779                         if (!(mi->mi_flags & MI_EXTATTR))
5780                                 return (0);
5781 
5782                         rp = VTOR(vp);
5783                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5784                             INTR(vp)))
5785                                 return (EINTR);
5786 
5787                         error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5788                         if (error || avp == NULL)
5789                                 error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5790 
5791                         nfs_rw_exit(&rp->r_rwlock);
5792 
5793                         if (error == 0 && avp != NULL) {
5794                                 error = do_xattr_exists_check(avp, valp, cr);
5795                                 VN_RELE(avp);
5796                         } else if (error == ENOENT) {
5797                                 error = 0;
5798                                 *valp = 0;
5799                         }
5800                 }
5801                 return (error);
5802         }
5803 
5804         rp = VTOR(vp);
5805         if (rp->r_pathconf != NULL) {
5806                 mutex_enter(&rp->r_statelock);
5807                 if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5808                         kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5809                         rp->r_pathconf = NULL;
5810                 }
5811                 if (rp->r_pathconf != NULL) {
5812                         error = 0;
5813                         switch (cmd) {
5814                         case _PC_LINK_MAX:
5815                                 *valp = rp->r_pathconf->link_max;
5816                                 break;
5817                         case _PC_NAME_MAX:
5818                                 *valp = rp->r_pathconf->name_max;
5819                                 break;
5820                         case _PC_PATH_MAX:
5821                         case _PC_SYMLINK_MAX:
5822                                 *valp = MAXPATHLEN;
5823                                 break;
5824                         case _PC_CHOWN_RESTRICTED:
5825                                 *valp = rp->r_pathconf->chown_restricted;
5826                                 break;
5827                         case _PC_NO_TRUNC:
5828                                 *valp = rp->r_pathconf->no_trunc;
5829                                 break;
5830                         default:
5831                                 error = EINVAL;
5832                                 break;
5833                         }
5834                         mutex_exit(&rp->r_statelock);
5835 #ifdef DEBUG
5836                         nfs3_pathconf_cache_hits++;
5837 #endif
5838                         return (error);
5839                 }
5840                 mutex_exit(&rp->r_statelock);
5841         }
5842 #ifdef DEBUG
5843         nfs3_pathconf_cache_misses++;
5844 #endif
5845 
5846         args.object = *VTOFH3(vp);
5847         fi.vp = vp;
5848         fi.fhp = (caddr_t)&args.object;
5849         fi.copyproc = nfs3copyfh;
5850         fi.lookupproc = nfs3lookup;
5851         fi.xattrdirproc = acl_getxattrdir3;
5852 
5853         douprintf = 1;
5854 
5855         t = gethrtime();
5856 
5857         error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5858             xdr_nfs_fh3, (caddr_t)&args,
5859             xdr_PATHCONF3res, (caddr_t)&res, cr,
5860             &douprintf, &res.status, 0, &fi);
5861 
5862         if (error)
5863                 return (error);
5864 
5865         error = geterrno3(res.status);
5866 
5867         if (!error) {
5868                 nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5869                 if (!nfs3_pathconf_disable_cache) {
5870                         mutex_enter(&rp->r_statelock);
5871                         if (rp->r_pathconf == NULL) {
5872                                 rp->r_pathconf = kmem_alloc(
5873                                     sizeof (*rp->r_pathconf), KM_NOSLEEP);
5874                                 if (rp->r_pathconf != NULL)
5875                                         *rp->r_pathconf = res.resok.info;
5876                         }
5877                         mutex_exit(&rp->r_statelock);
5878                 }
5879                 switch (cmd) {
5880                 case _PC_LINK_MAX:
5881                         *valp = res.resok.info.link_max;
5882                         break;
5883                 case _PC_NAME_MAX:
5884                         *valp = res.resok.info.name_max;
5885                         break;
5886                 case _PC_PATH_MAX:
5887                 case _PC_SYMLINK_MAX:
5888                         *valp = MAXPATHLEN;
5889                         break;
5890                 case _PC_CHOWN_RESTRICTED:
5891                         *valp = res.resok.info.chown_restricted;
5892                         break;
5893                 case _PC_NO_TRUNC:
5894                         *valp = res.resok.info.no_trunc;
5895                         break;
5896                 default:
5897                         return (EINVAL);
5898                 }
5899         } else {
5900                 nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5901                 PURGE_STALE_FH(error, vp, cr);
5902         }
5903 
5904         return (error);
5905 }
5906 
5907 /*
5908  * Called by async thread to do synchronous pageio. Do the i/o, wait
5909  * for it to complete, and cleanup the page list when done.
5910  */
5911 static int
5912 nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5913         int flags, cred_t *cr)
5914 {
5915         int error;
5916 
5917         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5918         error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5919         if (flags & B_READ)
5920                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5921         else
5922                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5923         return (error);
5924 }
5925 
5926 /* ARGSUSED */
5927 static int
5928 nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5929         int flags, cred_t *cr, caller_context_t *ct)
5930 {
5931         int error;
5932         rnode_t *rp;
5933 
5934         if (pp == NULL)
5935                 return (EINVAL);
5936         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5937                 return (EIO);
5938 
5939         rp = VTOR(vp);
5940         mutex_enter(&rp->r_statelock);
5941         rp->r_count++;
5942         mutex_exit(&rp->r_statelock);
5943 
5944         if (flags & B_ASYNC) {
5945                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5946                     nfs3_sync_pageio);
5947         } else
5948                 error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5949         mutex_enter(&rp->r_statelock);
5950         rp->r_count--;
5951         cv_broadcast(&rp->r_cv);
5952         mutex_exit(&rp->r_statelock);
5953         return (error);
5954 }
5955 
5956 /* ARGSUSED */
5957 static void
5958 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5959         caller_context_t *ct)
5960 {
5961         int error;
5962         rnode_t *rp;
5963         page_t *plist;
5964         page_t *pptr;
5965         offset3 offset;
5966         count3 len;
5967         k_sigset_t smask;
5968 
5969         /*
5970          * We should get called with fl equal to either B_FREE or
5971          * B_INVAL.  Any other value is illegal.
5972          *
5973          * The page that we are either supposed to free or destroy
5974          * should be exclusive locked and its io lock should not
5975          * be held.
5976          */
5977         ASSERT(fl == B_FREE || fl == B_INVAL);
5978         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5979         rp = VTOR(vp);
5980 
5981         /*
5982          * If the page doesn't need to be committed or we shouldn't
5983          * even bother attempting to commit it, then just make sure
5984          * that the p_fsdata byte is clear and then either free or
5985          * destroy the page as appropriate.
5986          */
5987         if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5988                 pp->p_fsdata = C_NOCOMMIT;
5989                 if (fl == B_FREE)
5990                         page_free(pp, dn);
5991                 else
5992                         page_destroy(pp, dn);
5993                 return;
5994         }
5995 
5996         /*
5997          * If there is a page invalidation operation going on, then
5998          * if this is one of the pages being destroyed, then just
5999          * clear the p_fsdata byte and then either free or destroy
6000          * the page as appropriate.
6001          */
6002         mutex_enter(&rp->r_statelock);
6003         if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
6004                 mutex_exit(&rp->r_statelock);
6005                 pp->p_fsdata = C_NOCOMMIT;
6006                 if (fl == B_FREE)
6007                         page_free(pp, dn);
6008                 else
6009                         page_destroy(pp, dn);
6010                 return;
6011         }
6012 
6013         /*
6014          * If we are freeing this page and someone else is already
6015          * waiting to do a commit, then just unlock the page and
6016          * return.  That other thread will take care of commiting
6017          * this page.  The page can be freed sometime after the
6018          * commit has finished.  Otherwise, if the page is marked
6019          * as delay commit, then we may be getting called from
6020          * pvn_write_done, one page at a time.   This could result
6021          * in one commit per page, so we end up doing lots of small
6022          * commits instead of fewer larger commits.  This is bad,
6023          * we want do as few commits as possible.
6024          */
6025         if (fl == B_FREE) {
6026                 if (rp->r_flags & RCOMMITWAIT) {
6027                         page_unlock(pp);
6028                         mutex_exit(&rp->r_statelock);
6029                         return;
6030                 }
6031                 if (pp->p_fsdata == C_DELAYCOMMIT) {
6032                         pp->p_fsdata = C_COMMIT;
6033                         page_unlock(pp);
6034                         mutex_exit(&rp->r_statelock);
6035                         return;
6036                 }
6037         }
6038 
6039         /*
6040          * Check to see if there is a signal which would prevent an
6041          * attempt to commit the pages from being successful.  If so,
6042          * then don't bother with all of the work to gather pages and
6043          * generate the unsuccessful RPC.  Just return from here and
6044          * let the page be committed at some later time.
6045          */
6046         sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
6047         if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
6048                 sigunintr(&smask);
6049                 page_unlock(pp);
6050                 mutex_exit(&rp->r_statelock);
6051                 return;
6052         }
6053         sigunintr(&smask);
6054 
6055         /*
6056          * We are starting to need to commit pages, so let's try
6057          * to commit as many as possible at once to reduce the
6058          * overhead.
6059          *
6060          * Set the `commit inprogress' state bit.  We must
6061          * first wait until any current one finishes.  Then
6062          * we initialize the c_pages list with this page.
6063          */
6064         while (rp->r_flags & RCOMMIT) {
6065                 rp->r_flags |= RCOMMITWAIT;
6066                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6067                 rp->r_flags &= ~RCOMMITWAIT;
6068         }
6069         rp->r_flags |= RCOMMIT;
6070         mutex_exit(&rp->r_statelock);
6071         ASSERT(rp->r_commit.c_pages == NULL);
6072         rp->r_commit.c_pages = pp;
6073         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6074         rp->r_commit.c_commlen = PAGESIZE;
6075 
6076         /*
6077          * Gather together all other pages which can be committed.
6078          * They will all be chained off r_commit.c_pages.
6079          */
6080         nfs3_get_commit(vp);
6081 
6082         /*
6083          * Clear the `commit inprogress' status and disconnect
6084          * the list of pages to be committed from the rnode.
6085          * At this same time, we also save the starting offset
6086          * and length of data to be committed on the server.
6087          */
6088         plist = rp->r_commit.c_pages;
6089         rp->r_commit.c_pages = NULL;
6090         offset = rp->r_commit.c_commbase;
6091         len = rp->r_commit.c_commlen;
6092         mutex_enter(&rp->r_statelock);
6093         rp->r_flags &= ~RCOMMIT;
6094         cv_broadcast(&rp->r_commit.c_cv);
6095         mutex_exit(&rp->r_statelock);
6096 
6097         if (curproc == proc_pageout || curproc == proc_fsflush ||
6098             nfs_zone() != VTOMI(vp)->mi_zone) {
6099                 nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6100                 return;
6101         }
6102 
6103         /*
6104          * Actually generate the COMMIT3 over the wire operation.
6105          */
6106         error = nfs3_commit(vp, offset, len, cr);
6107 
6108         /*
6109          * If we got an error during the commit, just unlock all
6110          * of the pages.  The pages will get retransmitted to the
6111          * server during a putpage operation.
6112          */
6113         if (error) {
6114                 while (plist != NULL) {
6115                         pptr = plist;
6116                         page_sub(&plist, pptr);
6117                         page_unlock(pptr);
6118                 }
6119                 return;
6120         }
6121 
6122         /*
6123          * We've tried as hard as we can to commit the data to stable
6124          * storage on the server.  We release the rest of the pages
6125          * and clear the commit required state.  They will be put
6126          * onto the tail of the cachelist if they are nolonger
6127          * mapped.
6128          */
6129         while (plist != pp) {
6130                 pptr = plist;
6131                 page_sub(&plist, pptr);
6132                 pptr->p_fsdata = C_NOCOMMIT;
6133                 (void) page_release(pptr, 1);
6134         }
6135 
6136         /*
6137          * It is possible that nfs3_commit didn't return error but
6138          * some other thread has modified the page we are going
6139          * to free/destroy.
6140          *    In this case we need to rewrite the page. Do an explicit check
6141          * before attempting to free/destroy the page. If modified, needs to
6142          * be rewritten so unlock the page and return.
6143          */
6144         if (hat_ismod(pp)) {
6145                 pp->p_fsdata = C_NOCOMMIT;
6146                 page_unlock(pp);
6147                 return;
6148         }
6149 
6150         /*
6151          * Now, as appropriate, either free or destroy the page
6152          * that we were called with.
6153          */
6154         pp->p_fsdata = C_NOCOMMIT;
6155         if (fl == B_FREE)
6156                 page_free(pp, dn);
6157         else
6158                 page_destroy(pp, dn);
6159 }
6160 
6161 static int
6162 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
6163 {
6164         int error;
6165         rnode_t *rp;
6166         COMMIT3args args;
6167         COMMIT3res res;
6168         int douprintf;
6169         cred_t *cred;
6170 
6171         rp = VTOR(vp);
6172         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6173 
6174         mutex_enter(&rp->r_statelock);
6175         if (rp->r_cred != NULL) {
6176                 cred = rp->r_cred;
6177                 crhold(cred);
6178         } else {
6179                 rp->r_cred = cr;
6180                 crhold(cr);
6181                 cred = cr;
6182                 crhold(cred);
6183         }
6184         mutex_exit(&rp->r_statelock);
6185 
6186         args.file = *VTOFH3(vp);
6187         args.offset = offset;
6188         args.count = count;
6189 
6190 doitagain:
6191         douprintf = 1;
6192         error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
6193             xdr_COMMIT3args, (caddr_t)&args,
6194             xdr_COMMIT3res, (caddr_t)&res, cred,
6195             &douprintf, &res.status, 0, NULL);
6196 
6197         crfree(cred);
6198 
6199         if (error)
6200                 return (error);
6201 
6202         error = geterrno3(res.status);
6203         if (!error) {
6204                 ASSERT(rp->r_flags & RHAVEVERF);
6205                 mutex_enter(&rp->r_statelock);
6206                 if (rp->r_verf == res.resok.verf) {
6207                         mutex_exit(&rp->r_statelock);
6208                         return (0);
6209                 }
6210                 nfs3_set_mod(vp);
6211                 rp->r_verf = res.resok.verf;
6212                 mutex_exit(&rp->r_statelock);
6213                 error = NFS_VERF_MISMATCH;
6214         } else {
6215                 if (error == EACCES) {
6216                         mutex_enter(&rp->r_statelock);
6217                         if (cred != cr) {
6218                                 if (rp->r_cred != NULL)
6219                                         crfree(rp->r_cred);
6220                                 rp->r_cred = cr;
6221                                 crhold(cr);
6222                                 cred = cr;
6223                                 crhold(cred);
6224                                 mutex_exit(&rp->r_statelock);
6225                                 goto doitagain;
6226                         }
6227                         mutex_exit(&rp->r_statelock);
6228                 }
6229                 /*
6230                  * Can't do a PURGE_STALE_FH here because this
6231                  * can cause a deadlock.  nfs3_commit can
6232                  * be called from nfs3_dispose which can be called
6233                  * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
6234                  * can call back to pvn_vplist_dirty.
6235                  */
6236                 if (error == ESTALE) {
6237                         mutex_enter(&rp->r_statelock);
6238                         rp->r_flags |= RSTALE;
6239                         if (!rp->r_error)
6240                                 rp->r_error = error;
6241                         mutex_exit(&rp->r_statelock);
6242                         PURGE_ATTRCACHE(vp);
6243                 } else {
6244                         mutex_enter(&rp->r_statelock);
6245                         if (!rp->r_error)
6246                                 rp->r_error = error;
6247                         mutex_exit(&rp->r_statelock);
6248                 }
6249         }
6250 
6251         return (error);
6252 }
6253 
6254 static void
6255 nfs3_set_mod(vnode_t *vp)
6256 {
6257         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6258 
6259         pvn_vplist_setdirty(vp, nfs_setmod_check);
6260 }
6261 
6262 /*
6263  * This routine is used to gather together a page list of the pages
6264  * which are to be committed on the server.  This routine must not
6265  * be called if the calling thread holds any locked pages.
6266  *
6267  * The calling thread must have set RCOMMIT.  This bit is used to
6268  * serialize access to the commit structure in the rnode.  As long
6269  * as the thread has set RCOMMIT, then it can manipulate the commit
6270  * structure without requiring any other locks.
6271  */
6272 static void
6273 nfs3_get_commit(vnode_t *vp)
6274 {
6275         rnode_t *rp;
6276         page_t *pp;
6277         kmutex_t *vphm;
6278 
6279         rp = VTOR(vp);
6280 
6281         ASSERT(rp->r_flags & RCOMMIT);
6282 
6283         vphm = page_vnode_mutex(vp);
6284         mutex_enter(vphm);
6285 
6286         /*
6287          * If there are no pages associated with this vnode, then
6288          * just return.
6289          */
6290         if ((pp = vp->v_pages) == NULL) {
6291                 mutex_exit(vphm);
6292                 return;
6293         }
6294 
6295         /*
6296          * Step through all of the pages associated with this vnode
6297          * looking for pages which need to be committed.
6298          */
6299         do {
6300                 /* Skip marker pages. */
6301                 if (pp->p_hash == PVN_VPLIST_HASH_TAG)
6302                         continue;
6303 
6304                 /*
6305                  * If this page does not need to be committed or is
6306                  * modified, then just skip it.
6307                  */
6308                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6309                         continue;
6310 
6311                 /*
6312                  * Attempt to lock the page.  If we can't, then
6313                  * someone else is messing with it and we will
6314                  * just skip it.
6315                  */
6316                 if (!page_trylock(pp, SE_EXCL))
6317                         continue;
6318 
6319                 /*
6320                  * If this page does not need to be committed or is
6321                  * modified, then just skip it.  Recheck now that
6322                  * the page is locked.
6323                  */
6324                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6325                         page_unlock(pp);
6326                         continue;
6327                 }
6328 
6329                 if (PP_ISFREE(pp)) {
6330                         cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6331                             (void *)pp);
6332                 }
6333 
6334                 /*
6335                  * The page needs to be committed and we locked it.
6336                  * Update the base and length parameters and add it
6337                  * to r_pages.
6338                  */
6339                 if (rp->r_commit.c_pages == NULL) {
6340                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6341                         rp->r_commit.c_commlen = PAGESIZE;
6342                 } else if (pp->p_offset < rp->r_commit.c_commbase) {
6343                         rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6344                             (offset3)pp->p_offset + rp->r_commit.c_commlen;
6345                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6346                 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6347                     <= pp->p_offset) {
6348                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6349                             rp->r_commit.c_commbase + PAGESIZE;
6350                 }
6351                 page_add(&rp->r_commit.c_pages, pp);
6352         } while ((pp = pp->p_vpnext) != vp->v_pages);
6353 
6354         mutex_exit(vphm);
6355 }
6356 
6357 /*
6358  * This routine is used to gather together a page list of the pages
6359  * which are to be committed on the server.  This routine must not
6360  * be called if the calling thread holds any locked pages.
6361  *
6362  * The calling thread must have set RCOMMIT.  This bit is used to
6363  * serialize access to the commit structure in the rnode.  As long
6364  * as the thread has set RCOMMIT, then it can manipulate the commit
6365  * structure without requiring any other locks.
6366  */
6367 static void
6368 nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
6369 {
6370 
6371         rnode_t *rp;
6372         page_t *pp;
6373         u_offset_t end;
6374         u_offset_t off;
6375 
6376         ASSERT(len != 0);
6377 
6378         rp = VTOR(vp);
6379 
6380         ASSERT(rp->r_flags & RCOMMIT);
6381         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6382 
6383         /*
6384          * If there are no pages associated with this vnode, then
6385          * just return.
6386          */
6387         if ((pp = vp->v_pages) == NULL)
6388                 return;
6389 
6390         /*
6391          * Calculate the ending offset.
6392          */
6393         end = soff + len;
6394 
6395         for (off = soff; off < end; off += PAGESIZE) {
6396                 /*
6397                  * Lookup each page by vp, offset.
6398                  */
6399                 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
6400                         continue;
6401 
6402                 /*
6403                  * If this page does not need to be committed or is
6404                  * modified, then just skip it.
6405                  */
6406                 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6407                         page_unlock(pp);
6408                         continue;
6409                 }
6410 
6411                 ASSERT(PP_ISFREE(pp) == 0);
6412 
6413                 /*
6414                  * The page needs to be committed and we locked it.
6415                  * Update the base and length parameters and add it
6416                  * to r_pages.
6417                  */
6418                 if (rp->r_commit.c_pages == NULL) {
6419                         rp->r_commit.c_commbase = (offset3)pp->p_offset;
6420                         rp->r_commit.c_commlen = PAGESIZE;
6421                 } else {
6422                         rp->r_commit.c_commlen = (offset3)pp->p_offset -
6423                             rp->r_commit.c_commbase + PAGESIZE;
6424                 }
6425                 page_add(&rp->r_commit.c_pages, pp);
6426         }
6427 }
6428 
6429 static int
6430 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6431 {
6432         int error;
6433         writeverf3 write_verf;
6434         rnode_t *rp = VTOR(vp);
6435 
6436         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6437         /*
6438          * Flush the data portion of the file and then commit any
6439          * portions which need to be committed.  This may need to
6440          * be done twice if the server has changed state since
6441          * data was last written.  The data will need to be
6442          * rewritten to the server and then a new commit done.
6443          *
6444          * In fact, this may need to be done several times if the
6445          * server is having problems and crashing while we are
6446          * attempting to do this.
6447          */
6448 
6449 top:
6450         /*
6451          * Do a flush based on the poff and plen arguments.  This
6452          * will asynchronously write out any modified pages in the
6453          * range specified by (poff, plen).  This starts all of the
6454          * i/o operations which will be waited for in the next
6455          * call to nfs3_putpage
6456          */
6457 
6458         mutex_enter(&rp->r_statelock);
6459         write_verf = rp->r_verf;
6460         mutex_exit(&rp->r_statelock);
6461 
6462         error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6463         if (error == EAGAIN)
6464                 error = 0;
6465 
6466         /*
6467          * Do a flush based on the poff and plen arguments.  This
6468          * will synchronously write out any modified pages in the
6469          * range specified by (poff, plen) and wait until all of
6470          * the asynchronous i/o's in that range are done as well.
6471          */
6472         if (!error)
6473                 error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6474 
6475         if (error)
6476                 return (error);
6477 
6478         mutex_enter(&rp->r_statelock);
6479         if (rp->r_verf != write_verf) {
6480                 mutex_exit(&rp->r_statelock);
6481                 goto top;
6482         }
6483         mutex_exit(&rp->r_statelock);
6484 
6485         /*
6486          * Now commit any pages which might need to be committed.
6487          * If the error, NFS_VERF_MISMATCH, is returned, then
6488          * start over with the flush operation.
6489          */
6490 
6491         error = nfs3_commit_vp(vp, poff, plen, cr);
6492 
6493         if (error == NFS_VERF_MISMATCH)
6494                 goto top;
6495 
6496         return (error);
6497 }
6498 
6499 static int
6500 nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
6501 {
6502         rnode_t *rp;
6503         page_t *plist;
6504         offset3 offset;
6505         count3 len;
6506 
6507 
6508         rp = VTOR(vp);
6509 
6510         if (nfs_zone() != VTOMI(vp)->mi_zone)
6511                 return (EIO);
6512         /*
6513          * Set the `commit inprogress' state bit.  We must
6514          * first wait until any current one finishes.
6515          */
6516         mutex_enter(&rp->r_statelock);
6517         while (rp->r_flags & RCOMMIT) {
6518                 rp->r_flags |= RCOMMITWAIT;
6519                 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6520                 rp->r_flags &= ~RCOMMITWAIT;
6521         }
6522         rp->r_flags |= RCOMMIT;
6523         mutex_exit(&rp->r_statelock);
6524 
6525         /*
6526          * Gather together all of the pages which need to be
6527          * committed.
6528          */
6529         if (plen == 0)
6530                 nfs3_get_commit(vp);
6531         else
6532                 nfs3_get_commit_range(vp, poff, plen);
6533 
6534         /*
6535          * Clear the `commit inprogress' bit and disconnect the
6536          * page list which was gathered together in nfs3_get_commit.
6537          */
6538         plist = rp->r_commit.c_pages;
6539         rp->r_commit.c_pages = NULL;
6540         offset = rp->r_commit.c_commbase;
6541         len = rp->r_commit.c_commlen;
6542         mutex_enter(&rp->r_statelock);
6543         rp->r_flags &= ~RCOMMIT;
6544         cv_broadcast(&rp->r_commit.c_cv);
6545         mutex_exit(&rp->r_statelock);
6546 
6547         /*
6548          * If any pages need to be committed, commit them and
6549          * then unlock them so that they can be freed some
6550          * time later.
6551          */
6552         if (plist != NULL) {
6553                 /*
6554                  * No error occurred during the flush portion
6555                  * of this operation, so now attempt to commit
6556                  * the data to stable storage on the server.
6557                  *
6558                  * This will unlock all of the pages on the list.
6559                  */
6560                 return (nfs3_sync_commit(vp, plist, offset, len, cr));
6561         }
6562         return (0);
6563 }
6564 
6565 static int
6566 nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6567         cred_t *cr)
6568 {
6569         int error;
6570         page_t *pp;
6571 
6572         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6573         error = nfs3_commit(vp, offset, count, cr);
6574 
6575         /*
6576          * If we got an error, then just unlock all of the pages
6577          * on the list.
6578          */
6579         if (error) {
6580                 while (plist != NULL) {
6581                         pp = plist;
6582                         page_sub(&plist, pp);
6583                         page_unlock(pp);
6584                 }
6585                 return (error);
6586         }
6587         /*
6588          * We've tried as hard as we can to commit the data to stable
6589          * storage on the server.  We just unlock the pages and clear
6590          * the commit required state.  They will get freed later.
6591          */
6592         while (plist != NULL) {
6593                 pp = plist;
6594                 page_sub(&plist, pp);
6595                 pp->p_fsdata = C_NOCOMMIT;
6596                 page_unlock(pp);
6597         }
6598 
6599         return (error);
6600 }
6601 
6602 static void
6603 nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6604         cred_t *cr)
6605 {
6606         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6607         (void) nfs3_sync_commit(vp, plist, offset, count, cr);
6608 }
6609 
6610 /* ARGSUSED */
6611 static int
6612 nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6613         caller_context_t *ct)
6614 {
6615         int error;
6616         mntinfo_t *mi;
6617 
6618         mi = VTOMI(vp);
6619 
6620         if (nfs_zone() != mi->mi_zone)
6621                 return (EIO);
6622 
6623         if (mi->mi_flags & MI_ACL) {
6624                 error = acl_setacl3(vp, vsecattr, flag, cr);
6625                 if (mi->mi_flags & MI_ACL)
6626                         return (error);
6627         }
6628 
6629         return (ENOSYS);
6630 }
6631 
6632 /* ARGSUSED */
6633 static int
6634 nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6635         caller_context_t *ct)
6636 {
6637         int error;
6638         mntinfo_t *mi;
6639 
6640         mi = VTOMI(vp);
6641 
6642         if (nfs_zone() != mi->mi_zone)
6643                 return (EIO);
6644 
6645         if (mi->mi_flags & MI_ACL) {
6646                 error = acl_getacl3(vp, vsecattr, flag, cr);
6647                 if (mi->mi_flags & MI_ACL)
6648                         return (error);
6649         }
6650 
6651         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6652 }
6653 
6654 /* ARGSUSED */
6655 static int
6656 nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6657         caller_context_t *ct)
6658 {
6659         int error;
6660         struct shrlock nshr;
6661         struct nfs_owner nfs_owner;
6662         netobj lm_fh3;
6663 
6664         if (nfs_zone() != VTOMI(vp)->mi_zone)
6665                 return (EIO);
6666 
6667         /*
6668          * check for valid cmd parameter
6669          */
6670         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6671                 return (EINVAL);
6672 
6673         /*
6674          * Check access permissions
6675          */
6676         if (cmd == F_SHARE &&
6677             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6678             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6679                 return (EBADF);
6680 
6681         /*
6682          * If the filesystem is mounted using local locking, pass the
6683          * request off to the local share code.
6684          */
6685         if (VTOMI(vp)->mi_flags & MI_LLOCK)
6686                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6687 
6688         switch (cmd) {
6689         case F_SHARE:
6690         case F_UNSHARE:
6691                 lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6692                 lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6693 
6694                 /*
6695                  * If passed an owner that is too large to fit in an
6696                  * nfs_owner it is likely a recursive call from the
6697                  * lock manager client and pass it straight through.  If
6698                  * it is not a nfs_owner then simply return an error.
6699                  */
6700                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6701                         if (((struct nfs_owner *)shr->s_owner)->magic !=
6702                             NFS_OWNER_MAGIC)
6703                                 return (EINVAL);
6704 
6705                         if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6706                                 error = set_errno(error);
6707                         }
6708                         return (error);
6709                 }
6710                 /*
6711                  * Remote share reservations owner is a combination of
6712                  * a magic number, hostname, and the local owner
6713                  */
6714                 bzero(&nfs_owner, sizeof (nfs_owner));
6715                 nfs_owner.magic = NFS_OWNER_MAGIC;
6716                 (void) strncpy(nfs_owner.hname, uts_nodename(),
6717                     sizeof (nfs_owner.hname));
6718                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6719                 nshr.s_access = shr->s_access;
6720                 nshr.s_deny = shr->s_deny;
6721                 nshr.s_sysid = 0;
6722                 nshr.s_pid = ttoproc(curthread)->p_pid;
6723                 nshr.s_own_len = sizeof (nfs_owner);
6724                 nshr.s_owner = (caddr_t)&nfs_owner;
6725 
6726                 if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6727                         error = set_errno(error);
6728                 }
6729 
6730                 break;
6731 
6732         case F_HASREMOTELOCKS:
6733                 /*
6734                  * NFS client can't store remote locks itself
6735                  */
6736                 shr->s_access = 0;
6737                 error = 0;
6738                 break;
6739 
6740         default:
6741                 error = EINVAL;
6742                 break;
6743         }
6744 
6745         return (error);
6746 }